// Introduction to CUDA programming, example 15


#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

/*
 * Report a failed CUDA runtime call and terminate the process.
 *
 * err  : status returned by a CUDA runtime call.
 * file : source file of the call site (normally __FILE__).
 * line : source line of the call site (normally __LINE__).
 *
 * Does nothing when err == cudaSuccess; otherwise prints the CUDA error
 * string together with the call site to stderr and exits with EXIT_FAILURE.
 */
static void HandleError(cudaError_t err, const char *file, int line)
{
	if (err != cudaSuccess)
	{
		fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
		exit(EXIT_FAILURE);
	}
}

/* Wrap a CUDA runtime call so that any failure is reported with its call site. */
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

/* Abort with a diagnostic when a host allocation returned NULL. */
#define HANDLE_NULL( a ) {if ((a) == NULL) { \
	fprintf(stderr, "Host memory failed in %s at line %d\n", \
		__FILE__, __LINE__); \
	exit(EXIT_FAILURE); }}

/*
 * Fill number[0..size) with pseudo-random values in the range 0..9.
 *
 * The values are kept small because the kernel accumulates the squares in
 * int; with values <= 9 each square is at most 81.
 */
static void GenerateNumbers(int *number, int size)
{
	for (int i = 0; i < size; i++)
	{
		number[i] = rand() % 10;
	}
}

/*
 * Select the first CUDA device whose compute capability major version is >= 1.
 *
 * Returns true when a device was found and made current, false otherwise
 * (a diagnostic is written to stderr in that case).
 */
static bool InitCUDA()
{
	// Initialise count so that a failing query cannot leave it indeterminate.
	int count = 0;
	if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0)
	{
		fprintf(stderr, "There is no device.\n");
		return false;
	}

	int i;
	for (i = 0; i < count; i++)
	{
		cudaDeviceProp prop;
		if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
		{
			if (prop.major >= 1)
			{
				break;
			}
		}
	}

	if (i >= count)
	{
		fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
		return false;
	}

	if (cudaSetDevice(i) != cudaSuccess)
	{
		fprintf(stderr, "Failed to select CUDA device %d.\n", i);
		return false;
	}
	return true;
}

/*
 * Compute per-block partial sums of squares of num[0..size).
 *
 * Launch requirements:
 *   - 1D grid and 1D blocks.
 *   - Dynamic shared memory of blockDim.x * sizeof(int) bytes.
 *   - blockDim.x must be a power of two: the tree reduction halves the
 *     offset each step and would drop elements otherwise.
 *
 * num    : device input array of `size` ints.
 * size   : number of elements in num.
 * result : device array of gridDim.x ints; result[b] receives block b's sum.
 *          The host adds these partial sums together.
 * time   : device array of 2 * gridDim.x clock_t values; time[b] holds the
 *          clock() value read by thread 0 of block b at entry, and
 *          time[b + gridDim.x] the value read after the block's result is stored.
 *
 * Partial sums are accumulated in int, so the caller must keep the data
 * small enough that a block's sum fits in an int.
 */
__global__ static void sumOfSquares(int *num, int size, int* result, clock_t* time)
{
	extern __shared__ int temp[];
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int stride = gridDim.x * blockDim.x;

	if (tid == 0) time[bid] = clock();

	// Grid-stride loop: accumulate in a register, then publish once to shared memory.
	int partial = 0;
	for (int i = tid + bid * blockDim.x; i < size; i += stride)
	{
		partial += num[i] * num[i];
	}
	temp[tid] = partial;
	__syncthreads();

	// Tree reduction in shared memory. The barrier is outside the `if`
	// so that every thread in the block reaches it.
	for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
	{
		if (tid < offset)
		{
			temp[tid] += temp[tid + offset];
		}
		__syncthreads();
	}

	if (tid == 0)
	{
		result[bid] = temp[0];
		time[bid + gridDim.x] = clock();
	}
}

/*
 * Generate random data, compute the sum of squares on the GPU and on the
 * CPU, print both results with their timings and report whether they agree.
 *
 * Returns 0 on completion, -1 when no usable CUDA device is found.
 */
int main(int argc, char *argv[])
{
	if (!InitCUDA())
	{
		return -1;
	}
	printf("CUDA initialized.\n");

	//const int block_num = 32;
	const int block_num = 16;
	const int thread_num = 256;   // must be a power of two (see sumOfSquares)
	const int DATA_SIZE = 1024 * 16;

	int data[DATA_SIZE];
	GenerateNumbers(data, DATA_SIZE);

	int* gpudata, *result;
	clock_t* devTime, gpuTime[2 * block_num], cpuTime;
	HANDLE_ERROR(cudaMalloc((void **)&gpudata, sizeof(int)* DATA_SIZE));
	HANDLE_ERROR(cudaMalloc((void**)&result, sizeof(int)* block_num));
	HANDLE_ERROR(cudaMalloc((void**)&devTime, sizeof(clock_t)* 2 * block_num));
	HANDLE_ERROR(cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice));

	// Third launch argument: one int of dynamic shared memory per thread.
	sumOfSquares<<<block_num, thread_num, thread_num * sizeof(int)>>>(gpudata, DATA_SIZE, result, devTime);
	// A kernel launch returns no status; launch-configuration errors surface here.
	HANDLE_ERROR(cudaGetLastError());

	long long sum = 0;
	int gpuSum[block_num];
	// These blocking copies also wait for the kernel to finish.
	HANDLE_ERROR(cudaMemcpy(gpuSum, result, sizeof(int)* block_num, cudaMemcpyDeviceToHost));
	HANDLE_ERROR(cudaMemcpy(gpuTime, devTime, sizeof(clock_t)* 2 * block_num, cudaMemcpyDeviceToHost));
	HANDLE_ERROR(cudaFree(gpudata));
	HANDLE_ERROR(cudaFree(result));
	HANDLE_ERROR(cudaFree(devTime));

	for (int i = 0; i < block_num; i++)
	{
		sum += gpuSum[i];
	}

	// Earliest block start and latest block end.
	// NOTE(review): device clock() is a per-multiprocessor counter, so comparing
	// values taken by different blocks is only an approximation — confirm
	// this is acceptable for the intended measurement.
	clock_t minStart = gpuTime[0];
	clock_t maxEnd = gpuTime[block_num];
	for (int i = 1; i < block_num; i++)
	{
		if (minStart > gpuTime[i])
		{
			minStart = gpuTime[i];
		}
		if (maxEnd < gpuTime[i + block_num])
		{
			maxEnd = gpuTime[i + block_num];
		}
	}
	// GPU time is in device clock cycles; the CPU time below is in host clock() ticks.
	printf("sum (GPU): %lld, time: %ld\n", sum, (long)(maxEnd - minStart));

	long long sumCPU = 0;
	cpuTime = clock();
	for (int i = 0; i < DATA_SIZE; i++)
	{
		sumCPU += data[i] * data[i];
	}
	cpuTime = clock() - cpuTime;
	printf("sum (CPU): %lld, time: %ld\n", sumCPU, (long)cpuTime);

	printf("Result %s\n", sum == sumCPU ? "OK" : "Wrong");

	//remember to release the device
	HANDLE_ERROR(cudaDeviceReset());
	return 0;
}