// CUDAプログラミング入門 例9 (Introduction to CUDA programming, example 9)


#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

/*
 * Report a failed CUDA runtime call and terminate the process.
 *
 * err  - status returned by a CUDA runtime call
 * file - source file of the call site (normally __FILE__)
 * line - source line of the call site (normally __LINE__)
 *
 * Does nothing when err is cudaSuccess; otherwise prints the runtime's
 * error string with the call site and exits with EXIT_FAILURE.
 */
static void HandleError(cudaError_t err, const char *file, int line)
{
	if (err != cudaSuccess)
	{
		printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
		exit(EXIT_FAILURE);
	}
}

/* Wrap a CUDA runtime call so that any failure aborts with file/line info. */
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

/* Abort with file/line info when a host pointer is NULL. */
#define HANDLE_NULL( a ) {if ((a) == NULL) { \
	printf("Host memory failed in %s at line %d\n", \
		__FILE__, __LINE__); \
	exit(EXIT_FAILURE); }}

/*
 * Fill number[0 .. size-1] with pseudo-random digits in the range 0..9.
 * Uses rand() without seeding, so the sequence is whatever the C library
 * produces for its default seed.
 */
static void GenerateNumbers(int *number, int size)
{
	for (int i = 0; i < size; i++)
	{
		number[i] = rand() % 10;
	}
}

/*
 * Select the first device that reports compute capability major >= 1.
 *
 * Returns true when a device was found and made current, false otherwise
 * (a diagnostic is written to stderr in that case).
 */
static bool InitCUDA()
{
	/* Initialised so a failing cudaGetDeviceCount cannot leave it undefined. */
	int count = 0;

	if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0)
	{
		fprintf(stderr, "There is no device.\n");
		return false;
	}

	int i;
	for (i = 0; i < count; i++)
	{
		cudaDeviceProp prop;
		if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
		{
			if (prop.major >= 1)
			{
				break;
			}
		}
	}

	if (i >= count)
	{
		fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
		return false;
	}

	if (cudaSetDevice(i) != cudaSuccess)
	{
		fprintf(stderr, "Failed to select CUDA device %d.\n", i);
		return false;
	}

	return true;
}

/*
 * Sum of squares of num[0 .. size-1], computed by a single thread block.
 *
 * Launch requirements:
 *   - exactly one block (only threadIdx.x is used for indexing);
 *   - blockDim.x must be a power of two, otherwise the halving reduction
 *     below skips some partial sums;
 *   - dynamic shared memory of at least blockDim.x * sizeof(int) bytes.
 *
 * num    - device pointer to the input values
 * size   - number of input values
 * result - device pointer receiving the total (written by thread 0)
 * time   - device pointer receiving the clock() delta measured by thread 0
 */
__global__ static void sumOfSquares(int *num, int size, int* result, clock_t* time)
{
	extern __shared__ int temp[];

	const int tid = threadIdx.x;

	/* Only thread 0 takes timestamps; the others never read start. */
	clock_t start = 0;
	if (tid == 0)
	{
		start = clock();
	}

	/*
	 * Each thread accumulates elements tid, tid + blockDim.x, ... so that
	 * neighbouring threads read neighbouring addresses on every iteration.
	 */
	int sum = 0;
	for (int index = tid; index < size; index += blockDim.x)
	{
		sum += num[index] * num[index];
	}
	temp[tid] = sum;

	/*
	 * Tree reduction in shared memory: the active half of the threads adds
	 * in the partial sum held by the other half. The barrier sits at the
	 * top of the loop, outside the divergent branch, so every thread reaches
	 * it and all writes of the previous round are visible before reading.
	 */
	int offset = blockDim.x >> 1;
	while (offset > 0)
	{
		__syncthreads();
		if (tid < offset)
		{
			temp[tid] += temp[tid + offset];
		}
		offset >>= 1;
	}

	/* temp[0] was last written by thread 0 itself, so no extra barrier is needed. */
	if (tid == 0)
	{
		*result = temp[0];
		*time = clock() - start;
	}
}

/*
 * Compute the sum of squares of DATA_SIZE random digits on the GPU and on
 * the CPU, print both results with their timings, and report whether they
 * agree. Returns 0 on success and -1 when no usable device is found.
 */
int main(int argc, char *argv[])
{
	if (!InitCUDA())
	{
		return -1;
	}
	printf("CUDA initialized.\n");

	const int thread_num = 256;
	const int DATA_SIZE = 1024;

	int data[DATA_SIZE];
	GenerateNumbers(data, DATA_SIZE);

	int* gpudata, *result;
	clock_t* devTime, gpuTime, cpuTime;
	HANDLE_ERROR(cudaMalloc((void **)&gpudata, sizeof(int)* DATA_SIZE));
	HANDLE_ERROR(cudaMalloc((void**)&result, sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&devTime, sizeof(clock_t)));
	HANDLE_ERROR(cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice));

	/*
	 * One block of thread_num threads, with one int of shared memory per
	 * thread. The block size must match the shared-memory size passed here.
	 */
	sumOfSquares<<<1, thread_num, thread_num * sizeof(int)>>>(gpudata, DATA_SIZE, result, devTime);
	/* A launch does not return a status; fetch launch-configuration errors explicitly. */
	HANDLE_ERROR(cudaGetLastError());

	/* The blocking copies below also surface errors raised while the kernel ran. */
	int sum;
	HANDLE_ERROR(cudaMemcpy(&sum, result, sizeof(int), cudaMemcpyDeviceToHost));
	HANDLE_ERROR(cudaMemcpy(&gpuTime, devTime, sizeof(clock_t), cudaMemcpyDeviceToHost));

	HANDLE_ERROR(cudaFree(gpudata));
	HANDLE_ERROR(cudaFree(result));
	HANDLE_ERROR(cudaFree(devTime));

	/* clock_t is not guaranteed to be int, so print it through an explicit long. */
	printf("sum (GPU): %d, time: %ld\n", sum, (long)gpuTime);

	int sumCPU = 0;
	cpuTime = clock();
	for (int i = 0; i < DATA_SIZE; i++)
	{
		sumCPU += data[i] * data[i];
	}
	cpuTime = clock() - cpuTime;

	printf("sum (CPU): %d, time: %ld\n", sumCPU, (long)cpuTime);
	printf("Result %s\n", sum == sumCPU ? "OK" : "Wrong");

	/* Release the device before exiting. */
	HANDLE_ERROR(cudaDeviceReset());
	return 0;
}