// CUDA programming introduction — example 15 (tutorial article, ~3875 words)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 16
// Print a descriptive message and terminate the process when a CUDA
// runtime call fails. Call through the HANDLE_ERROR macro so that the
// failing call site's file and line are reported.
//   err:  status code returned by the CUDA runtime call
//   file: source file of the call site (__FILE__)
//   line: line number of the call site (__LINE__)
static void HandleError(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess)
{
// cudaGetErrorString converts the status code to a readable message.
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
// Wrap a CUDA runtime call; aborts with file/line context on failure.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
// Abort with file/line context when a host allocation returned NULL.
#define HANDLE_NULL( a ) {if ((a) == NULL) { \
printf("Host memory failed in %s at line %d\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); }}
// Fill number[0 .. size-1] with pseudo-random integers in [0, 9].
// Uses the C library rand(); seed with srand() beforehand if a
// different sequence is wanted.
static void GenerateNumbers(int *number, int size)
{
int idx = 0;
while (idx < size)
{
number[idx] = rand() % 10;
++idx;
}
}
// Select the first CUDA-capable device (compute capability major >= 1)
// and make it the current device. Returns true on success, false when
// no device is present or none supports CUDA.
static bool InitCUDA()
{
int count;
cudaGetDeviceCount(&count);
if (count == 0)
{
fprintf(stderr, "There is no device.\n");
return false;
}
int i;
for (i = 0; i < count; i++)
{
cudaDeviceProp prop;
if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
{
// major >= 1 means the device reports a real compute capability.
if (prop.major >= 1)
{
break;
}
}
}
// Loop ran to completion without finding a usable device.
if (i >= count)
{
fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
return false;
}
cudaSetDevice(i);
return true;
}
// Launch several blocks of 256 threads each. Every block computes a
// partial sum of squares over a grid-strided slice of the input and
// records its start/end clock() ticks; the CPU adds the per-block
// partial sums afterwards.
// Per-block sum of squares with a shared-memory tree reduction.
// Launch requirements: blockDim.x must be a power of two, and dynamic
// shared memory of blockDim.x * sizeof(int) must be supplied.
//   num:    device array of size elements to square and sum
//   size:   element count of num
//   result: result[b] receives block b's partial sum (gridDim.x entries)
//   time:   time[b] = block b's start tick, time[b + gridDim.x] = end tick
__global__ static void sumOfSquares(int *num, int size, int* result, clock_t* time)
{
extern __shared__ int partial[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;

// One thread per block records the start timestamp.
if (tid == 0) time[bid] = clock();

// Grid-stride accumulation: each thread sums its share in a register.
int acc = 0;
const int stride = gridDim.x * blockDim.x;
for (int i = bid * blockDim.x + tid; i < size; i += stride)
{
acc += num[i] * num[i];
}
partial[tid] = acc;
__syncthreads();

// Tree reduction in shared memory; the active half shrinks each pass.
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
{
partial[tid] += partial[tid + offset];
}
__syncthreads();
}

// Thread 0 publishes the block result and the end timestamp.
if (tid == 0)
{
result[bid] = partial[0];
time[bid + gridDim.x] = clock();
}
}
// Entry point: generate random data, sum the squares on the GPU
// (per-block partial sums, combined on the host), time it against a
// CPU reference, and verify the two results agree.
int main(int argc, char *argv[])
{
if (!InitCUDA())
{
return -1;
}
printf("CUDA initialized.\n");

//const int block_num = 32;
const int block_num = 16;
const int thread_num = 256;
const int DATA_SIZE = 1024 * 16;
int data[DATA_SIZE];
GenerateNumbers(data, DATA_SIZE);

int* gpudata, *result;
clock_t* devTime, gpuTime[2 * block_num], cpuTime;
HANDLE_ERROR(cudaMalloc((void **)&gpudata, sizeof(int)* DATA_SIZE));
HANDLE_ERROR(cudaMalloc((void**)&result, sizeof(int)* block_num));
HANDLE_ERROR(cudaMalloc((void**)&devTime, sizeof(clock_t)* 2 * block_num));
HANDLE_ERROR(cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice));

// Execution configuration: block_num blocks of thread_num threads.
// The third argument sizes the kernel's extern __shared__ buffer
// (one int per thread), which the reduction requires.
sumOfSquares<<<block_num, thread_num, thread_num * sizeof(int)>>>(gpudata, DATA_SIZE, result, devTime);
HANDLE_ERROR(cudaGetLastError());   // catch launch-configuration errors

long long sum = 0;
int gpuSum[block_num];
// Blocking cudaMemcpy also synchronizes with the kernel above.
HANDLE_ERROR(cudaMemcpy(gpuSum, result, sizeof(int)* block_num, cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(gpuTime, devTime, sizeof(clock_t)* 2 * block_num, cudaMemcpyDeviceToHost));
cudaFree(gpudata);
cudaFree(result);
cudaFree(devTime);

// Combine the per-block partial sums on the host.
for (int i = 0; i < block_num; i++)
{
sum += gpuSum[i];
}

// GPU elapsed ticks = latest block end minus earliest block start.
clock_t minStart = gpuTime[0];
clock_t maxEnd = gpuTime[block_num];
for (int i = 1; i < block_num; i++)
{
if (minStart > gpuTime[i])
{
minStart = gpuTime[i];
}
if (maxEnd < gpuTime[i + block_num])
{
maxEnd = gpuTime[i + block_num];
}
}
printf("sum (GPU): %lld, time: %ld\n", sum, (long)(maxEnd - minStart));

// CPU reference computation, timed with clock().
long long sumCPU = 0;
cpuTime = clock();
for (int i = 0; i < DATA_SIZE; i++)
{
sumCPU += data[i] * data[i];
}
cpuTime = clock() - cpuTime;
printf("sum (CPU): %lld, time: %ld\n", sumCPU, (long)cpuTime);

printf("Result %s\n", sum == sumCPU ? "OK" : "Wrong");

//remember to release the device
cudaDeviceReset();
return 0;
}