Cudaプログラミング入門例21


#include 
#include 
#include 
#include 
#include 

#define BLOCK_SIZE 16
static void HandleError(cudaError_t err, const char *file, int line)
{
	if (err != cudaSuccess)
	{
		printf("%s in %s at line %d
", cudaGetErrorString(err), file, line); exit(EXIT_FAILURE); } } #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) #define HANDLE_NULL( a ) {if ((a) == NULL) { \ printf("Host memory failed in %s at line %d
", \ __FILE__, __LINE__); \ exit(EXIT_FAILURE); }} static bool InitCUDA() { int count; cudaGetDeviceCount(&count); if (count == 0) { fprintf(stderr, "There is no device.
"); return false; } int i; cudaDeviceProp prop = {0}; for (i = 0; i < count; i++) { if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) { if (prop.major >= 1) { printf("%s
", prop.name); break; } } } if (i >= count) { fprintf(stderr, "There is no device supporting CUDA 1.x.
"); return false; } cudaSetDevice(i); return true; } static void matgen(float* a, int lda, int n) { int i, j; for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { a[i * lda + j] = (float)rand() / RAND_MAX + (float)rand() / (RAND_MAX * RAND_MAX); } } } static void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n) { int i, j, k; for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { double t = 0; for (k = 0; k < n; k++) { t += a[i * lda + k] * b[j + k * ldb]; } c[i * ldc + j] = t; } } } static void compare_mat(const float* a, int lda, const float* b, int ldb, int n) { float max_err = 0; float average_err = 0; float max_absolute_err = 0; int i, j; for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { if (b[i * ldb + j] != 0) { float tmp = fabs(a[i * lda + j] - b[i * ldb + j]); if (max_absolute_err < tmp) { max_absolute_err = tmp; } float err = fabs(tmp / b[i * ldb + j]); if (max_err < err) { max_err = err; } average_err += err; } } } printf("Max absolute error: %g
Max error: %g
Average error: %g
", \ max_absolute_err, max_err, average_err / (n * n)); } // Kahan's Summation Formula , dev_a , n 。 __global__ static void matMultCUDA(const float* dev_a, size_t lda, const float* dev_b, size_t ldb, float* dev_c, size_t ldc, int n) { extern __shared__ float data[]; const int tid = threadIdx.x; const int row = blockIdx.x; int i, j; for (i = tid; i < n; i += blockDim.x) { data[i] = dev_a[row * lda + i]; } __syncthreads(); //row ,j for (j = tid; j < n; j += blockDim.x) { float t = 0; float y = 0; for (i = 0; i < n; i++) { float r; y -= data[i] * dev_b[i * ldb + j]; r = t - y; y = (r - t) + y; t = r; } dev_c[row * ldc + j] = t; } } static clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n) { const int thread_num = 256; float *dev_a, *dev_b, *dev_c; clock_t time; time = clock(); HANDLE_ERROR(cudaMalloc((void**)&dev_a, sizeof(float)* n * n)); HANDLE_ERROR(cudaMalloc((void**)&dev_b, sizeof(float)* n * n)); HANDLE_ERROR(cudaMalloc((void**)&dev_c, sizeof(float)* n * n)); //HANDLE_ERROR(cudaMemcpy2D(dev_a, sizeof(float) * n, a, sizeof(float) * lda, sizeof(float) * n, n, cudaMemcpyHostToDevice)); //HANDLE_ERROR(cudaMemcpy2D(dev_b, sizeof(float) * n, b, sizeof(float) * ldb, sizeof(float) * n, n, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(dev_a, a, sizeof(float)* n * n, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(dev_b, b, sizeof(float)* n * n, cudaMemcpyHostToDevice)); //int blocks = (n * n + thread_num - 1) / thread_num; int blocks = n; // n , dev_a //matMultCUDA01<<>>(dev_a, n, dev_b, n, dev_c, n, n); matMultCUDA << > >(dev_a, n, dev_b, n, dev_c, n, n); //HANDLE_ERROR(cudaMemcpy2D(c, sizeof(float) * ldc, dev_c, sizeof(float) * n, sizeof(float) * n, n, cudaMemcpyDeviceToHost)); HANDLE_ERROR(cudaMemcpy(c, dev_c, sizeof(float)* n * n, cudaMemcpyDeviceToHost)); cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c); return clock() - time; } int main(int argc, char *argv[]) { const int n = 64 * 4 * 2; float *a, *b, *c, *d; HANDLE_NULL(a = (float*)malloc(sizeof(float)* n * n)); HANDLE_NULL(b = (float*)malloc(sizeof(float)* n * n)); HANDLE_NULL(c = (float*)malloc(sizeof(float)* n * n)); HANDLE_NULL(d = (float*)malloc(sizeof(float)* n * n)); srand(0); matgen(a, n, n); matgen(b, n, n); printf(" Kahan's Summation Formula
"); clock_t time = matmultCUDA(a, n, b, n, c, n, n); matmult(a, n, b, n, d, n, n); compare_mat(c, n, d, n, n); double sec = (double)time / CLOCKS_PER_SEC; printf("Time used: %.6fs (%.6lf GFLOPS)
", sec, 2.0 * n * n * n / (sec * 1E9)); free(a); free(b); free(c); free(d); //remember to release the device cudaDeviceReset(); return 0; }