pthreadマルチスレッド加速例(大型マトリクス乗算):Blocking,1024スレッド^^;


ブロック化(Blocking)方式を使用すると、高速化の効果がより顕著になる。32*32=1024スレッドでテストしたところ、非Blocking方式(この記事を参照)と比べて十数倍高速になった。
ただし、スレッド数が多い場合は特にリソースを消費し、時間もかかる点に注意すること。なお、下記のparamパラメータ構造体は前の例とは異なっており、必要に応じて変更できる。
// Multi-Thread Speedup:Blocking Method
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "MatrixLib.h"
#pragma comment(lib,"MatrixLib.lib")
#pragma warning(disable:4996)

// Aborts the program when a pthread-style call reported an error.
//   str   - text identifying the failed call; printed verbatim after the code.
//           Taken as const because callers pass string literals.
//   value - return code of the call; 0 means success and the function returns.
//   pflog - stream that receives the failure message; when NULL the message
//           goes to stderr instead of dereferencing a null stream.
// On failure the message is written and the process exits with status 1.
void  checkResult(const char* str, int value, FILE* pflog)
{
	if (value == 0)
	{
		return;
	}

	FILE* out = (pflog != NULL) ? pflog : stderr;
	fprintf(out, "Failed with %d at %s", value, (str != NULL) ? str : "(unknown)");
	exit(1);
}

typedef struct
{
	FILE* pflog;
	double** R;
	double** A;
	double** B;
	int start_row;
	int end_row;
	int start_col;
	int end_col;
} threadParm_t;

//  ZeroInitSquareMatrix(R,N)
void *oneThread(void *param)
{
	threadParm_t *p = (threadParm_t *)param;
	//fprintf(p->pflog, "# Thread  \'%.8X %.8X\'  is now running.
", getpid()); double** R = p->R; double** A = p->A; double** B = p->B; int start_row = p->start_row; int end_row = p->end_row; int start_col = p->start_col; int end_col = p->end_col; double tmp; for (int i = start_row; i < end_row; ++i) { for (int j = start_col; j < end_col; ++j) { tmp = 0; for (int k = start_col; k < end_col; ++k) { tmp += A[i][k] * B[j][k]; } R[i][j] += tmp; } } return NULL; } void OneTry(const int N, const int C, FILE* pflog) { int CC = C*C; fprintf(pflog, "== %4d * %4d Matrix Multiply, %d Threads. ==
", N, N, CC); clock_t start = clock(); double** X = NewSquareMatrix(N); double** Y = NewSquareMatrix(N); double** Z = NewSquareMatrix(N); TransformSquareMat(Z, N); // int start_row = 0, end_row = 0; int start_col = 0, end_col = 0; int inc_row = N / C,inc_col=N/C; end_row = start_row + inc_row; end_col = start_col + inc_col; int i, j,k,rc; pthread_t* threads = new pthread_t[CC]; threadParm_t* tparams = new threadParm_t[CC]; for (i = 0; i < C; ++i) { for (j = 0; j < C; ++j) { k = i*C + j; tparams[k].pflog = pflog; tparams[k].R = X; tparams[k].A = Y; tparams[k].B = Z; tparams[k].start_row = start_row; tparams[k].end_row = end_row; tparams[k].start_col = start_col; tparams[k].end_col = end_col; start_row = end_row + 1; end_row += inc_row; start_col = end_col + 1; end_col += inc_col; start_row %= N; end_row %= N; start_col %= N; end_col %= N; rc = pthread_create(&threads[k], NULL, oneThread, &tparams[k]); checkResult("!! pthread_create()
", rc, pflog); //fprintf(pflog, "********** %4d of %4d threads created **********
", k + 1, CC); } } fprintf(pflog, "@ Waiting for worker threads' end...
"); int* status = new int[CC]; for (i = 0; i < CC; ++i) { rc = pthread_join(threads[i], (void**)(&status[i])); checkResult("!! pthread_join()
", rc, pflog); } fprintf(pflog, "@ Check all thread's results
"); for (i = 0; i < CC; ++i) { if (status[i] != NULL) { fprintf(pflog, "!! Unexpected thread status
"); } } //TransformSquareMat(Z, N); // SafeDeleteSquareMat(X, N); SafeDeleteSquareMat(Y, N); SafeDeleteSquareMat(Z, N); clock_t finish = clock(); fprintf(pflog, "@ All finished. Total time:%.8f(sec).

", (finish - start) / (1.0*CLOCKS_PER_SEC)); } int main(int argc, char **argv) { FILE* pflog = fopen("trace_log.txt", "a"); const int N = 4096, C = 32; printf("Matrix N=%d,Thread C=%d, now running...", N, C*C); time_t rawtime; time(&rawtime); tm* tminfo = localtime(&rawtime); fprintf(pflog, "
NEW LOG @%s", asctime(tminfo)); OneTry(N, C, pflog); fflush(pflog); fclose(pflog); printf("finshed!
"); system("pause"); return 0; }

ログ・セクション
NEW LOG @Sun Apr 20 13:18:12 2014
== 4096 * 4096 Matrix Multiply, 1024 Threads. ==
**********     1 of 1024 threads created  **********
**********     2 of 1024 threads created  **********
**********     3 of 1024 threads created  **********
# Thread  '000033A8 00F91A80'  is now running.
# Thread  '000033A8 00F91B60'  is now running.
( ...)
@ Check all thread's results
@ All finished. Total time:2.57800000(sec).

NEW LOG @Sun Apr 20 13:18:42 2014
== 4096 * 4096 Matrix Multiply, 256 Threads. ==
**********     1 of  256 threads created  **********
**********     2 of  256 threads created  **********
**********     3 of  256 threads created  **********
**********     4 of  256 threads created  **********
# Thread  '00003470 01001A80'  is now running.
# Thread  '00003470 01001B60'  is now running.
# Thread  '00003470 01003578'  is now running.
# Thread  '00003470 01003888'  is now running.
**********     5 of  256 threads created  **********
( ...)
@ Check all thread's results
@ All finished. Total time:3.60900000(sec).

NEW LOG @Sun Apr 20 13:18:52 2014
== 4096 * 4096 Matrix Multiply, 64 Threads. ==
**********     1 of   64 threads created  **********
**********     2 of   64 threads created  **********
**********     3 of   64 threads created  **********
# Thread  '0000368C 009B1A80'  is now running.
**********     4 of   64 threads created  **********
# Thread  '0000368C 009B1B60'  is now running.
( ...)
# Thread  '0000368C 009B3888'  is now running.
@ Check all thread's results
@ All finished. Total time:6.90600000(sec).

NEW LOG @Sun Apr 20 13:29:52 2014
== 4096 * 4096 Matrix Multiply, 1024 Threads. ==
@ Waiting for worker threads' end...
@ Check all thread's results
@ All finished. Total time:2.44600000(sec).