AVX命令セット行列乗ベクトルアルゴリズム
2569 ワード
#include
#include
#include
void matmul_avx(const float *x, const float **w,float *y,const int col,const int row){
const int col_reduced_8 = col - col % 8;
float scratchpad[8];
__m256 op0, op1, tgt, tmp_vec;
for (int i = 0; i < row; i++) {
float res = 0;
tgt = _mm256_setzero_ps();
for (int j = 0; j < col_reduced_8; j += 8) {
op0 = __builtin_ia32_loadups256(&x[j]);
op1 = __builtin_ia32_loadups256(&w[i][j]);
tmp_vec = __builtin_ia32_mulps256(op0, op1);
tgt = __builtin_ia32_addps256(tmp_vec, tgt);
}
__builtin_ia32_storeups256(scratchpad, tgt);
for (int k = 0; k < 8; k++)
res += scratchpad[k];
for (int l = col_reduced_8; l < col; l++) {
res += w[i][l] * x[l];
}
y[i] = res;
}
}
int main() {
const int col = 2048, row = 512, num_mul = 10;
float **w;
float x[col];
float y[row];
float scratchpad[8];
w = (float **)malloc(sizeof(float*)*row);
for (int i = 0; i < row; i ++) { w[i] = (float *)malloc(sizeof(float) * col); }
for (int i = 0; i < row; i++) {
for (int j = 0; j < col; j++) {
w[i][j] = (float) (rand() % 1000) / 800.0f;
}
}
for (int j = 0; j < col; j++) {
x[j] = (float) (rand() % 1000) / 800.0f;
}
clock_t t1, t2;
// The original matrix multiplication version
t1 = clock();
for (int r = 0; r < num_mul; r++)
for (int j = 0; j < row; j++) {
float sum = 0;
float *wj = w[j];
for (int i = 0; i < col; i++)
sum += wj[i] * x[i];
y[j] = sum;
}
t2 = clock();
float diff = ((float) t2 - (float) t1) / (num_mul*CLOCKS_PER_SEC);
printf("
Time taken: %f second.
", diff);
for (int i = 0; i < row; i++) {
printf("%.4f, ", y[i]);
y[i]=0;
}
printf("
");
// The avx matrix multiplication version.
t1 = clock();
for (int r = 0; r < num_mul; r++)
matmul_avx(x,w,y,col,row);
t2 = clock();
diff = ((float) t2 - (float) t1) / (num_mul*CLOCKS_PER_SEC);
printf("
Time taken: %f second.
",diff);
for (int i = 0; i < row; i++) {
printf("%.4f, ", y[i]);
}
printf("
");
}
実行方法:
gcc -o test test.c -mavx
./test