File tree Expand file tree Collapse file tree 1 file changed +9
-8
lines changed
Expand file tree Collapse file tree 1 file changed +9
-8
lines changed Original file line number Diff line number Diff line change 1010)
1111
1212code = """
13- #include <stdlib.h>
14-
1513#define N 4096
1614
1715void matrix_multiply(float *A, float *B, float *C) {
1816 #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
17+ float temp_sum = 0.0f;
1918 #pragma acc parallel vector_length(nthreads)
20- #pragma acc loop
21- for ( i = 0; i < N; i++) {
22- for ( j = 0; j < N; j++ ) {
23- for ( k = 0; k < N; k++ ) {
24- C[i][j] += A[i][k] * B[k][j];
19+ #pragma acc loop collapse(2) reduction(+:temp_sum)
20+ for ( int i = 0; i < N; i++) {
21+ for ( int j = 0; j < N; j++ ) {
22+ temp_sum = 0.0f;
23+ for ( int k = 0; k < N; k++ ) {
24+ temp_sum += A[(i * N) + k] * B[(k * N) + j];
2525 }
26+ C[(i * N) + j] = temp_sum;
2627 }
2728 }
2829 #pragma tuner stop
3738tune_params = dict ()
3839tune_params ["nthreads" ] = [32 * i for i in range (1 , 33 )]
3940metrics = dict ()
40- metrics ["GB/s" ] = lambda x : (4096 * 4096 * 4 ) / (x ["time" ] / 10 ** 3 ) / 10 ** 9
41+ metrics ["GB/s" ] = lambda x : (( 4096 * 4096 * 4096 * 2 * 4 ) + ( 4096 * 4096 * 4 ) ) / (x ["time" ] / 10 ** 3 ) / 10 ** 9
4142
4243tune_kernel (
4344 "mm" ,
You can’t perform that action at this time.
0 commit comments