@@ -1013,12 +1013,15 @@ void
10131013xGemm<cl_double>::
10141014xGemm_Function (bool flush, cl_uint apiCallCount )
10151015{
1016- clblasDgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1016+ for (int i = 0 ; i < apiCallCount; i++)
1017+ {
1018+ clblasDgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
10171019 buffer_.m_ , buffer_.n_ , buffer_.k_ , buffer_.alpha_ ,
10181020 buffer_.buf_a_ , buffer_.offA_ , buffer_.lda_ ,
10191021 buffer_.buf_b_ , buffer_.offB_ , buffer_.ldb_ ,
10201022 buffer_.beta_ , buffer_.buf_c_ , buffer_.offC_ ,
10211023 buffer_.ldc_ , 1 , &queue_, 0 , NULL , &event_);
1024+ }
10221025 // flush==true if only the kernel time (library call) is timed
10231026 // flush==false if memory time is also timed
10241027 if (flush==true )
@@ -1032,12 +1035,15 @@ void
10321035xGemm<cl_float2>::
10331036xGemm_Function (bool flush, cl_uint apiCallCount )
10341037{
1035- clblasCgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1038+ for (int i = 0 ; i < apiCallCount; i++)
1039+ {
1040+ clblasCgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
10361041 buffer_.m_ , buffer_.n_ , buffer_.k_ , buffer_.alpha_ ,
10371042 buffer_.buf_a_ , buffer_.offA_ , buffer_.lda_ ,
10381043 buffer_.buf_b_ , buffer_.offB_ , buffer_.ldb_ ,
10391044 buffer_.beta_ , buffer_.buf_c_ , buffer_.offC_ ,
10401045 buffer_.ldc_ , 1 , &queue_, 0 , NULL , &event_);
1046+ }
10411047 // flush==true if only the kernel time (library call) is timed
10421048 // flush==false if memory time is also timed
10431049 if (flush==true )
@@ -1051,12 +1057,28 @@ void
10511057xGemm<cl_double2>::
10521058xGemm_Function (bool flush, cl_uint apiCallCount )
10531059{
1054- clblasZgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1060+ for (int i = 0 ; i < apiCallCount; i++)
1061+ {
1062+ clblasZgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
10551063 buffer_.m_ , buffer_.n_ , buffer_.k_ , buffer_.alpha_ ,
10561064 buffer_.buf_a_ , buffer_.offA_ , buffer_.lda_ ,
10571065 buffer_.buf_b_ , buffer_.offB_ , buffer_.ldb_ ,
10581066 buffer_.beta_ , buffer_.buf_c_ , buffer_.offC_ ,
10591067 buffer_.ldc_ , 1 , &queue_, 0 , NULL , &event_);
1068+ #if 0
1069+ // print kernel time
1070+ clFinish(queue_);
1071+ cl_ulong start, stop;
1072+ double time;
1073+ cl_int err;
1074+ err = clGetEventProfilingInfo( event_, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL );
1075+ if (err) printf("err = %i\n", err);
1076+ err = clGetEventProfilingInfo( event_, CL_PROFILING_COMMAND_END, sizeof(stop), &stop, NULL );
1077+ if (err) printf("err = %i\n", err);
1078+ time = (stop - start) / 1000000.0; // milliseconds
1079+ printf("kernel %lu -> %lu = %.f ms\n", start, stop, time );
1080+ #endif
1081+ }
10601082 // flush==true if only the kernel time (library call) is timed
10611083 // flush==false if memory time is also timed
10621084 if (flush==true )
@@ -1070,15 +1092,15 @@ double
10701092xGemm<cl_float2>::
10711093gflops ()
10721094{
1073- return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/time_in_ns ();
1095+ return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/( time_in_ns () / buffer_. apiCallCount );
10741096}
10751097
10761098template <>
10771099double
10781100xGemm<cl_double2>::
10791101gflops ()
10801102{
1081- return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/time_in_ns ();
1103+ return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/( time_in_ns () / buffer_. apiCallCount );
10821104}
10831105
10841106template <>
0 commit comments