@@ -1013,12 +1013,15 @@ void
1013
1013
xGemm<cl_double>::
1014
1014
xGemm_Function (bool flush, cl_uint apiCallCount )
1015
1015
{
1016
- clblasDgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1016
+ for (int i = 0 ; i < apiCallCount; i++)
1017
+ {
1018
+ clblasDgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1017
1019
buffer_.m_ , buffer_.n_ , buffer_.k_ , buffer_.alpha_ ,
1018
1020
buffer_.buf_a_ , buffer_.offA_ , buffer_.lda_ ,
1019
1021
buffer_.buf_b_ , buffer_.offB_ , buffer_.ldb_ ,
1020
1022
buffer_.beta_ , buffer_.buf_c_ , buffer_.offC_ ,
1021
1023
buffer_.ldc_ , 1 , &queue_, 0 , NULL , &event_);
1024
+ }
1022
1025
// flush==true if only the kernel time (library call) is timed
1023
1026
// flush==false if memory time is also timed
1024
1027
if (flush==true )
@@ -1032,12 +1035,15 @@ void
1032
1035
xGemm<cl_float2>::
1033
1036
xGemm_Function (bool flush, cl_uint apiCallCount )
1034
1037
{
1035
- clblasCgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1038
+ for (int i = 0 ; i < apiCallCount; i++)
1039
+ {
1040
+ clblasCgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1036
1041
buffer_.m_ , buffer_.n_ , buffer_.k_ , buffer_.alpha_ ,
1037
1042
buffer_.buf_a_ , buffer_.offA_ , buffer_.lda_ ,
1038
1043
buffer_.buf_b_ , buffer_.offB_ , buffer_.ldb_ ,
1039
1044
buffer_.beta_ , buffer_.buf_c_ , buffer_.offC_ ,
1040
1045
buffer_.ldc_ , 1 , &queue_, 0 , NULL , &event_);
1046
+ }
1041
1047
// flush==true if only the kernel time (library call) is timed
1042
1048
// flush==false if memory time is also timed
1043
1049
if (flush==true )
@@ -1051,12 +1057,28 @@ void
1051
1057
xGemm<cl_double2>::
1052
1058
xGemm_Function (bool flush, cl_uint apiCallCount )
1053
1059
{
1054
- clblasZgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1060
+ for (int i = 0 ; i < apiCallCount; i++)
1061
+ {
1062
+ clblasZgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1055
1063
buffer_.m_ , buffer_.n_ , buffer_.k_ , buffer_.alpha_ ,
1056
1064
buffer_.buf_a_ , buffer_.offA_ , buffer_.lda_ ,
1057
1065
buffer_.buf_b_ , buffer_.offB_ , buffer_.ldb_ ,
1058
1066
buffer_.beta_ , buffer_.buf_c_ , buffer_.offC_ ,
1059
1067
buffer_.ldc_ , 1 , &queue_, 0 , NULL , &event_);
1068
+ #if 0
1069
+ // print kernel time
1070
+ clFinish(queue_);
1071
+ cl_ulong start, stop;
1072
+ double time;
1073
+ cl_int err;
1074
+ err = clGetEventProfilingInfo( event_, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL );
1075
+ if (err) printf("err = %i\n", err);
1076
+ err = clGetEventProfilingInfo( event_, CL_PROFILING_COMMAND_END, sizeof(stop), &stop, NULL );
1077
+ if (err) printf("err = %i\n", err);
1078
+ time = (stop - start) / 1000000.0; // milliseconds
1079
+ printf("kernel %lu -> %lu = %.f ms\n", start, stop, time );
1080
+ #endif
1081
+ }
1060
1082
// flush==true if only the kernel time (library call) is timed
1061
1083
// flush==false if memory time is also timed
1062
1084
if (flush==true )
@@ -1070,15 +1092,15 @@ double
1070
1092
xGemm<cl_float2>::
1071
1093
gflops ()
1072
1094
{
1073
- return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/time_in_ns ();
1095
+ return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/( time_in_ns () / buffer_. apiCallCount );
1074
1096
}
1075
1097
1076
1098
template <>
1077
1099
double
1078
1100
xGemm<cl_double2>::
1079
1101
gflops ()
1080
1102
{
1081
- return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/time_in_ns ();
1103
+ return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/( time_in_ns () / buffer_. apiCallCount );
1082
1104
}
1083
1105
1084
1106
template <>
0 commit comments