@@ -1013,12 +1013,15 @@ void
1013
1013
xGemm<cl_double>::
1014
1014
xGemm_Function (bool flush, cl_uint apiCallCount )
1015
1015
{
1016
- clblasDgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1016
+ for (int i = 0 ; i < apiCallCount; i++)
1017
+ {
1018
+ clblasDgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1017
1019
buffer_.m_ , buffer_.n_ , buffer_.k_ , buffer_.alpha_ ,
1018
1020
buffer_.buf_a_ , buffer_.offA_ , buffer_.lda_ ,
1019
1021
buffer_.buf_b_ , buffer_.offB_ , buffer_.ldb_ ,
1020
1022
buffer_.beta_ , buffer_.buf_c_ , buffer_.offC_ ,
1021
1023
buffer_.ldc_ , 1 , &queue_, 0 , NULL , &event_);
1024
+ }
1022
1025
// flush==true if only the kernel time (library call) is timed
1023
1026
// flush==false if memory time is also timed
1024
1027
if (flush==true )
@@ -1032,12 +1035,15 @@ void
1032
1035
xGemm<cl_float2>::
1033
1036
xGemm_Function (bool flush, cl_uint apiCallCount )
1034
1037
{
1035
- clblasCgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1038
+ for (int i = 0 ; i < apiCallCount; i++)
1039
+ {
1040
+ clblasCgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1036
1041
buffer_.m_ , buffer_.n_ , buffer_.k_ , buffer_.alpha_ ,
1037
1042
buffer_.buf_a_ , buffer_.offA_ , buffer_.lda_ ,
1038
1043
buffer_.buf_b_ , buffer_.offB_ , buffer_.ldb_ ,
1039
1044
buffer_.beta_ , buffer_.buf_c_ , buffer_.offC_ ,
1040
1045
buffer_.ldc_ , 1 , &queue_, 0 , NULL , &event_);
1046
+ }
1041
1047
// flush==true if only the kernel time (library call) is timed
1042
1048
// flush==false if memory time is also timed
1043
1049
if (flush==true )
@@ -1051,12 +1057,15 @@ void
1051
1057
xGemm<cl_double2>::
1052
1058
xGemm_Function (bool flush, cl_uint apiCallCount )
1053
1059
{
1054
- clblasZgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1060
+ for (int i = 0 ; i < apiCallCount; i++)
1061
+ {
1062
+ clblasZgemm (order_, buffer_.trans_a_ , buffer_.trans_b_ ,
1055
1063
buffer_.m_ , buffer_.n_ , buffer_.k_ , buffer_.alpha_ ,
1056
1064
buffer_.buf_a_ , buffer_.offA_ , buffer_.lda_ ,
1057
1065
buffer_.buf_b_ , buffer_.offB_ , buffer_.ldb_ ,
1058
1066
buffer_.beta_ , buffer_.buf_c_ , buffer_.offC_ ,
1059
1067
buffer_.ldc_ , 1 , &queue_, 0 , NULL , &event_);
1068
+ }
1060
1069
// flush==true if only the kernel time (library call) is timed
1061
1070
// flush==false if memory time is also timed
1062
1071
if (flush==true )
@@ -1070,15 +1079,15 @@ double
1070
1079
xGemm<cl_float2>::
1071
1080
gflops ()
1072
1081
{
1073
- return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/time_in_ns ();
1082
+ return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/( time_in_ns () / buffer_. apiCallCount );
1074
1083
}
1075
1084
1076
1085
template <>
1077
1086
double
1078
1087
xGemm<cl_double2>::
1079
1088
gflops ()
1080
1089
{
1081
- return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/time_in_ns ();
1090
+ return (8.0 *buffer_.m_ *buffer_.n_ *buffer_.k_ )/( time_in_ns () / buffer_. apiCallCount );
1082
1091
}
1083
1092
1084
1093
template <>
0 commit comments