Skip to content

Commit bd13b7b

Browse files
committed
enables apiCallCount for zgemm within client
1 parent 6f476b8 commit bd13b7b

File tree

2 files changed

+28
-6
lines changed

2 files changed

+28
-6
lines changed

src/client/clfunc_common.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -246,7 +246,7 @@ class clblasFunc
246246
props_[2] = 0;
247247
ctx_ = clCreateContext(props_, 1, &device_, NULL, NULL, &err);
248248
OPENCL_V_THROW(err, "creating context");
249-
queue_ = clCreateCommandQueue(ctx_, device_, 0, &err);
249+
queue_ = clCreateCommandQueue(ctx_, device_, CL_QUEUE_PROFILING_ENABLE, &err);
250250

251251

252252
timer_id = timer.getUniqueID( "clfunc", 0 );

src/client/clfunc_xgemm.hpp

Lines changed: 27 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1013,12 +1013,15 @@ void
10131013
xGemm<cl_double>::
10141014
xGemm_Function(bool flush, cl_uint apiCallCount )
10151015
{
1016-
clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
1016+
for (int i = 0; i < apiCallCount; i++)
1017+
{
1018+
clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
10171019
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
10181020
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
10191021
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
10201022
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
10211023
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
1024+
}
10221025
//flush==true if only the kernel time (library call) is timed
10231026
//flush==false if memory time is also timed
10241027
if (flush==true)
@@ -1032,12 +1035,15 @@ void
10321035
xGemm<cl_float2>::
10331036
xGemm_Function(bool flush, cl_uint apiCallCount )
10341037
{
1035-
clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
1038+
for (int i = 0; i < apiCallCount; i++)
1039+
{
1040+
clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
10361041
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
10371042
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
10381043
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
10391044
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
10401045
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
1046+
}
10411047
//flush==true if only the kernel time (library call) is timed
10421048
//flush==false if memory time is also timed
10431049
if (flush==true)
@@ -1051,12 +1057,28 @@ void
10511057
xGemm<cl_double2>::
10521058
xGemm_Function(bool flush, cl_uint apiCallCount )
10531059
{
1054-
clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
1060+
for (int i = 0; i < apiCallCount; i++)
1061+
{
1062+
clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
10551063
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
10561064
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
10571065
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
10581066
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
10591067
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
1068+
#if 0
1069+
// print kernel time
1070+
clFinish(queue_);
1071+
cl_ulong start, stop;
1072+
double time;
1073+
cl_int err;
1074+
err = clGetEventProfilingInfo( event_, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL );
1075+
if (err) printf("err = %i\n", err);
1076+
err = clGetEventProfilingInfo( event_, CL_PROFILING_COMMAND_END, sizeof(stop), &stop, NULL );
1077+
if (err) printf("err = %i\n", err);
1078+
time = (stop - start) / 1000000.0; // milliseconds
1079+
printf("kernel %lu -> %lu = %.f ms\n", start, stop, time );
1080+
#endif
1081+
}
10601082
//flush==true if only the kernel time (library call) is timed
10611083
//flush==false if memory time is also timed
10621084
if (flush==true)
@@ -1070,15 +1092,15 @@ double
10701092
xGemm<cl_float2>::
10711093
gflops()
10721094
{
1073-
return (8.0*buffer_.m_*buffer_.n_*buffer_.k_)/time_in_ns();
1095+
return (8.0*buffer_.m_*buffer_.n_*buffer_.k_)/(time_in_ns() / buffer_.apiCallCount);
10741096
}
10751097

10761098
template<>
10771099
double
10781100
xGemm<cl_double2>::
10791101
gflops()
10801102
{
1081-
return (8.0*buffer_.m_*buffer_.n_*buffer_.k_)/time_in_ns();
1103+
return (8.0*buffer_.m_*buffer_.n_*buffer_.k_)/(time_in_ns() / buffer_.apiCallCount);
10821104
}
10831105

10841106
template<>

0 commit comments

Comments
 (0)