Skip to content

Commit ecd89f9

Browse files
author
Timmy
committed
Merge pull request #133 from TimmyLiu/develop
Fix the performance drop of SGEMM (column-major NT or row-major TN) when lda and ldb are large multiples of 1024, such as 4096, 5120, 6144, 7168, or 8192.
2 parents ce984ae + 5a74faf commit ecd89f9

File tree

8 files changed

+992
-0
lines changed

8 files changed

+992
-0
lines changed

src/library/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ set(SRC_BLAS
8181
blas/functor/gcn_dgemmSmallMatrices.cc
8282
blas/functor/gcn_sgemmSmallMatrices.cc
8383
blas/functor/hawaii_sgemmBranchKernel.cc
84+
blas/functor/hawaii_sgemmBig1024Kernel.cc
8485
)
8586

8687
set(SRC_BLAS_HEADERS
@@ -114,6 +115,7 @@ set(SRC_BLAS_HEADERS
114115
blas/functor/include/gcn_dgemmSmallMatrices.h
115116
blas/functor/include/gcn_sgemmSmallMatrices.h
116117
blas/functor/include/hawaii_sgemmBranchKernel.h
118+
blas/functor/include/hawaii_sgemmBig1024Kernel.h
117119
)
118120

119121
set(SRC_BLAS_GENERIC
@@ -234,6 +236,7 @@ set (SRC_CL_TEMPLATES
234236
dtrsm_gpu192.cl
235237
dgemm_gcn_SmallMatrices.cl
236238
sgemm_gcn_SmallMatrices.cl
239+
sgemm_gcn_bigMatrices.cl
237240
sgemm_gcn.cl
238241
zgemm_gcn.cl
239242
)
@@ -253,6 +256,9 @@ set(SRC_CL_TEMPLATES_GEN
253256
sgemm_gcn_SmallMatrices.clHawaii_64.bin.cl
254257
sgemm_gcn_SmallMatrices.clTahiti_64.bin.cl
255258
sgemm_gcn_SmallMatrices.clBonaire_64.bin.cl
259+
sgemm_gcn_bigMatrices.clHawaii_64.bin.cl
260+
sgemm_gcn_bigMatrices.clTahiti_64.bin.cl
261+
sgemm_gcn_bigMatrices.clBonaire_64.bin.cl
256262
sgemm_gcn.clHawaii_64.bin.cl
257263
zgemm_gcn.clHawaii_64.bin.cl
258264
sgemm_gcn.clBonaire_64.bin.cl

src/library/bingen.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ ${CLTEMPLATE_PATH}/sgemm_hawaiiSplitKernel.cl
1515
${CLTEMPLATE_PATH}/sgemm_gcn.cl
1616
${CLTEMPLATE_PATH}/zgemm_gcn.cl
1717
${CLTEMPLATE_PATH}/sgemm_gcn_SmallMatrices.cl
18+
${CLTEMPLATE_PATH}/sgemm_gcn_bigMatrices.cl
1819
${CLTEMPLATE_PATH}/sgemm_hawaiiSplit64_32.cl
1920
${CLTEMPLATE_PATH}/dtrsm_gpu192.cl
2021
)

src/library/blas/functor/hawaii.cc

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "hawaii_sgemmSplit64_32.h"
2828
#include "gcn_zgemm.h"
2929
#include "gpu_dtrsm192.h"
30+
#include "hawaii_sgemmBig1024Kernel.h"
3031

3132
FunctorSelectorHawaii FunctorSelectorHawaii::instance ;
3233

@@ -116,6 +117,24 @@ clblasSgemmFunctor * FunctorSelectorHawaii::select_sgemm_specific(clblasSgemmFun
116117
//if (functor)
117118
// return functor;
118119

120+
if ((args.lda % 1024 == 0) && (args.ldb % 1024 == 0) && (args.K > args.lda / 4))
121+
{
122+
if ((args.lda == args.ldb) && (args.lda >= 4096) && (args.lda <= 8192)) // between 4096 and 8192 for now
123+
{
124+
if (args.lda != 6144)// 6144 is handled by a special case split
125+
{
126+
// we are going to call 16 GEMMs with M=M/2, N=N/2, K=K/4
127+
// each GEMM requires M%128 == 0, N%128 == 0, K%16 == 0
128+
if (args.M % 256 == 0 && args.N % 256 == 0 && args.K % 64 == 0)
129+
{
130+
functor = clBlashawaiiSgemmBig1024KernelFunctor::provide(args, "Hawaii");
131+
if (functor)
132+
return functor;
133+
}
134+
}
135+
}
136+
}
137+
119138
if ((args.M >= 1184 && args.N >= 1184) && (args.M <= 3872 && args.N <= 3872) && (args.M % 64 != 0 && args.N % 64 != 0) && (args.M % 96 != 0 && args.N % 96 != 0) && (args.K % 16 == 0))
120139
{
121140
// all the mod32 sizes that are not mod64 or mod96, ranging from 1184 to 3872

0 commit comments

Comments
 (0)