Skip to content

Commit 8ef0a43

Browse files
author
Timmy
committed
some static kernel code clean up
1 parent 413819f commit 8ef0a43

File tree

1 file changed

+31
-11
lines changed

1 file changed

+31
-11
lines changed

src/library/blas/gens/clTemplates/sgemm_hawaiiSplitKernel.cl

Lines changed: 31 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -238,6 +238,7 @@ __kernel void sgemm_NT_1_96_16_16x16_6x6__ALPHABETA_SPLIT_ROW( __global float co
238238
{
239239
__local float* plA = lA + idy*97+idx;
240240
__local float* plB = lB + idy*97+idx;
241+
barrier(CLK_LOCAL_MEM_FENCE);
241242

242243
plB[0] = B[0+0*ldb];
243244
plB[16] = B[16+0*ldb];
@@ -355,6 +356,7 @@ __kernel void sgemm_NT_96_1_16_16x16_6x6__ALPHABETA_SPLIT_COLUMN( __global float
355356
{
356357
__local float* plA = lA + idy*97+idx;
357358
__local float* plB = lB + idy*97+idx;
359+
barrier(CLK_LOCAL_MEM_FENCE);
358360

359361
plB[0] = CurrentOffSetB>=N?0.0:B[0];
360362
plB[16] = CurrentOffSetB+16>=N?0.0:B[16];
@@ -472,6 +474,7 @@ __kernel void sgemm_NT_1_1_16_16x16_6x6__ALPHABETA_SPLIT_SINGLE( __global float
472474
{
473475
__local float* plA = lA + idy*97+idx;
474476
__local float* plB = lB + idy*97+idx;
477+
barrier(CLK_LOCAL_MEM_FENCE);
475478

476479
plB[0] = CurrentOffSetB>=N?0.0:B[0];
477480
plB[16] = CurrentOffSetB+16>=N?0.0:B[16];
@@ -602,7 +605,7 @@ static const char * sgemm_NT_16_SPLIT__ALPHA = "
602605
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
603606
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
604607
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
605-
barrier(CLK_LOCAL_MEM_FENCE);
608+
mem_fence(CLK_LOCAL_MEM_FENCE);
606609

607610
__attribute__((reqd_work_group_size(16,16,1)))
608611
__kernel void sgemm_NT_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float const * restrict A,
@@ -648,7 +651,7 @@ __kernel void sgemm_NT_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float cons
648651
//{
649652
__local float* plA = lA + idy*97+idx;
650653
__local float* plB = lB + idy*97+idx;
651-
// barrier(CLK_LOCAL_MEM_FENCE);
654+
barrier(CLK_LOCAL_MEM_FENCE);
652655
plB[0] = B[0+0*ldb];
653656
plB[16] = B[16+0*ldb];
654657
plB[32] = B[32+0*ldb];
@@ -787,6 +790,7 @@ __kernel void sgemm_NT_1_96_16_16x16_6x6__ALPHA_SPLIT_ROW( __global float const
787790
{
788791
__local float* plA = lA + idy*97+idx;
789792
__local float* plB = lB + idy*97+idx;
793+
barrier(CLK_LOCAL_MEM_FENCE);
790794

791795
plB[0] = B[0+0*ldb];
792796
plB[16] = B[16+0*ldb];
@@ -903,6 +907,7 @@ __kernel void sgemm_NT_96_1_16_16x16_6x6__ALPHA_SPLIT_COLUMN( __global float con
903907
{
904908
__local float* plA = lA + idy*97+idx;
905909
__local float* plB = lB + idy*97+idx;
910+
barrier(CLK_LOCAL_MEM_FENCE);
906911

907912
plB[0] = CurrentOffSetB>=N?0.0:B[0];
908913
plB[16] = CurrentOffSetB+16>=N?0.0:B[16];
@@ -1020,6 +1025,7 @@ __kernel void sgemm_NT_1_1_16_16x16_6x6__ALPHA_SPLIT_SINGLE( __global float cons
10201025
{
10211026
__local float* plA = lA + idy*97+idx;
10221027
__local float* plB = lB + idy*97+idx;
1028+
barrier(CLK_LOCAL_MEM_FENCE);
10231029

10241030
plB[0] = CurrentOffSetB>=N?0.0:B[0];
10251031
plB[16] = CurrentOffSetB+16>=N?0.0:B[16];
@@ -1830,7 +1836,7 @@ static const char * sgemm_NT_1_SPLIT__ALPHA = "
18301836
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
18311837
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
18321838
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
1833-
barrier(CLK_LOCAL_MEM_FENCE);
1839+
mem_fence(CLK_LOCAL_MEM_FENCE);
18341840

18351841
__attribute__((reqd_work_group_size(16,16,1)))
18361842
__kernel void sgemm_NT_96_96_1_16x16_6x6__ALPHA_SPLIT_MAIN( __global float const * restrict A,
@@ -2727,6 +2733,7 @@ __kernel void sgemm_NN_1_96_16_16x16_6x6__ALPHABETA_SPLIT_ROW( __global float co
27272733
{
27282734
__local float* plA = lA + idy*97+idx;
27292735
__local float* plB = lB + idx*97+idy;
2736+
barrier(CLK_LOCAL_MEM_FENCE);
27302737

27312738
plB[0] = B[0];
27322739
plB[16] = B[16*ldb];
@@ -2844,6 +2851,7 @@ __kernel void sgemm_NN_96_1_16_16x16_6x6__ALPHABETA_SPLIT_COLUMN( __global float
28442851
{
28452852
__local float* plA = lA + idy*97+idx;
28462853
__local float* plB = lB + idx*97+idy;
2854+
barrier(CLK_LOCAL_MEM_FENCE);
28472855

28482856
plB[0] = CurrentOffSetB>=N?0.0:B[0];
28492857
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -2962,6 +2970,7 @@ __kernel void sgemm_NN_1_1_16_16x16_6x6__ALPHABETA_SPLIT_SINGLE( __global float
29622970
{
29632971
__local float* plA = lA + idy*97+idx;
29642972
__local float* plB = lB + idx*97+idy;
2973+
barrier(CLK_LOCAL_MEM_FENCE);
29652974

29662975
plB[0] = CurrentOffSetB>=N?0.0:B[0];
29672976
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -3095,7 +3104,7 @@ static const char * sgemm_NN_16_SPLIT__ALPHA = "
30953104
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
30963105
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
30973106
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
3098-
barrier(CLK_LOCAL_MEM_FENCE);
3107+
mem_fence(CLK_LOCAL_MEM_FENCE);
30993108

31003109
__attribute__((reqd_work_group_size(16,16,1)))
31013110
__kernel void sgemm_NN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float const * restrict A,
@@ -3141,7 +3150,7 @@ __kernel void sgemm_NN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float cons
31413150
//{
31423151
__local float* plA = lA + idy*97+idx;
31433152
__local float* plB = lB + idx*97+idy;
3144-
// barrier(CLK_LOCAL_MEM_FENCE);
3153+
barrier(CLK_LOCAL_MEM_FENCE);
31453154
plB[0] = B[0];
31463155
plB[16] = B[16*ldb];
31473156
plB[32] = B[32*ldb];
@@ -3280,7 +3289,8 @@ __kernel void sgemm_NN_1_96_16_16x16_6x6__ALPHA_SPLIT_ROW( __global float const
32803289
{
32813290
__local float* plA = lA + idy*97+idx;
32823291
__local float* plB = lB + idx*97+idy;
3283-
3292+
barrier(CLK_LOCAL_MEM_FENCE);
3293+
32843294
plB[0] = B[0];
32853295
plB[16] = B[16*ldb];
32863296
plB[32] = B[32*ldb];
@@ -3396,6 +3406,7 @@ __kernel void sgemm_NN_96_1_16_16x16_6x6__ALPHA_SPLIT_COLUMN( __global float con
33963406
{
33973407
__local float* plA = lA + idy*97+idx;
33983408
__local float* plB = lB + idx*97+idy;
3409+
barrier(CLK_LOCAL_MEM_FENCE);
33993410

34003411
plB[0] = CurrentOffSetB>=N?0.0:B[0];
34013412
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -3513,7 +3524,8 @@ __kernel void sgemm_NN_1_1_16_16x16_6x6__ALPHA_SPLIT_SINGLE( __global float cons
35133524
{
35143525
__local float* plA = lA + idy*97+idx;
35153526
__local float* plB = lB + idx*97+idy;
3516-
3527+
barrier(CLK_LOCAL_MEM_FENCE);
3528+
35173529
plB[0] = CurrentOffSetB>=N?0.0:B[0];
35183530
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
35193531
plB[32] = CurrentOffSetB+32>=N?0.0:B[32*ldb];
@@ -3667,7 +3679,7 @@ static const char * sgemm_NN_1_SPLIT__ALPHABETA = "
36673679
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
36683680
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
36693681
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
3670-
barrier(CLK_LOCAL_MEM_FENCE);
3682+
mem_fence(CLK_LOCAL_MEM_FENCE);
36713683

36723684
__attribute__((reqd_work_group_size(16,16,1)))
36733685
__kernel void sgemm_NN_96_96_1_16x16_6x6__ALPHABETA_SPLIT_MAIN( __global float const * restrict A,
@@ -4400,7 +4412,7 @@ static const char * sgemm_NN_1_SPLIT__ALPHA = "
44004412
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
44014413
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
44024414
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
4403-
barrier(CLK_LOCAL_MEM_FENCE);
4415+
mem_fence(CLK_LOCAL_MEM_FENCE);
44044416

44054417
__attribute__((reqd_work_group_size(16,16,1)))
44064418
__kernel void sgemm_NN_96_96_1_16x16_6x6__ALPHA_SPLIT_MAIN( __global float const * restrict A,
@@ -5122,7 +5134,7 @@ static const char * sgemm_TN_16_SPLIT__ALPHABETA = "
51225134
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
51235135
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
51245136
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
5125-
barrier(CLK_LOCAL_MEM_FENCE);
5137+
mem_fence(CLK_LOCAL_MEM_FENCE);
51265138

51275139
__attribute__((reqd_work_group_size(16,16,1)))
51285140
__kernel void sgemm_TN_96_96_16_16x16_6x6__ALPHABETA_SPLIT_MAIN( __global float const * restrict A,
@@ -5167,6 +5179,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
51675179
{
51685180
__local float* plA = lA + idx*97+idy;
51695181
__local float* plB = lB + idx*97+idy;
5182+
barrier(CLK_LOCAL_MEM_FENCE);
51705183

51715184
plB[0] = B[0];
51725185
plB[16] = B[16*ldb];
@@ -5302,6 +5315,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
53025315
{
53035316
__local float* plA = lA + idx*97+idy;
53045317
__local float* plB = lB + idx*97+idy;
5318+
barrier(CLK_LOCAL_MEM_FENCE);
53055319

53065320
plB[0] = B[0];
53075321
plB[16] = B[16*ldb];
@@ -5419,6 +5433,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
54195433
{
54205434
__local float* plA = lA + idx*97+idy;
54215435
__local float* plB = lB + idx*97+idy;
5436+
barrier(CLK_LOCAL_MEM_FENCE);
54225437

54235438
plB[0] = CurrentOffSetB>=N?0.0:B[0];
54245439
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -5537,6 +5552,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
55375552
{
55385553
__local float* plA = lA + idx*97+idy;
55395554
__local float* plB = lB + idx*97+idy;
5555+
barrier(CLK_LOCAL_MEM_FENCE);
55405556

55415557
plB[0] = CurrentOffSetB>=N?0.0:B[0];
55425558
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -5668,7 +5684,7 @@ static const char * sgemm_TN_16_SPLIT__ALPHA = "
56685684
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
56695685
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
56705686
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
5671-
barrier(CLK_LOCAL_MEM_FENCE);
5687+
mem_fence(CLK_LOCAL_MEM_FENCE);
56725688

56735689
__attribute__((reqd_work_group_size(16,16,1)))
56745690
__kernel void sgemm_TN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float const * restrict A,
@@ -5712,6 +5728,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
57125728
{
57135729
__local float* plA = lA + idx*97+idy;
57145730
__local float* plB = lB + idx*97+idy;
5731+
barrier(CLK_LOCAL_MEM_FENCE);
57155732

57165733
plB[0] = B[0];
57175734
plB[16] = B[16*ldb];
@@ -5846,6 +5863,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
58465863
{
58475864
__local float* plA = lA + idx*97+idy;
58485865
__local float* plB = lB + idx*97+idy;
5866+
barrier(CLK_LOCAL_MEM_FENCE);
58495867

58505868
plB[0] = B[0];
58515869
plB[16] = B[16*ldb];
@@ -5962,6 +5980,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
59625980
{
59635981
__local float* plA = lA + idx*97+idy;
59645982
__local float* plB = lB + idx*97+idy;
5983+
barrier(CLK_LOCAL_MEM_FENCE);
59655984

59665985
plB[0] = CurrentOffSetB>=N?0.0:B[0];
59675986
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];
@@ -6079,6 +6098,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
60796098
{
60806099
__local float* plA = lA + idx*97+idy;
60816100
__local float* plB = lB + idx*97+idy;
6101+
barrier(CLK_LOCAL_MEM_FENCE);
60826102

60836103
plB[0] = CurrentOffSetB>=N?0.0:B[0];
60846104
plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb];

0 commit comments

Comments
 (0)