@@ -238,6 +238,7 @@ __kernel void sgemm_NT_1_96_16_16x16_6x6__ALPHABETA_SPLIT_ROW( __global float co
238238 {
239239 __local float * plA = lA + idy * 97 + idx ;
240240 __local float * plB = lB + idy * 97 + idx ;
241+ barrier (CLK_LOCAL_MEM_FENCE );
241242
242243 plB [0 ] = B [0 + 0 * ldb ];
243244 plB [16 ] = B [16 + 0 * ldb ];
@@ -355,6 +356,7 @@ __kernel void sgemm_NT_96_1_16_16x16_6x6__ALPHABETA_SPLIT_COLUMN( __global float
355356 {
356357 __local float * plA = lA + idy * 97 + idx ;
357358 __local float * plB = lB + idy * 97 + idx ;
359+ barrier (CLK_LOCAL_MEM_FENCE );
358360
359361 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
360362 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 ];
@@ -472,6 +474,7 @@ __kernel void sgemm_NT_1_1_16_16x16_6x6__ALPHABETA_SPLIT_SINGLE( __global float
472474 {
473475 __local float * plA = lA + idy * 97 + idx ;
474476 __local float * plB = lB + idy * 97 + idx ;
477+ barrier (CLK_LOCAL_MEM_FENCE );
475478
476479 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
477480 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 ];
@@ -602,7 +605,7 @@ static const char * sgemm_NT_16_SPLIT__ALPHA = "
602605 rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
603606 rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
604607 rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
605- barrier (CLK_LOCAL_MEM_FENCE);
608+ mem_fence (CLK_LOCAL_MEM_FENCE);
606609
607610__attribute__((reqd_work_group_size (16 ,16 ,1 )))
608611__kernel void sgemm_NT_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -648,7 +651,7 @@ __kernel void sgemm_NT_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float cons
648651 //{
649652 __local float * plA = lA + idy * 97 + idx ;
650653 __local float * plB = lB + idy * 97 + idx ;
651- // barrier(CLK_LOCAL_MEM_FENCE);
654+ barrier (CLK_LOCAL_MEM_FENCE );
652655 plB [0 ] = B [0 + 0 * ldb ];
653656 plB [16 ] = B [16 + 0 * ldb ];
654657 plB [32 ] = B [32 + 0 * ldb ];
@@ -787,6 +790,7 @@ __kernel void sgemm_NT_1_96_16_16x16_6x6__ALPHA_SPLIT_ROW( __global float const
787790 {
788791 __local float * plA = lA + idy * 97 + idx ;
789792 __local float * plB = lB + idy * 97 + idx ;
793+ barrier (CLK_LOCAL_MEM_FENCE );
790794
791795 plB [0 ] = B [0 + 0 * ldb ];
792796 plB [16 ] = B [16 + 0 * ldb ];
@@ -903,6 +907,7 @@ __kernel void sgemm_NT_96_1_16_16x16_6x6__ALPHA_SPLIT_COLUMN( __global float con
903907 {
904908 __local float * plA = lA + idy * 97 + idx ;
905909 __local float * plB = lB + idy * 97 + idx ;
910+ barrier (CLK_LOCAL_MEM_FENCE );
906911
907912 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
908913 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 ];
@@ -1020,6 +1025,7 @@ __kernel void sgemm_NT_1_1_16_16x16_6x6__ALPHA_SPLIT_SINGLE( __global float cons
10201025 {
10211026 __local float * plA = lA + idy * 97 + idx ;
10221027 __local float * plB = lB + idy * 97 + idx ;
1028+ barrier (CLK_LOCAL_MEM_FENCE );
10231029
10241030 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
10251031 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 ];
@@ -1830,7 +1836,7 @@ static const char * sgemm_NT_1_SPLIT__ALPHA = "
18301836 rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
18311837 rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
18321838 rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
1833- barrier (CLK_LOCAL_MEM_FENCE);
1839+ mem_fence (CLK_LOCAL_MEM_FENCE);
18341840
18351841__attribute__((reqd_work_group_size (16 ,16 ,1 )))
18361842__kernel void sgemm_NT_96_96_1_16x16_6x6__ALPHA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -2727,6 +2733,7 @@ __kernel void sgemm_NN_1_96_16_16x16_6x6__ALPHABETA_SPLIT_ROW( __global float co
27272733 {
27282734 __local float * plA = lA + idy * 97 + idx ;
27292735 __local float * plB = lB + idx * 97 + idy ;
2736+ barrier (CLK_LOCAL_MEM_FENCE );
27302737
27312738 plB [0 ] = B [0 ];
27322739 plB [16 ] = B [16 * ldb ];
@@ -2844,6 +2851,7 @@ __kernel void sgemm_NN_96_1_16_16x16_6x6__ALPHABETA_SPLIT_COLUMN( __global float
28442851 {
28452852 __local float * plA = lA + idy * 97 + idx ;
28462853 __local float * plB = lB + idx * 97 + idy ;
2854+ barrier (CLK_LOCAL_MEM_FENCE );
28472855
28482856 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
28492857 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -2962,6 +2970,7 @@ __kernel void sgemm_NN_1_1_16_16x16_6x6__ALPHABETA_SPLIT_SINGLE( __global float
29622970 {
29632971 __local float * plA = lA + idy * 97 + idx ;
29642972 __local float * plB = lB + idx * 97 + idy ;
2973+ barrier (CLK_LOCAL_MEM_FENCE );
29652974
29662975 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
29672976 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -3095,7 +3104,7 @@ static const char * sgemm_NN_16_SPLIT__ALPHA = "
30953104 rC [3 ][5 ]= mad (rA [0 ][3 ],rB [0 ][5 ],rC [3 ][5 ]); \
30963105 rC [4 ][5 ]= mad (rA [0 ][4 ],rB [0 ][5 ],rC [4 ][5 ]); \
30973106 rC [5 ][5 ]= mad (rA [0 ][5 ],rB [0 ][5 ],rC [5 ][5 ]); \
3098- barrier (CLK_LOCAL_MEM_FENCE );
3107+ mem_fence (CLK_LOCAL_MEM_FENCE );
30993108
31003109__attribute__((reqd_work_group_size (16 ,16 ,1 )))
31013110__kernel void sgemm_NN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -3141,7 +3150,7 @@ __kernel void sgemm_NN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float cons
31413150 //{
31423151 __local float * plA = lA + idy * 97 + idx ;
31433152 __local float * plB = lB + idx * 97 + idy ;
3144- // barrier(CLK_LOCAL_MEM_FENCE);
3153+ barrier (CLK_LOCAL_MEM_FENCE );
31453154 plB [0 ] = B [0 ];
31463155 plB [16 ] = B [16 * ldb ];
31473156 plB [32 ] = B [32 * ldb ];
@@ -3280,7 +3289,8 @@ __kernel void sgemm_NN_1_96_16_16x16_6x6__ALPHA_SPLIT_ROW( __global float const
32803289 {
32813290 __local float * plA = lA + idy * 97 + idx ;
32823291 __local float * plB = lB + idx * 97 + idy ;
3283-
3292+ barrier (CLK_LOCAL_MEM_FENCE );
3293+
32843294 plB [0 ] = B [0 ];
32853295 plB [16 ] = B [16 * ldb ];
32863296 plB [32 ] = B [32 * ldb ];
@@ -3396,6 +3406,7 @@ __kernel void sgemm_NN_96_1_16_16x16_6x6__ALPHA_SPLIT_COLUMN( __global float con
33963406 {
33973407 __local float * plA = lA + idy * 97 + idx ;
33983408 __local float * plB = lB + idx * 97 + idy ;
3409+ barrier (CLK_LOCAL_MEM_FENCE );
33993410
34003411 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
34013412 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -3513,7 +3524,8 @@ __kernel void sgemm_NN_1_1_16_16x16_6x6__ALPHA_SPLIT_SINGLE( __global float cons
35133524 {
35143525 __local float * plA = lA + idy * 97 + idx ;
35153526 __local float * plB = lB + idx * 97 + idy ;
3516-
3527+ barrier (CLK_LOCAL_MEM_FENCE );
3528+
35173529 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
35183530 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
35193531 plB [32 ] = CurrentOffSetB + 32 >=N ?0.0 :B [32 * ldb ];
@@ -3667,7 +3679,7 @@ static const char * sgemm_NN_1_SPLIT__ALPHABETA = "
36673679 rC [3 ][5 ]= mad (rA [0 ][3 ],rB [0 ][5 ],rC [3 ][5 ]); \
36683680 rC [4 ][5 ]= mad (rA [0 ][4 ],rB [0 ][5 ],rC [4 ][5 ]); \
36693681 rC [5 ][5 ]= mad (rA [0 ][5 ],rB [0 ][5 ],rC [5 ][5 ]); \
3670- barrier (CLK_LOCAL_MEM_FENCE );
3682+ mem_fence (CLK_LOCAL_MEM_FENCE );
36713683
36723684__attribute__((reqd_work_group_size (16 ,16 ,1 )))
36733685__kernel void sgemm_NN_96_96_1_16x16_6x6__ALPHABETA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -4400,7 +4412,7 @@ static const char * sgemm_NN_1_SPLIT__ALPHA = "
44004412 rC [3 ][5 ]= mad (rA [0 ][3 ],rB [0 ][5 ],rC [3 ][5 ]); \
44014413 rC [4 ][5 ]= mad (rA [0 ][4 ],rB [0 ][5 ],rC [4 ][5 ]); \
44024414 rC [5 ][5 ]= mad (rA [0 ][5 ],rB [0 ][5 ],rC [5 ][5 ]); \
4403- barrier (CLK_LOCAL_MEM_FENCE );
4415+ mem_fence (CLK_LOCAL_MEM_FENCE );
44044416
44054417__attribute__((reqd_work_group_size (16 ,16 ,1 )))
44064418__kernel void sgemm_NN_96_96_1_16x16_6x6__ALPHA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -5122,7 +5134,7 @@ static const char * sgemm_TN_16_SPLIT__ALPHABETA = "
51225134 rC [3 ][5 ]= mad (rA [0 ][3 ],rB [0 ][5 ],rC [3 ][5 ]); \
51235135 rC [4 ][5 ]= mad (rA [0 ][4 ],rB [0 ][5 ],rC [4 ][5 ]); \
51245136 rC [5 ][5 ]= mad (rA [0 ][5 ],rB [0 ][5 ],rC [5 ][5 ]); \
5125- barrier (CLK_LOCAL_MEM_FENCE );
5137+ mem_fence (CLK_LOCAL_MEM_FENCE );
51265138
51275139__attribute__((reqd_work_group_size (16 ,16 ,1 )))
51285140 __kernel void sgemm_TN_96_96_16_16x16_6x6__ALPHABETA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -5167,6 +5179,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
51675179 {
51685180 __local float * plA = lA + idx * 97 + idy ;
51695181 __local float * plB = lB + idx * 97 + idy ;
5182+ barrier (CLK_LOCAL_MEM_FENCE );
51705183
51715184 plB [0 ] = B [0 ];
51725185 plB [16 ] = B [16 * ldb ];
@@ -5302,6 +5315,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
53025315 {
53035316 __local float * plA = lA + idx * 97 + idy ;
53045317 __local float * plB = lB + idx * 97 + idy ;
5318+ barrier (CLK_LOCAL_MEM_FENCE );
53055319
53065320 plB [0 ] = B [0 ];
53075321 plB [16 ] = B [16 * ldb ];
@@ -5419,6 +5433,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
54195433 {
54205434 __local float * plA = lA + idx * 97 + idy ;
54215435 __local float * plB = lB + idx * 97 + idy ;
5436+ barrier (CLK_LOCAL_MEM_FENCE );
54225437
54235438 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
54245439 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -5537,6 +5552,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
55375552 {
55385553 __local float * plA = lA + idx * 97 + idy ;
55395554 __local float * plB = lB + idx * 97 + idy ;
5555+ barrier (CLK_LOCAL_MEM_FENCE );
55405556
55415557 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
55425558 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -5668,7 +5684,7 @@ static const char * sgemm_TN_16_SPLIT__ALPHA = "
56685684 rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
56695685 rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
56705686 rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
5671- barrier (CLK_LOCAL_MEM_FENCE);
5687+ mem_fence (CLK_LOCAL_MEM_FENCE);
56725688
56735689__attribute__((reqd_work_group_size (16 ,16 ,1 )))
56745690 __kernel void sgemm_TN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -5712,6 +5728,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
57125728 {
57135729 __local float * plA = lA + idx * 97 + idy ;
57145730 __local float * plB = lB + idx * 97 + idy ;
5731+ barrier (CLK_LOCAL_MEM_FENCE );
57155732
57165733 plB [0 ] = B [0 ];
57175734 plB [16 ] = B [16 * ldb ];
@@ -5846,6 +5863,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
58465863 {
58475864 __local float * plA = lA + idx * 97 + idy ;
58485865 __local float * plB = lB + idx * 97 + idy ;
5866+ barrier (CLK_LOCAL_MEM_FENCE );
58495867
58505868 plB [0 ] = B [0 ];
58515869 plB [16 ] = B [16 * ldb ];
@@ -5962,6 +5980,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
59625980 {
59635981 __local float * plA = lA + idx * 97 + idy ;
59645982 __local float * plB = lB + idx * 97 + idy ;
5983+ barrier (CLK_LOCAL_MEM_FENCE );
59655984
59665985 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
59675986 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -6079,6 +6098,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
60796098 {
60806099 __local float * plA = lA + idx * 97 + idy ;
60816100 __local float * plB = lB + idx * 97 + idy ;
6101+ barrier (CLK_LOCAL_MEM_FENCE );
60826102
60836103 plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
60846104 plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
0 commit comments