@@ -238,6 +238,7 @@ __kernel void sgemm_NT_1_96_16_16x16_6x6__ALPHABETA_SPLIT_ROW( __global float co
238
238
{
239
239
__local float * plA = lA + idy * 97 + idx ;
240
240
__local float * plB = lB + idy * 97 + idx ;
241
+ barrier (CLK_LOCAL_MEM_FENCE );
241
242
242
243
plB [0 ] = B [0 + 0 * ldb ];
243
244
plB [16 ] = B [16 + 0 * ldb ];
@@ -355,6 +356,7 @@ __kernel void sgemm_NT_96_1_16_16x16_6x6__ALPHABETA_SPLIT_COLUMN( __global float
355
356
{
356
357
__local float * plA = lA + idy * 97 + idx ;
357
358
__local float * plB = lB + idy * 97 + idx ;
359
+ barrier (CLK_LOCAL_MEM_FENCE );
358
360
359
361
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
360
362
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 ];
@@ -472,6 +474,7 @@ __kernel void sgemm_NT_1_1_16_16x16_6x6__ALPHABETA_SPLIT_SINGLE( __global float
472
474
{
473
475
__local float * plA = lA + idy * 97 + idx ;
474
476
__local float * plB = lB + idy * 97 + idx ;
477
+ barrier (CLK_LOCAL_MEM_FENCE );
475
478
476
479
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
477
480
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 ];
@@ -602,7 +605,7 @@ static const char * sgemm_NT_16_SPLIT__ALPHA = "
602
605
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
603
606
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
604
607
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
605
- barrier (CLK_LOCAL_MEM_FENCE);
608
+ mem_fence (CLK_LOCAL_MEM_FENCE);
606
609
607
610
__attribute__((reqd_work_group_size (16 ,16 ,1 )))
608
611
__kernel void sgemm_NT_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -648,7 +651,7 @@ __kernel void sgemm_NT_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float cons
648
651
//{
649
652
__local float * plA = lA + idy * 97 + idx ;
650
653
__local float * plB = lB + idy * 97 + idx ;
651
- // barrier(CLK_LOCAL_MEM_FENCE);
654
+ barrier (CLK_LOCAL_MEM_FENCE );
652
655
plB [0 ] = B [0 + 0 * ldb ];
653
656
plB [16 ] = B [16 + 0 * ldb ];
654
657
plB [32 ] = B [32 + 0 * ldb ];
@@ -787,6 +790,7 @@ __kernel void sgemm_NT_1_96_16_16x16_6x6__ALPHA_SPLIT_ROW( __global float const
787
790
{
788
791
__local float * plA = lA + idy * 97 + idx ;
789
792
__local float * plB = lB + idy * 97 + idx ;
793
+ barrier (CLK_LOCAL_MEM_FENCE );
790
794
791
795
plB [0 ] = B [0 + 0 * ldb ];
792
796
plB [16 ] = B [16 + 0 * ldb ];
@@ -903,6 +907,7 @@ __kernel void sgemm_NT_96_1_16_16x16_6x6__ALPHA_SPLIT_COLUMN( __global float con
903
907
{
904
908
__local float * plA = lA + idy * 97 + idx ;
905
909
__local float * plB = lB + idy * 97 + idx ;
910
+ barrier (CLK_LOCAL_MEM_FENCE );
906
911
907
912
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
908
913
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 ];
@@ -1020,6 +1025,7 @@ __kernel void sgemm_NT_1_1_16_16x16_6x6__ALPHA_SPLIT_SINGLE( __global float cons
1020
1025
{
1021
1026
__local float * plA = lA + idy * 97 + idx ;
1022
1027
__local float * plB = lB + idy * 97 + idx ;
1028
+ barrier (CLK_LOCAL_MEM_FENCE );
1023
1029
1024
1030
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
1025
1031
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 ];
@@ -1830,7 +1836,7 @@ static const char * sgemm_NT_1_SPLIT__ALPHA = "
1830
1836
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
1831
1837
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
1832
1838
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
1833
- barrier (CLK_LOCAL_MEM_FENCE);
1839
+ mem_fence (CLK_LOCAL_MEM_FENCE);
1834
1840
1835
1841
__attribute__((reqd_work_group_size (16 ,16 ,1 )))
1836
1842
__kernel void sgemm_NT_96_96_1_16x16_6x6__ALPHA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -2727,6 +2733,7 @@ __kernel void sgemm_NN_1_96_16_16x16_6x6__ALPHABETA_SPLIT_ROW( __global float co
2727
2733
{
2728
2734
__local float * plA = lA + idy * 97 + idx ;
2729
2735
__local float * plB = lB + idx * 97 + idy ;
2736
+ barrier (CLK_LOCAL_MEM_FENCE );
2730
2737
2731
2738
plB [0 ] = B [0 ];
2732
2739
plB [16 ] = B [16 * ldb ];
@@ -2844,6 +2851,7 @@ __kernel void sgemm_NN_96_1_16_16x16_6x6__ALPHABETA_SPLIT_COLUMN( __global float
2844
2851
{
2845
2852
__local float * plA = lA + idy * 97 + idx ;
2846
2853
__local float * plB = lB + idx * 97 + idy ;
2854
+ barrier (CLK_LOCAL_MEM_FENCE );
2847
2855
2848
2856
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
2849
2857
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -2962,6 +2970,7 @@ __kernel void sgemm_NN_1_1_16_16x16_6x6__ALPHABETA_SPLIT_SINGLE( __global float
2962
2970
{
2963
2971
__local float * plA = lA + idy * 97 + idx ;
2964
2972
__local float * plB = lB + idx * 97 + idy ;
2973
+ barrier (CLK_LOCAL_MEM_FENCE );
2965
2974
2966
2975
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
2967
2976
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -3095,7 +3104,7 @@ static const char * sgemm_NN_16_SPLIT__ALPHA = "
3095
3104
rC [3 ][5 ]= mad (rA [0 ][3 ],rB [0 ][5 ],rC [3 ][5 ]); \
3096
3105
rC [4 ][5 ]= mad (rA [0 ][4 ],rB [0 ][5 ],rC [4 ][5 ]); \
3097
3106
rC [5 ][5 ]= mad (rA [0 ][5 ],rB [0 ][5 ],rC [5 ][5 ]); \
3098
- barrier (CLK_LOCAL_MEM_FENCE );
3107
+ mem_fence (CLK_LOCAL_MEM_FENCE );
3099
3108
3100
3109
__attribute__((reqd_work_group_size (16 ,16 ,1 )))
3101
3110
__kernel void sgemm_NN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -3141,7 +3150,7 @@ __kernel void sgemm_NN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN( __global float cons
3141
3150
//{
3142
3151
__local float * plA = lA + idy * 97 + idx ;
3143
3152
__local float * plB = lB + idx * 97 + idy ;
3144
- // barrier(CLK_LOCAL_MEM_FENCE);
3153
+ barrier (CLK_LOCAL_MEM_FENCE );
3145
3154
plB [0 ] = B [0 ];
3146
3155
plB [16 ] = B [16 * ldb ];
3147
3156
plB [32 ] = B [32 * ldb ];
@@ -3280,7 +3289,8 @@ __kernel void sgemm_NN_1_96_16_16x16_6x6__ALPHA_SPLIT_ROW( __global float const
3280
3289
{
3281
3290
__local float * plA = lA + idy * 97 + idx ;
3282
3291
__local float * plB = lB + idx * 97 + idy ;
3283
-
3292
+ barrier (CLK_LOCAL_MEM_FENCE );
3293
+
3284
3294
plB [0 ] = B [0 ];
3285
3295
plB [16 ] = B [16 * ldb ];
3286
3296
plB [32 ] = B [32 * ldb ];
@@ -3396,6 +3406,7 @@ __kernel void sgemm_NN_96_1_16_16x16_6x6__ALPHA_SPLIT_COLUMN( __global float con
3396
3406
{
3397
3407
__local float * plA = lA + idy * 97 + idx ;
3398
3408
__local float * plB = lB + idx * 97 + idy ;
3409
+ barrier (CLK_LOCAL_MEM_FENCE );
3399
3410
3400
3411
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
3401
3412
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -3513,7 +3524,8 @@ __kernel void sgemm_NN_1_1_16_16x16_6x6__ALPHA_SPLIT_SINGLE( __global float cons
3513
3524
{
3514
3525
__local float * plA = lA + idy * 97 + idx ;
3515
3526
__local float * plB = lB + idx * 97 + idy ;
3516
-
3527
+ barrier (CLK_LOCAL_MEM_FENCE );
3528
+
3517
3529
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
3518
3530
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
3519
3531
plB [32 ] = CurrentOffSetB + 32 >=N ?0.0 :B [32 * ldb ];
@@ -3667,7 +3679,7 @@ static const char * sgemm_NN_1_SPLIT__ALPHABETA = "
3667
3679
rC [3 ][5 ]= mad (rA [0 ][3 ],rB [0 ][5 ],rC [3 ][5 ]); \
3668
3680
rC [4 ][5 ]= mad (rA [0 ][4 ],rB [0 ][5 ],rC [4 ][5 ]); \
3669
3681
rC [5 ][5 ]= mad (rA [0 ][5 ],rB [0 ][5 ],rC [5 ][5 ]); \
3670
- barrier (CLK_LOCAL_MEM_FENCE );
3682
+ mem_fence (CLK_LOCAL_MEM_FENCE );
3671
3683
3672
3684
__attribute__((reqd_work_group_size (16 ,16 ,1 )))
3673
3685
__kernel void sgemm_NN_96_96_1_16x16_6x6__ALPHABETA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -4400,7 +4412,7 @@ static const char * sgemm_NN_1_SPLIT__ALPHA = "
4400
4412
rC [3 ][5 ]= mad (rA [0 ][3 ],rB [0 ][5 ],rC [3 ][5 ]); \
4401
4413
rC [4 ][5 ]= mad (rA [0 ][4 ],rB [0 ][5 ],rC [4 ][5 ]); \
4402
4414
rC [5 ][5 ]= mad (rA [0 ][5 ],rB [0 ][5 ],rC [5 ][5 ]); \
4403
- barrier (CLK_LOCAL_MEM_FENCE );
4415
+ mem_fence (CLK_LOCAL_MEM_FENCE );
4404
4416
4405
4417
__attribute__((reqd_work_group_size (16 ,16 ,1 )))
4406
4418
__kernel void sgemm_NN_96_96_1_16x16_6x6__ALPHA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -5122,7 +5134,7 @@ static const char * sgemm_TN_16_SPLIT__ALPHABETA = "
5122
5134
rC [3 ][5 ]= mad (rA [0 ][3 ],rB [0 ][5 ],rC [3 ][5 ]); \
5123
5135
rC [4 ][5 ]= mad (rA [0 ][4 ],rB [0 ][5 ],rC [4 ][5 ]); \
5124
5136
rC [5 ][5 ]= mad (rA [0 ][5 ],rB [0 ][5 ],rC [5 ][5 ]); \
5125
- barrier (CLK_LOCAL_MEM_FENCE );
5137
+ mem_fence (CLK_LOCAL_MEM_FENCE );
5126
5138
5127
5139
__attribute__((reqd_work_group_size (16 ,16 ,1 )))
5128
5140
__kernel void sgemm_TN_96_96_16_16x16_6x6__ALPHABETA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -5167,6 +5179,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
5167
5179
{
5168
5180
__local float * plA = lA + idx * 97 + idy ;
5169
5181
__local float * plB = lB + idx * 97 + idy ;
5182
+ barrier (CLK_LOCAL_MEM_FENCE );
5170
5183
5171
5184
plB [0 ] = B [0 ];
5172
5185
plB [16 ] = B [16 * ldb ];
@@ -5302,6 +5315,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
5302
5315
{
5303
5316
__local float * plA = lA + idx * 97 + idy ;
5304
5317
__local float * plB = lB + idx * 97 + idy ;
5318
+ barrier (CLK_LOCAL_MEM_FENCE );
5305
5319
5306
5320
plB [0 ] = B [0 ];
5307
5321
plB [16 ] = B [16 * ldb ];
@@ -5419,6 +5433,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
5419
5433
{
5420
5434
__local float * plA = lA + idx * 97 + idy ;
5421
5435
__local float * plB = lB + idx * 97 + idy ;
5436
+ barrier (CLK_LOCAL_MEM_FENCE );
5422
5437
5423
5438
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
5424
5439
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -5537,6 +5552,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
5537
5552
{
5538
5553
__local float * plA = lA + idx * 97 + idy ;
5539
5554
__local float * plB = lB + idx * 97 + idy ;
5555
+ barrier (CLK_LOCAL_MEM_FENCE );
5540
5556
5541
5557
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
5542
5558
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -5668,7 +5684,7 @@ static const char * sgemm_TN_16_SPLIT__ALPHA = "
5668
5684
rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \
5669
5685
rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \
5670
5686
rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \
5671
- barrier (CLK_LOCAL_MEM_FENCE);
5687
+ mem_fence (CLK_LOCAL_MEM_FENCE);
5672
5688
5673
5689
__attribute__((reqd_work_group_size (16 ,16 ,1 )))
5674
5690
__kernel void sgemm_TN_96_96_16_16x16_6x6__ALPHA_SPLIT_MAIN ( __global float const * restrict A ,
@@ -5712,6 +5728,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
5712
5728
{
5713
5729
__local float * plA = lA + idx * 97 + idy ;
5714
5730
__local float * plB = lB + idx * 97 + idy ;
5731
+ barrier (CLK_LOCAL_MEM_FENCE );
5715
5732
5716
5733
plB [0 ] = B [0 ];
5717
5734
plB [16 ] = B [16 * ldb ];
@@ -5846,6 +5863,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
5846
5863
{
5847
5864
__local float * plA = lA + idx * 97 + idy ;
5848
5865
__local float * plB = lB + idx * 97 + idy ;
5866
+ barrier (CLK_LOCAL_MEM_FENCE );
5849
5867
5850
5868
plB [0 ] = B [0 ];
5851
5869
plB [16 ] = B [16 * ldb ];
@@ -5962,6 +5980,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
5962
5980
{
5963
5981
__local float * plA = lA + idx * 97 + idy ;
5964
5982
__local float * plB = lB + idx * 97 + idy ;
5983
+ barrier (CLK_LOCAL_MEM_FENCE );
5965
5984
5966
5985
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
5967
5986
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
@@ -6079,6 +6098,7 @@ __attribute__((reqd_work_group_size(16,16,1)))
6079
6098
{
6080
6099
__local float * plA = lA + idx * 97 + idy ;
6081
6100
__local float * plB = lB + idx * 97 + idy ;
6101
+ barrier (CLK_LOCAL_MEM_FENCE );
6082
6102
6083
6103
plB [0 ] = CurrentOffSetB >=N ?0.0 :B [0 ];
6084
6104
plB [16 ] = CurrentOffSetB + 16 >=N ?0.0 :B [16 * ldb ];
0 commit comments