@@ -1024,6 +1024,7 @@ tt.func @specialize_load_only(%desc: !tt.tensordesc<tensor<128x64xf16, #shared>>
1024
1024
scf.for %i = %c0_i32 to %ub step %c1_i32 : i32 {
1025
1025
// CHECK: wait_barrier {{.*}} {ttg.partition = 0 : i32}
1026
1026
// CHECK-NEXT: local_load {{.*}} {ttg.partition = 0 : i32}
1027
+ // CHECK-NEXT: fence_async_shared {{.*}}partition = 0
1027
1028
// CHECK-NEXT: arrive_barrier {{.*}} {ttg.partition = 0 : i32}
1028
1029
%val = tt.descriptor_load %desc[%i, %i] : !tt.tensordesc<tensor<128x64xf16, #shared>> -> tensor<128x64xf16, #oper_layout>
1029
1030
"use"(%val) : (tensor<128x64xf16, #oper_layout>) -> ()
@@ -1078,6 +1079,7 @@ tt.func @specialize_mma_only(%rhs_desc: !tt.tensordesc<tensor<64x128xf16, #share
1078
1079
// CHECK-NEXT: [[LOADED:%.*]], %{{.*}} = ttng.tmem_load [[ACC_TMEM:%.*]][]
1079
1080
// CHECK: wait_barrier
1080
1081
// CHECK-NEXT: local_load
1082
+ // CHECK-NEXT: fence_async_shared {{.*}}partition = 0
1081
1083
// CHECK-NEXT: arrive_barrier
1082
1084
// CHECK-NEXT: [[RESULTS:%.*]]:2 = "some_producer"
1083
1085
%rhs_reg, %next_acc = "some_producer"(%loaded, %acc) : (tensor<64x128xf16, #oper_layout>, tensor<128x128xf32, #acc_layout>) -> (tensor<64x128xf16, #oper_layout>, tensor<128x128xf32, #acc_layout>)
@@ -1187,6 +1189,7 @@ tt.func @store_mma_load(
1187
1189
1188
1190
// CHECK-NEXT: wait_barrier [[LOAD_READY_BAR]], {{.*}}partition = 0
1189
1191
// CHECK-NEXT: [[LHS:%.*]] = ttg.local_load [[LOAD_BUF]] {ttg.partition = 0 : i32}
1192
+ // CHECK-NEXT: fence_async_shared {{.*}}partition = 0
1190
1193
// CHECK-NEXT: arrive_barrier [[LOAD_EMPTY_BAR]], {{.*}}partition = 0
1191
1194
// CHECK-NEXT: [[LHS_OP:%.*]] = arith.addf [[LHS]], [[LHS]] {ttg.partition = 0 : i32}
1192
1195
// CHECK-NEXT: local_store [[LHS_OP]], [[LHS_SHARED]] {ttg.partition = 0 : i32}
@@ -1234,6 +1237,7 @@ tt.func @local_alloc_into_mma(
1234
1237
1235
1238
// CHECK: wait_barrier [[LOAD_READY_BAR]], {{.*}}partition = 0
1236
1239
// CHECK-NEXT: [[RHS_REG:%.*]] = ttg.local_load {{.*}}partition = 0
1240
+ // CHECK-NEXT: fence_async_shared {{.*}}partition = 0
1237
1241
// CHECK-NEXT: arrive_barrier
1238
1242
// CHECK-NEXT: [[RHS_REG_MOD:%.*]] = arith.addf [[RHS_REG]], [[RHS_REG]] {ttg.partition = 0 : i32}
1239
1243
// CHECK-NEXT: wait_barrier [[MMA_OPER_BAR:%.*]], %arg{{.*}}partition = 0
0 commit comments