@@ -23,7 +23,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
2323 %pred: i1 ,
2424 %barrier: !ttg.memdesc <1 xi64 , #shared2 , #ttg.shared_memory >,
2525 %barrierPred: i1 ) {
26- ttng.tc_gen5_mma %a , %b , %c , %useAcc , %pred , %barrier [%barrierPred ] :
26+ ttng.tc_gen5_mma %a , %b , %c , %useAcc , %pred , %barrier [%barrierPred ] { is_async } :
2727 !ttg.memdesc <128 x128 xf16 , #shared , #ttg.shared_memory >,
2828 !ttg.memdesc <128 x128 xf16 , #shared1 , #ttg.shared_memory >,
2929 !ttg.memdesc <128 x128 xf32 , #tmem , #ttng.tensor_memory , mutable >,
@@ -56,7 +56,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
5656 %pred: i1 ,
5757 %barrier: !ttg.memdesc <1 xi64 , #shared2 , #ttg.shared_memory >,
5858 %barrierPred: i1 ) {
59- ttng.tc_gen5_mma %a , %b , %c , %useAcc , %pred , %barrier [%barrierPred ] :
59+ ttng.tc_gen5_mma %a , %b , %c , %useAcc , %pred , %barrier [%barrierPred ] { is_async } :
6060 !ttg.memdesc <128 x16 xf16 , #shared , #ttg.shared_memory >,
6161 !ttg.memdesc <16 x128 xf16 , #shared1 , #ttg.shared_memory >,
6262 !ttg.memdesc <128 x128 xf32 , #tmem , #ttng.tensor_memory , mutable >,
@@ -89,7 +89,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
8989 %pred: i1 ,
9090 %barrier: !ttg.memdesc <1 xi64 , #shared2 , #ttg.shared_memory >,
9191 %barrierPred: i1 ) {
92- ttng.tc_gen5_mma %a , %b , %c , %useAcc , %pred , %barrier [%barrierPred ] :
92+ ttng.tc_gen5_mma %a , %b , %c , %useAcc , %pred , %barrier [%barrierPred ] { is_async } :
9393 !ttg.memdesc <128 x16 xf16 , #shared , #ttg.shared_memory >,
9494 !ttg.memdesc <16 x128 xf16 , #shared1 , #ttg.shared_memory >,
9595 !ttg.memdesc <128 x128 xf32 , #tmem , #ttng.tensor_memory , mutable >,
@@ -219,7 +219,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
219219 %pred: i1 ,
220220 %barrier: !ttg.memdesc <1 xi64 , #shared2 , #ttg.shared_memory , mutable >,
221221 %barrierPred: i1 ) {
222- ttng.tc_gen5_mma_scaled %a , %b , %c , %scale_a , %scale_b , %useAcc , %pred lhs = e4m3 rhs = e2m1 , %barrier [%barrierPred ] :
222+ ttng.tc_gen5_mma_scaled %a , %b , %c , %scale_a , %scale_b , %useAcc , %pred lhs = e4m3 rhs = e2m1 , %barrier [%barrierPred ] { is_async } :
223223 !ttg.memdesc <128 x64 xi8 , #shared , #ttg.shared_memory >,
224224 !ttg.memdesc <32 x128 xi8 , #shared1 , #ttg.shared_memory >,
225225 !ttg.memdesc <128 x128 xf32 , #tmem , #ttng.tensor_memory , mutable >,
@@ -256,7 +256,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
256256 %pred: i1 ,
257257 %barrier: !ttg.memdesc <1 xi64 , #shared2 , #ttg.shared_memory , mutable >,
258258 %barrierPred: i1 ) {
259- ttng.tc_gen5_mma_scaled %a , %b , %c , %scale_a , %scale_b , %useAcc , %pred lhs = e2m1 rhs = e4m3 , %barrier [%barrierPred ] :
259+ ttng.tc_gen5_mma_scaled %a , %b , %c , %scale_a , %scale_b , %useAcc , %pred lhs = e2m1 rhs = e4m3 , %barrier [%barrierPred ] { is_async } :
260260 !ttg.memdesc <128 x64 xi8 , #shared1 , #ttg.shared_memory >,
261261 !ttg.memdesc <128 x128 xi8 , #shared , #ttg.shared_memory >,
262262 !ttg.memdesc <128 x128 xf32 , #tmem , #ttng.tensor_memory , mutable >,
@@ -285,7 +285,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
285285 // CHECK: tcgen05.mma.cta_group::2.kind::f16
286286 // CHECK: tcgen05.mma.cta_group::2.kind::f16
287287 // CHECK: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64
288- ttng.tc_gen5_mma %a , %b , %c , %useAcc , %pred , %barrier [%barrierPred ] {two_ctas } :
288+ ttng.tc_gen5_mma %a , %b , %c , %useAcc , %pred , %barrier [%barrierPred ] {is_async , two_ctas } :
289289 !ttg.memdesc <256 x32 xf16 , #shared , #ttg.shared_memory >,
290290 !ttg.memdesc <32 x128 xf16 , #shared1 , #ttg.shared_memory >,
291291 !ttg.memdesc <256 x128 xf32 , #tmem , #ttng.tensor_memory , mutable >,
@@ -334,7 +334,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
334334 %pred: i1 ,
335335 %barrier: !ttg.memdesc <1 xi64 , #shared2 , #ttg.shared_memory >,
336336 %barrierPred: i1 ) {
337- ttng.tc_gen5_mma_scaled %a , %b , %c , %scale_a , %scale_b , %useAcc , %pred lhs = e2m1 rhs = e2m1 , %barrier [%barrierPred ] :
337+ ttng.tc_gen5_mma_scaled %a , %b , %c , %scale_a , %scale_b , %useAcc , %pred lhs = e2m1 rhs = e2m1 , %barrier [%barrierPred ] { is_async } :
338338 !ttg.memdesc <128 x64 xi8 , #shared , #ttg.shared_memory >,
339339 !ttg.memdesc <64 x256 xi8 , #shared1 , #ttg.shared_memory >,
340340 !ttg.memdesc <128 x256 xf32 , #tmem , #ttng.tensor_memory , mutable >,
@@ -368,7 +368,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
368368 %pred: i1 ,
369369 %barrier: !ttg.memdesc <1 xi64 , #shared2 , #ttg.shared_memory >,
370370 %barrierPred: i1 ) {
371- ttng.tc_gen5_mma_scaled %a , %b , %c , %scale_a , %scale_b , %useAcc , %pred lhs = e2m1 rhs = e2m1 , %barrier [%barrierPred ] :
371+ ttng.tc_gen5_mma_scaled %a , %b , %c , %scale_a , %scale_b , %useAcc , %pred lhs = e2m1 rhs = e2m1 , %barrier [%barrierPred ] { is_async } :
372372 !ttg.memdesc <128 x64 xi8 , #shared , #ttg.shared_memory >,
373373 !ttg.memdesc <64 x256 xi8 , #shared1 , #ttg.shared_memory >,
374374 !ttg.memdesc <128 x256 xf32 , #tmem , #ttng.tensor_memory , mutable >,
@@ -584,7 +584,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
584584 tt.func @tc_gen5_mma_lhs_tmem (%arg0: !ttg.memdesc <128 x32 xf16 , #tmem , #ttng.tensor_memory >, %arg1: !ttg.memdesc <32 x128 xf16 , #shared , #smem >, %arg2: !ttg.memdesc <128 x128 xf32 , #tmem1 , #ttng.tensor_memory , mutable >, %arg3: i1 , %arg4: i1 , %arg5: !ttg.memdesc <1 xi64 , #shared1 , #smem >, %barrierPred: i1 ) {
585585 // CHECK-LABEL: tc_gen5_mma_lhs_tmem
586586 // CHECK: tcgen05.mma.cta_group::1.kind::f16
587- ttng.tc_gen5_mma %arg0 , %arg1 , %arg2 , %arg3 , %arg4 , %arg5 [%barrierPred ] :
587+ ttng.tc_gen5_mma %arg0 , %arg1 , %arg2 , %arg3 , %arg4 , %arg5 [%barrierPred ] { is_async } :
588588 !ttg.memdesc <128 x32 xf16 , #tmem , #ttng.tensor_memory >,
589589 !ttg.memdesc <32 x128 xf16 , #shared , #smem >,
590590 !ttg.memdesc <128 x128 xf32 , #tmem1 , #ttng.tensor_memory , mutable >,
0 commit comments