@@ -674,3 +674,55 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+#blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
+module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
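+  // Nested scf.for loops carry aref tokens as iter_args, and the inner loop's
+  // bounds and step are computed inside the outer loop body. The CHECK lines
+  // below expect the ops producing those control operands (tt.addptr, tt.load,
+  // "lb1", "step1") to end up annotated with partitions {0, 1, 2}, and both
+  // loops to carry per-result ttg.partition.outputs after the rewrite.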
+  // CHECK-LABEL: @for_loop_control_operand_ppg
+  tt.func @for_loop_control_operand_ppg(%lb: i32, %ub: i32, %step: i32, %ptr0: !tt.ptr<i32>) {
+    %true = arith.constant true
+    %arefBuf = ttng.tmem_alloc : () -> !ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+    %aref = nvws.aref.create %arefBuf : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>
+    %_0, %tok = nvws.aref.put.enter %aref : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+    // CHECK: put.enter
+    // CHECK-NEXT: [[RET:%.*]]:5 = scf.for
+    %tok0 = scf.for %iv0 = %lb to %ub step %step iter_args(%tok1 = %tok) -> (!ttg.async.token) : i32 {
+      // CHECK-NEXT: tt.addptr {{.*}} {ttg.partition = array<i32: 0, 1, 2>}
+      // CHECK-NEXT: tt.load {{.*}} {ttg.partition = array<i32: 0, 1, 2>}
+      // CHECK-NEXT: "lb1"({{.*}}) {ttg.partition = array<i32: 0, 1, 2>}
+      // CHECK-NEXT: "step1"({{.*}}) {ttg.partition = array<i32: 0, 1, 2>}
+      %ptrub = tt.addptr %ptr0, %iv0 {ttg.partition = array<i32: 1, 2>} : !tt.ptr<i32>, i32
+      %ub1 = tt.load %ptrub {ttg.partition = array<i32: 1, 2>} : !tt.ptr<i32>
+      %lb1 = "lb1"(%iv0) {ttg.partition = array<i32: 1, 2>} : (i32) -> i32
+      %step1 = "step1"(%iv0) {ttg.partition = array<i32: 1, 2>} : (i32) -> i32
+      // CHECK-NEXT: [[RET1:%.*]]:3 = scf.for
+      %tok5 = scf.for %iv = %lb1 to %ub1 step %step1 iter_args(%tok2 = %tok1) -> (!ttg.async.token) : i32 {
+        %sA = "load1"(%iv) {ttg.partition = array<i32: 1>} : (i32) -> !ttg.memdesc<128x64xf32, #shared, #smem>
+        %sB = "load2"(%iv) {ttg.partition = array<i32: 1>} : (i32) -> !ttg.memdesc<64x128xf32, #shared, #smem>
+        %buf = nvws.aref.buffer %aref, %tok2 {ttg.partition = array<i32: 2>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+        ttng.tc_gen5_mma %sA, %sB, %buf, %true, %true {ttg.partition = array<i32: 2>} : !ttg.memdesc<128x64xf32, #shared, #smem>, !ttg.memdesc<64x128xf32, #shared, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+        scf.yield {ttg.partition = array<i32: 1, 2>} %tok2 : !ttg.async.token
+      } {ttg.partition = array<i32: 1, 2>, ttg.partition.outputs = [array<i32: 2>]}
+      // CHECK: scf.yield
+      // CHECK-NEXT: {ttg.partition = array<i32: 0, 1, 2>, ttg.partition.outputs = [array<i32: 2>, array<i32: 0, 2>, array<i32: 2>]}
+      // CHECK-NEXT: nvws.aref.put.exit {{.*}}[[[RET1]]#1]
+      nvws.aref.put.exit %aref, %tok5 [#nvws.async_op<tc5mma>] {ttg.partition = array<i32: 2>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+      %_1, %token_2 = nvws.aref.get.enter %aref {ttg.partition = array<i32: 1>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+      nvws.aref.get.exit %aref, %token_2 [#nvws.async_op<none>] {ttg.partition = array<i32: 1>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+      %buf1, %tok6 = nvws.aref.put.enter %aref {ttg.partition = array<i32: 2>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+      // CHECK: aref.put.enter
+      // CHECK-NEXT: scf.yield
+      scf.yield {ttg.partition = array<i32: 1, 2>} %tok6 : !ttg.async.token
+      // CHECK-NEXT: {tt.warp_specialize, ttg.partition = array<i32: 0, 1, 2>, ttg.partition.outputs = [array<i32: 2>, array<i32: 0, 2>, array<i32: 2>, array<i32: 0, 1>, array<i32: 0, 1>]}
+    } {tt.warp_specialize, ttg.partition = array<i32: 1, 2>, ttg.partition.outputs = [array<i32: 2>]}
+    // CHECK-NEXT: aref.put.exit {{.*}}[[[RET]]#1]
+    nvws.aref.put.exit %aref, %tok0 [#nvws.async_op<tc5mma>] : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+    %_2, %token_2 = nvws.aref.get.enter %aref : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+    nvws.aref.get.exit %aref, %token_2 [#nvws.async_op<none>] : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+    tt.return
+  }
+}