|
4 | 4 | #smem = #ttg.shared_memory
|
5 | 5 | #tmem = #ttng.tensor_memory
|
6 | 6 | module attributes {"ttg.target" = "cuda:0", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
|
7 |
| - //CHECK: tt.func @aref_get_put |
8 |
| - // CHECK-NEXT: [[ZERO:%.*]] = arith.constant 0 : i32 |
9 |
| - // CHECK-NEXT: [[ONE:%.*]] = arith.constant 1 : i32 |
10 |
| - // CHECK-NEXT: [[EMPTY:%.*]] = ttg.local_alloc {aref_empty_mbarriers} |
11 |
| - // CHECK-NEXT: [[FULL:%.*]] = ttg.local_alloc {aref_full_mbarriers} |
12 |
| - // CHECK-NEXT: scf.for |
13 |
| - // CHECK-NEXT: [[EMPTYSLICE:%.*]] = ttg.memdesc_subview [[EMPTY]] |
14 |
| - // CHECK-NEXT: ttng.init_barrier [[EMPTYSLICE]], 0 |
15 |
| - // CHECK-NEXT: [[FULLSLICE:%.*]] = ttg.memdesc_subview [[FULL]] |
16 |
| - // CHECK-NEXT: ttng.init_barrier [[FULLSLICE]], 1 |
17 |
| - // CHECK-NEXT: } |
18 |
| - // CHECK-NEXT: [[EMPTYSLICE2:%.*]] = ttg.memdesc_subview [[EMPTY]] |
19 |
| - // CHECK-NEXT: ttng.wait_barrier [[EMPTYSLICE2]], [[ONE]] |
20 |
| - // CHECK-NEXT: [[A:%.*]] = ttg.memdesc_subview %arg0 |
21 |
| - // CHECK-NEXT: [[B:%.*]] = ttg.memdesc_subview %arg1 |
22 |
| - // CHECK-NEXT: "foo"([[A]], [[B]]) |
23 |
| - // CHECK-NEXT: [[FULLSLICE2:%.*]] = ttg.memdesc_subview [[FULL]] |
24 |
| - // CHECK-NEXT: ttng.arrive_barrier [[FULLSLICE2]], 1 |
25 |
| - // CHECK-NEXT: [[FULLSLICE3:%.*]] = ttg.memdesc_subview [[FULL]] |
26 |
| - // CHECK-NEXT: ttng.wait_barrier [[FULLSLICE3]], [[ZERO]] |
27 |
| - // CHECK-NEXT: [[AA:%.*]] = ttg.memdesc_subview %arg0 |
28 |
| - // CHECK-NEXT: [[BB:%.*]] = ttg.memdesc_subview %arg1 |
29 |
| - // CHECK-NEXT: "bar"([[AA]], [[BB]]) |
30 |
| - // CHECK-NEXT: [[EMPTYSLICE3:%.*]] = ttg.memdesc_subview [[EMPTY]] |
31 |
| - // CHECK-NEXT: ttng.arrive_barrier [[EMPTYSLICE3]], |
32 |
| - // CHECK-NEXT: tt.return |
33 |
| - // CHECK-NEXT: } |
34 |
| - tt.func @aref_get_put(%d : !ttg.memdesc<1x64x16xf16, #shared0, #tmem>, %e : !ttg.memdesc<1x16x32xf16, #shared0, #smem>) { |
| 7 | + // CHECK-LABEL: @aref_lowering |
| 8 | + tt.func @aref_lowering(%d : !ttg.memdesc<3x64x16xf16, #shared0, #tmem>, |
| 9 | + %e : !ttg.memdesc<3x16x32xf16, #shared0, #smem>, |
| 10 | + %cond : i1) { |
35 | 11 | %c0_i32 = arith.constant 0 : i32
|
36 | 12 | %c1_i32 = arith.constant 1 : i32
|
37 |
| - %0 = nvws.aref.create %d, %e : !nvws.aref<[!ttg.memdesc<1x64x16xf16, #shared0, #tmem>, !ttg.memdesc<1x16x32xf16, #shared0, #smem>]> |
38 |
| - %1:2 = nvws.aref.put.enter %0[%c0_i32, %c1_i32] : !nvws.aref<[!ttg.memdesc<1x64x16xf16, #shared0, #tmem>, !ttg.memdesc<1x16x32xf16, #shared0, #smem>]> -> !ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem> |
39 |
| - "foo"(%1#0, %1#1) : (!ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem>) -> () |
40 |
| - nvws.aref.put.exit %0[%c0_i32] : !nvws.aref<[!ttg.memdesc<1x64x16xf16, #shared0, #tmem>, !ttg.memdesc<1x16x32xf16, #shared0, #smem>]> |
41 |
| - %2:2 = nvws.aref.get.enter %0[%c0_i32, %c0_i32] : !nvws.aref<[!ttg.memdesc<1x64x16xf16, #shared0, #tmem>, !ttg.memdesc<1x16x32xf16, #shared0, #smem>]> -> !ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem> |
42 |
| - "bar"(%2#0, %2#1) : (!ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem>) -> () |
43 |
| - nvws.aref.get.exit %0[%c0_i32] : !nvws.aref<[!ttg.memdesc<1x64x16xf16, #shared0, #tmem>, !ttg.memdesc<1x16x32xf16, #shared0, #smem>]> |
| 13 | + %lb = arith.constant 0 : i32 |
| 14 | + // CHECK: [[C3:%.*]] = arith.constant 3 : i32 |
| 15 | + // CHECK: [[C0:%.*]] = arith.constant 0 : i32 |
| 16 | + // CHECK: [[C1:%.*]] = arith.constant 1 : i32 |
| 17 | + %ub = arith.constant 4 : i32 |
| 18 | + |
| 19 | + // CHECK: [[EMPTY0:%.*]] = ttg.local_alloc |
| 20 | + // CHECK-NEXT: [[FULL0:%.*]] = ttg.local_alloc |
| 21 | + // CHECK-NEXT: scf.for |
| 22 | + // CHECK-NEXT: [[EMPTYSLICE:%.*]] = ttg.memdesc_subview [[EMPTY0]] |
| 23 | + // CHECK-NEXT: ttng.init_barrier [[EMPTYSLICE]], 1 |
| 24 | + // CHECK-NEXT: [[FULLSLICE:%.*]] = ttg.memdesc_subview [[FULL0]] |
| 25 | + // CHECK-NEXT: ttng.init_barrier [[FULLSLICE]], 129 |
| 26 | + // CHECK-NEXT: } |
| 27 | + %aref0 = nvws.aref.create %d, %e : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> |
| 28 | + |
| 29 | + // CHECK: [[EMPTY1:%.*]] = ttg.local_alloc |
| 30 | + // CHECK-NEXT: [[FULL1:%.*]] = ttg.local_alloc |
| 31 | + // CHECK-NEXT: scf.for |
| 32 | + // CHECK-NEXT: [[EMPTYSLICE:%.*]] = ttg.memdesc_subview [[EMPTY1]] |
| 33 | + // CHECK-NEXT: ttng.init_barrier [[EMPTYSLICE]], 256 |
| 34 | + // CHECK-NEXT: [[FULLSLICE:%.*]] = ttg.memdesc_subview [[FULL1]] |
| 35 | + // CHECK-NEXT: ttng.init_barrier [[FULLSLICE]], 128 |
| 36 | + // CHECK-NEXT: } |
| 37 | + %aref1 = nvws.aref.create %d, %e : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> |
| 38 | + |
| 39 | + nvws.warp_group |
| 40 | + partition0 num_warps(4) { |
| 41 | + // CHECK: [[IDX:%.*]]:4 = scf.for [[I:%.*]] = [[LB:%.*]] to [[UB:%.*]] step [[C1:%.*]] iter_args([[IDX0:%.*]] = [[C0]], [[IDX1:%.*]] = [[C0]], [[IDX2:%.*]] = [[C0]], [[IDX3:%.*]] = [[C0]]) |
| 42 | + scf.for %i = %lb to %ub step %c1_i32 : i32{ |
| 43 | + |
| 44 | + // CHECK-NEXT: [[EMPTYIDX:%.*]] = arith.remsi [[IDX0]], [[C3]] |
| 45 | + // CHECK-NEXT: [[EMPTYMBAR:%.*]] = ttg.memdesc_subview [[EMPTY0]][[[EMPTYIDX]]] |
| 46 | + // CHECK-NEXT: [[PHASE_DIV:%.*]] = arith.divsi [[IDX0]], [[C3]] |
| 47 | + // CHECK-NEXT: [[PHASE_AND:%.*]] = arith.andi [[PHASE_DIV]], [[C1]] |
| 48 | + // CHECK-NEXT: [[PHASE_XOR:%.*]] = arith.xori [[PHASE_AND]], [[C1]] |
| 49 | + // CHECK-NEXT: ttng.wait_barrier [[EMPTYMBAR]], [[PHASE_XOR]] |
| 50 | + %1:2 = nvws.aref.put.enter %aref0[%c0_i32] {aref_tag = "put0"} : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> -> !ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem> |
| 51 | + |
| 52 | + // CHECK-NEXT: [[STAGE:%.*]] = arith.remsi [[IDX0]], [[C3]] |
| 53 | + // CHECK-NEXT: [[BUFA:%.*]] = ttg.memdesc_subview %arg0[[[STAGE]],{{.*}},{{.*}}] |
| 54 | + // CHECK-NEXT: [[BUFB:%.*]] = ttg.memdesc_subview %arg1[[[STAGE]],{{.*}},{{.*}}] |
| 55 | + // CHECK-NEXT: [[FULLIDX:%.*]] = arith.remsi [[IDX2]], [[C3]] |
| 56 | + // CHECK-NEXT: [[FULLMBAR:%.*]] = ttg.memdesc_subview [[FULL0]][[[FULLIDX]]] |
| 57 | + // CHECK-NEXT: ttng.barrier_expect [[FULLMBAR]], 0 |
| 58 | + // CHECK-NEXT: [[IDX0a:%.*]] = arith.addi [[IDX0]], [[C1]] |
| 59 | + // CHECK-NEXT: "tma_load"([[BUFA]]) |
| 60 | + // CHECK-NEXT: "cp_async"([[BUFB]]) |
| 61 | + "tma_load"(%1#0) : (!ttg.memdesc<64x16xf16, #shared0, #tmem>) -> () |
| 62 | + "cp_async"(%1#1) : (!ttg.memdesc<16x32xf16, #shared0, #smem>) -> () |
| 63 | + |
| 64 | + // CHECK-NEXT: [[FULLIDX:%.*]] = arith.remsi [[IDX2]], [[C3]] |
| 65 | + // CHECK-NEXT: [[FULLMBAR:%.*]] = ttg.memdesc_subview [[FULL0]][[[FULLIDX]]] |
| 66 | + // CHECK-NEXT: nvws.async_complete [[FULLMBAR]], async_op = <tma_load> |
| 67 | + // CHECK-NEXT: nvws.async_complete [[FULLMBAR]], async_op = <cp_async> |
| 68 | + // CHECK-NEXT: [[IDX2a:%.*]] = arith.addi [[IDX2]], [[C1]] |
| 69 | + nvws.aref.put.exit %aref0[%c0_i32] [#nvws.async_op<tma_load>, #nvws.async_op<cp_async>] {aref_tag = "put0"} : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> |
| 70 | + |
| 71 | + // CHECK-NEXT: [[IDX13:%.*]]:2 = scf.if |
| 72 | + scf.if %cond { |
| 73 | + |
| 74 | + // CHECK: arith.remsi [[IDX1]], [[C3]] |
| 75 | + // CHECK: arith.divsi [[IDX1]], [[C3]] |
| 76 | + // CHECK-NEXT: arith.andi {{.*}}, [[C1]] |
| 77 | + // CHECK-NEXT: arith.xori |
| 78 | + // CHECK-NEXT: ttng.wait_barrier |
| 79 | + // CHECK: [[IDX1a:%.*]] = arith.addi [[IDX1]], [[C1]] |
| 80 | + %2:2 = nvws.aref.put.enter %aref1[%c0_i32] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> -> !ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem> |
| 81 | + "tmem_store"(%2#0, %2#1) : (!ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem>) -> () |
| 82 | + |
| 83 | + // CHECK: arith.remsi [[IDX3]], [[C3]] |
| 84 | + // CHECK: [[IDX3a:%.*]] = arith.addi [[IDX3]], [[C1]] |
| 85 | + nvws.aref.put.exit %aref1[%c0_i32] [#nvws.async_op<none>] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> |
| 86 | + |
| 87 | + // CHECK: scf.yield [[IDX1a]], [[IDX3a]] |
| 88 | + } |
| 89 | + // CHECK-NEXT: } else { |
| 90 | + // CHECK-NEXT: scf.yield [[IDX1]], [[IDX3]] |
| 91 | + // CHECK-NEXT: } |
| 92 | + |
| 93 | + // CHECK: scf.yield [[IDX0a]], [[IDX13]]#0, [[IDX2a]], [[IDX13]]#1 |
| 94 | + } |
| 95 | + |
| 96 | + // CHECK: [[IDX1:%.*]]:2 = scf.if |
| 97 | + scf.if %cond { |
| 98 | + |
| 99 | + // CHECK: arith.remsi [[IDX]]#0, [[C3]] |
| 100 | + // CHECK: arith.divsi [[IDX]]#0, [[C3]] |
| 101 | + // CHECK: [[IDX0a:%.*]] = arith.addi [[IDX]]#0, [[C1]] |
| 102 | + %1:2 = nvws.aref.put.enter %aref0[%c0_i32] {aref_tag = "put1"} : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> -> !ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem> |
| 103 | + "tma_load"(%1#0) : (!ttg.memdesc<64x16xf16, #shared0, #tmem>) -> () |
| 104 | + "cp_async"(%1#1) : (!ttg.memdesc<16x32xf16, #shared0, #smem>) -> () |
| 105 | + |
| 106 | + // CHECK: arith.remsi [[IDX]]#2, [[C3]] |
| 107 | + // CHECK: [[IDX2a:%.*]] = arith.addi [[IDX]]#2, [[C1]] |
| 108 | + nvws.aref.put.exit %aref0[%c0_i32] [#nvws.async_op<tma_load>, #nvws.async_op<cp_async>] {aref_tag = "put1"} : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> |
| 109 | + } |
| 110 | + |
| 111 | + // CHECK: arith.remsi [[IDX]]#1, [[C3]] |
| 112 | + // CHECK: arith.divsi [[IDX]]#1, [[C3]] |
| 113 | + %1:2 = nvws.aref.put.enter %aref1[%c0_i32] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> -> !ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem> |
| 114 | + "tmem_store"(%1#0, %1#1) : (!ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem>) -> () |
| 115 | + // CHECK: arith.remsi [[IDX]]#3, [[C3]] |
| 116 | + nvws.aref.put.exit %aref1[%c0_i32] [#nvws.async_op<none>] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> |
| 117 | + nvws.warp_group.return |
| 118 | + } |
| 119 | + partition1 num_warps(8) { |
| 120 | + // CHECK: [[IDX:%.*]]:4 = scf.for [[I:%.*]] = [[LB:%.*]] to [[UB:%.*]] step [[C1:%.*]] iter_args([[IDX0:%.*]] = [[C0]], [[IDX1:%.*]] = [[C0]], [[IDX2:%.*]] = [[C0]], [[IDX3:%.*]] = [[C0]]) |
| 121 | + scf.for %i = %lb to %ub step %c1_i32 : i32{ |
| 122 | + |
| 123 | + // CHECK-NEXT: [[FULLIDX:%.*]] = arith.remsi [[IDX0]], [[C3]] |
| 124 | + // CHECK-NEXT: [[FULLMBAR:%.*]] = ttg.memdesc_subview [[FULL0]][[[FULLIDX]]] |
| 125 | + // CHECK-NEXT: [[PHASE_DIV:%.*]] = arith.divsi [[IDX0]], [[C3]] |
| 126 | + // CHECK-NEXT: [[PHASE_AND:%.*]] = arith.andi [[PHASE_DIV]], [[C1]] |
| 127 | + // CHECK-NEXT: ttng.wait_barrier [[FULLMBAR]], [[PHASE_AND]] |
| 128 | + %2:2 = nvws.aref.get.enter %aref0[%c0_i32] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> -> !ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem> |
| 129 | + |
| 130 | + // CHECK-NEXT: [[STAGE:%.*]] = arith.remsi [[IDX0]], [[C3]] |
| 131 | + // CHECK-NEXT: [[BUFA:%.*]] = ttg.memdesc_subview %arg0[[[STAGE]],{{.*}},{{.*}}] |
| 132 | + // CHECK-NEXT: [[BUFB:%.*]] = ttg.memdesc_subview %arg1[[[STAGE]],{{.*}},{{.*}}] |
| 133 | + // CHECK-NEXT: arith.addi |
| 134 | + // CHECK-NEXT: "tc5mma"([[BUFA]], [[BUFB]]) |
| 135 | + "tc5mma"(%2#0, %2#1) : (!ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem>) -> () |
| 136 | + |
| 137 | + // CHECK-NEXT: [[EMPTYIDX:%.*]] = arith.remsi [[IDX2]], [[C3]] |
| 138 | + // CHECK-NEXT: [[EMPTYMBAR:%.*]] = ttg.memdesc_subview [[EMPTY0]][[[EMPTYIDX]]] |
| 139 | + // CHECK-NEXT: nvws.async_complete [[EMPTYMBAR]], async_op = <tc5mma> |
| 140 | + // CHECK-NEXT: arith.addi |
| 141 | + nvws.aref.get.exit %aref0[%c0_i32] [#nvws.async_op<tc5mma>] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> |
| 142 | + |
| 143 | + // CHECK: [[IDX13:%.*]]:2 = scf.if |
| 144 | + scf.if %cond { |
| 145 | + // CHECK: arith.remsi [[IDX1]], [[C3]] |
| 146 | + // CHECK: arith.divsi [[IDX1]], [[C3]] |
| 147 | + // CHECK-NEXT: arith.andi {{.*}}, [[C1]] |
| 148 | + // CHECK-NEXT: ttng.wait_barrier |
| 149 | + %3:2 = nvws.aref.get.enter %aref1[%c0_i32] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> -> !ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem> |
| 150 | + "tmem_load"(%3#0, %3#1) : (!ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem>) -> () |
| 151 | + |
| 152 | + // CHECK: arith.remsi [[IDX3]], [[C3]] |
| 153 | + // CHECK-NEXT: ttg.memdesc_subview |
| 154 | + // CHECK-NEXT: nvws.async_complete {{.*}}, async_op = <none> |
| 155 | + nvws.aref.get.exit %aref1[%c0_i32] [#nvws.async_op<none>] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> |
| 156 | + } |
| 157 | + // CHECK: } else { |
| 158 | + // CHECK-NEXT: scf.yield [[IDX1]], [[IDX3]] |
| 159 | + // CHECK-NEXT: } |
| 160 | + |
| 161 | + // CHECK: scf.yield {{.*}}, [[IDX13]]#0, {{.*}}, [[IDX13]]#1 |
| 162 | + } |
| 163 | + scf.if %cond { |
| 164 | + // CHECK: arith.remsi [[IDX]]#0, [[C3]] |
| 165 | + // CHECK: arith.divsi [[IDX]]#0, [[C3]] |
| 166 | + // CHECK-NEXT: arith.andi {{.*}}, [[C1]] |
| 167 | + // CHECK-NEXT: ttng.wait_barrier |
| 168 | + %2:2 = nvws.aref.get.enter %aref0[%c0_i32] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> -> !ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem> |
| 169 | + "tc5mma"(%2#0, %2#1) : (!ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem>) -> () |
| 170 | + |
| 171 | + // CHECK: arith.remsi [[IDX]]#2, [[C3]] |
| 172 | + nvws.aref.get.exit %aref0[%c0_i32] [#nvws.async_op<tc5mma>] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> |
| 173 | + } |
| 174 | + // CHECK: } else { |
| 175 | + // CHECK-NEXT: scf.yield [[IDX]]#0, [[IDX]]#2 |
| 176 | + // CHECK-NEXT: } |
| 177 | + |
| 178 | + %2:2 = nvws.aref.get.enter %aref1[%c0_i32] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> -> !ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem> |
| 179 | + "tmem_load"(%2#0, %2#1) : (!ttg.memdesc<64x16xf16, #shared0, #tmem>, !ttg.memdesc<16x32xf16, #shared0, #smem>) -> () |
| 180 | + nvws.aref.get.exit %aref1[%c0_i32] [#nvws.async_op<none>] : !nvws.aref<[!ttg.memdesc<3x64x16xf16, #shared0, #tmem>, !ttg.memdesc<3x16x32xf16, #shared0, #smem>]> |
| 181 | + nvws.warp_group.return |
| 182 | + } |
44 | 183 | tt.return
|
45 | 184 | }
|
46 | 185 | }
|
0 commit comments