@@ -172,54 +172,27 @@ tt.func @unsupported_load() {
172172 // CHECK-NEXT: [[DONE_MBAR0:%.*]] = ttg.memdesc_subview [[DONE_MBAR]][%c0_i32]
173173 // CHECK-NEXT: ttng.init_barrier [[DONE_MBAR0]], 1
174174
175- // CHECK-NEXT: [[A_SHARED:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<128x64xf16,
176- // CHECK-NEXT: [[B_SHARED:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<64x128xf16,
177-
178- // CHECK-NEXT: [[OPER_EMPTY_MBAR:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<1xi64
179- // CHECK-NEXT: [[OPER_EMPTY_MBAR0:%.*]] = ttg.memdesc_subview [[OPER_EMPTY_MBAR]][%c0_i32]
180- // CHECK-NEXT: init_barrier [[OPER_EMPTY_MBAR0]], 1
181-
182- // CHECK-NEXT: [[OPER_READY_MBAR:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<1xi64
183- // CHECK-NEXT: [[OPER_READY_MBAR0:%.*]] = ttg.memdesc_subview [[OPER_READY_MBAR]][%c0_i32]
184- // CHECK-NEXT: init_barrier [[OPER_READY_MBAR0]], 1
185-
186- // CHECK-NEXT: arrive_barrier [[OPER_EMPTY_MBAR]], 1
187-
188175 // CHECK-NEXT: scf.for
189176 scf.for %k = %c0_i32 to %k_tiles step %c1_i32 iter_args (%acc = %zero ) -> tensor <128 x128 xf32 , #acc_layout > : i32 {
190177 // CHECK-NEXT: get_ptrs
191178 %a_ptrs , %b_ptrs = " get_ptrs" (%k ) : (i32 ) -> (tensor <128 x64 x!tt.ptr <f16 >, #oper_layout >, tensor <64 x128 x!tt.ptr <f16 >, #oper_layout >)
192- // CHECK-NEXT: [[A:%.*]] = tt.load
193179 %a = tt.load %a_ptrs : tensor <128 x64 x!tt.ptr <f16 >, #oper_layout >
194- // CHECK-NEXT: [[B:%.*]] = tt.load
195180 %b = tt.load %b_ptrs : tensor <64 x128 x!tt.ptr <f16 >, #oper_layout >
196181
197- // CHECK-NEXT: wait_barrier [[OPER_EMPTY_MBAR]]
198- // CHECK-NEXT: local_store [[A]], [[A_SHARED]]
199182 %a_shared = ttg.local_alloc %a : (tensor <128 x64 xf16 , #oper_layout >) -> !ttg.memdesc <128 x64 xf16 , #shared , #smem >
200- // CHECK-NEXT: local_store [[B]], [[B_SHARED]]
201183 %b_shared = ttg.local_alloc %b : (tensor <64 x128 xf16 , #oper_layout >) -> !ttg.memdesc <64 x128 xf16 , #shared , #smem >
202- // CHECK-NEXT: arrive_barrier [[OPER_READY_MBAR]], 1
203184
204185 %c_tmem , %c_tok = ttng.tmem_alloc %acc : (tensor <128 x128 xf32 , #acc_layout >) -> (!ttg.memdesc <128 x128 xf32 , #acc_tmem , #ttng.tensor_memory , mutable >, !ttg.async.token )
205- // CHECK-NEXT: [[IS_LAST:%.*]] = arith.cmpi eq, %{{.*}}, %c31_i32
206- // CHECK-NEXT: wait_barrier [[OPER_READY_MBAR]]
207- // CHECK-NEXT: ttng.tc_gen5_mma %{{.*}}, [[ACC]][], %true, %true, [[DONE_MBAR0]][[[IS_LAST]]], [[OPER_EMPTY_MBAR]][%true] {ttg.partition = 1 : i32}
186+ // CHECK: [[IS_LAST:%.*]] = arith.cmpi eq, %{{.*}}, %c31_i32
187+ // CHECK-NEXT: ttng.tc_gen5_mma %{{.*}}, [[ACC]][], %true, %true, [[DONE_MBAR0]][[[IS_LAST]]] {ttg.partition = 1 : i32}
208188 %mma_tok = ttng.tc_gen5_mma %a_shared , %b_shared , %c_tmem [%c_tok ], %true , %true : !ttg.memdesc <128 x64 xf16 , #shared , #smem >, !ttg.memdesc <64 x128 xf16 , #shared , #smem >, !ttg.memdesc <128 x128 xf32 , #acc_tmem , #ttng.tensor_memory , mutable >
209189 %c , %load_tok = ttng.tmem_load %c_tmem [%mma_tok ] : !ttg.memdesc <128 x128 xf32 , #acc_tmem , #ttng.tensor_memory , mutable > -> tensor <128 x128 xf32 , #acc_layout >
210190
211- // CHECK-NEXT: [[NEXT_PHASE:%.*]] = arith.xori
212- // CHECK-NEXT: yield [[NEXT_PHASE]]
213-
214191 scf.yield %c : tensor <128 x128 xf32 , #acc_layout >
215- // CHECK-NEXT : ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32]
192+ // CHECK: ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32]
216193 } {tt.warp_specialize }
217194
218195 // CHECK-NEXT: ttng.wait_barrier [[DONE_MBAR0]], %c0_i32
219- // CHECK-NEXT: ttng.inval_barrier [[OPER_READY_MBAR0]]
220- // CHECK-NEXT: ttg.local_dealloc [[OPER_READY_MBAR]]
221- // CHECK-NEXT: ttng.inval_barrier [[OPER_EMPTY_MBAR0]]
222- // CHECK-NEXT: ttg.local_dealloc [[OPER_EMPTY_MBAR]]
223196 // CHECK-NEXT: ttng.inval_barrier [[DONE_MBAR0]]
224197 // CHECK-NEXT: ttg.local_dealloc [[DONE_MBAR]]
225198
@@ -749,7 +722,7 @@ tt.func @matmul_tma_acc_with_conditional_def_and_use_no_multibuf_flag(
749722 %b_shared = ttg.local_alloc %b : (tensor <64 x128 xf16 , #oper_layout >) -> !ttg.memdesc <64 x128 xf16 , #shared , #smem >
750723 %c_tmem , %c_tok = ttng.tmem_alloc %acc : (tensor <128 x128 xf32 , #acc_layout >) -> (!ttg.memdesc <128 x128 xf32 , #acc_tmem , #ttng.tensor_memory , mutable >, !ttg.async.token )
751724
752- // CHECK-NEXT: [[DO_EPILOGUE:%.*]] = arith.cmpi eq, [[K:%.*]], %c0_i32
725+ // CHECK-NEXT: [[DO_EPILOGUE:%.*]] = arith.cmpi eq, [[K:%.*]], %c0_i32 : i32
753726 // CHECK-NEXT: [[MMA_TOK:%.*]] = ttng.tc_gen5_mma %{{[0-9]+}}, %{{[0-9]+}}, [[ACC_BUF]][], [[FLAG]], %true, {{.*}}, [[ACC_READY_BUF0]][[[DO_EPILOGUE]]] {ttg.partition = 1 : i32}
754727 %mma_tok = ttng.tc_gen5_mma %a_shared , %b_shared , %c_tmem [%c_tok ], %flag , %true : !ttg.memdesc <128 x64 xf16 , #shared , #smem >, !ttg.memdesc <64 x128 xf16 , #shared , #smem >, !ttg.memdesc <128 x128 xf32 , #acc_tmem , #ttng.tensor_memory , mutable >
755728 %c , %load_tok = ttng.tmem_load %c_tmem [%mma_tok ] : !ttg.memdesc <128 x128 xf32 , #acc_tmem , #ttng.tensor_memory , mutable > -> tensor <128 x128 xf32 , #acc_layout >
0 commit comments