@@ -18,10 +18,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
1818 %3 = ttg.async_commit_group tokens %2
1919
2020 // Do not wait on the second async_copy => waitcnt 2
21- // CHECK: ttg .async_wait {{.*}} {num = 2
21+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 2
2222 %9 = ttg.async_wait %1 {num = 0 : i32 }
2323 // No async_copies in between => waitcnt 0
24- // CHECK: ttg .async_wait {{.*}} {num = 0
24+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 0
2525 %10 = ttg.async_wait %3 {num = 0 : i32 }
2626 tt.return
2727 }
@@ -47,10 +47,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
4747 %3 = ttg.async_commit_group tokens %2
4848
4949 // Waiting on %3: no async_copies in between => waitcnt 0
50- // CHECK: ttg .async_wait {{.*}} {num = 0
50+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 0
5151 %9 = ttg.async_wait %3 {num = 0 : i32 }
5252 // Waiting on %1 only: do not wait on the second async_copy => waitcnt 2
53- // CHECK: ttg .async_wait {{.*}} {num = 2
53+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 2
5454 %10 = ttg.async_wait %1 {num = 0 : i32 }
5555 tt.return
5656 }
@@ -77,9 +77,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
7777
7878 %4 = tt.load %arg3 : tensor <128 x16 x!tt.ptr <f16 >, #blocked >
7979
80- // CHECK: ttg .async_wait {{.*}} {num = 2
80+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 2
8181 %9 = ttg.async_wait %1 {num = 0 : i32 }
82- // CHECK: ttg .async_wait {{.*}} {num = 0
82+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 0
8383 %10 = ttg.async_wait %3 {num = 0 : i32 }
8484 tt.return
8585 }
@@ -106,15 +106,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
106106 %2 = ttg.async_copy_global_to_local %arg4 , %arg2 : tensor <16 x256 x!tt.ptr <f16 >, #blocked1 > -> <16 x256 xf16 , #shared1 , #smem , mutable >
107107 %3 = ttg.async_commit_group tokens %2
108108 %8:2 = scf.for %arg14 = %c0_i32 to %arg0 step %c1_i32 iter_args (%arg15 = %1 , %arg16 = %3 ) -> (!ttg.async.token , !ttg.async.token ) : i32 {
109- // CHECK: ttg .async_wait {{.*}}, {{.*}} {num = 0
109+ // CHECK: amdgpu .async_wait {{.*}}, {{.*}} {num_inst = 0
110110 %10 = ttg.async_wait %arg15 , %arg16 {num = 2 : i32 }
111111 %11 = ttg.async_copy_global_to_local %arg3 , %arg1 : tensor <128 x16 x!tt.ptr <f16 >, #blocked > -> <128 x16 xf16 , #shared , #smem , mutable >
112112 %12 = ttg.async_commit_group tokens %11
113113 %13 = ttg.async_copy_global_to_local %arg4 , %arg2 : tensor <16 x256 x!tt.ptr <f16 >, #blocked1 > -> <16 x256 xf16 , #shared1 , #smem , mutable >
114114 %14 = ttg.async_commit_group tokens %13
115115 scf.yield %12 , %14: !ttg.async.token , !ttg.async.token
116116 }
117- // CHECK: ttg .async_wait {{.*}}, {{.*}} {num = 0
117+ // CHECK: amdgpu .async_wait {{.*}}, {{.*}} {num_inst = 0
118118 %9 = ttg.async_wait %8#0 , %8#1 {num = 0 : i32 }
119119 tt.return
120120 }
@@ -145,15 +145,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
145145 %6 = ttg.async_copy_global_to_local %arg4 , %arg2 : tensor <16 x256 x!tt.ptr <f16 >, #blocked1 > -> <16 x256 xf16 , #shared1 , #smem , mutable >
146146 %7 = ttg.async_commit_group tokens %6
147147 %8:4 = scf.for %arg14 = %c0_i32 to %arg0 step %c1_i32 iter_args (%arg15 = %1 , %arg16 = %5 , %arg17 = %3 , %arg18 = %7 ) -> (!ttg.async.token , !ttg.async.token , !ttg.async.token , !ttg.async.token ) : i32 {
148- // CHECK: ttg .async_wait {{.*}}, {{.*}} {num = 3
148+ // CHECK: amdgpu .async_wait {{.*}}, {{.*}} {num_inst = 3
149149 %10 = ttg.async_wait %arg15 , %arg17 {num = 2 : i32 }
150150 %11 = ttg.async_copy_global_to_local %arg3 , %arg1 : tensor <128 x16 x!tt.ptr <f16 >, #blocked > -> <128 x16 xf16 , #shared , #smem , mutable >
151151 %12 = ttg.async_commit_group tokens %11
152152 %13 = ttg.async_copy_global_to_local %arg4 , %arg2 : tensor <16 x256 x!tt.ptr <f16 >, #blocked1 > -> <16 x256 xf16 , #shared1 , #smem , mutable >
153153 %14 = ttg.async_commit_group tokens %13
154154 scf.yield %arg16 , %12 , %arg18 , %14 : !ttg.async.token , !ttg.async.token , !ttg.async.token , !ttg.async.token
155155 }
156- // CHECK: ttg .async_wait {{.*}}, {{.*}} {num = 0
156+ // CHECK: amdgpu .async_wait {{.*}}, {{.*}} {num_inst = 0
157157 %9 = ttg.async_wait %8#0 , %8#1 , %8#2 , %8#3 {num = 0 : i32 }
158158 tt.return
159159 }
@@ -185,12 +185,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
185185 %8:4 = scf.for %arg14 = %c0_i32 to %arg0 step %c1_i32 iter_args (%arg15 = %1 , %arg16 = %5 , %arg17 = %3 , %arg18 = %7 ) -> (!ttg.async.token , !ttg.async.token , !ttg.async.token , !ttg.async.token ) : i32 {
186186 %103 = scf.if %cond -> (!ttg.async.token ) {
187187 // We wait on both tokens so we interleave with one iteration => 3
188- // CHECK: ttg .async_wait {{.*}}, {{.*}} {num = 3
188+ // CHECK: amdgpu .async_wait {{.*}}, {{.*}} {num_inst = 3
189189 %token1 = ttg.async_wait %arg15 , %arg17 {num = 2 : i32 }
190190 scf.yield %token1 : !ttg.async.token
191191 } else {
192192 // We only wait on the token of the first load so we can interleave one more load => 3 + 2
193- // CHECK: ttg .async_wait {{.*}} {num = 5
193+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 5
194194 %token2 = ttg.async_wait %arg15 {num = 1 : i32 }
195195 scf.yield %token2 : !ttg.async.token
196196 }
@@ -200,7 +200,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
200200 %14 = ttg.async_commit_group tokens %13
201201 scf.yield %arg16 , %12 , %arg18 , %14 : !ttg.async.token , !ttg.async.token , !ttg.async.token , !ttg.async.token
202202 }
203- // CHECK: ttg .async_wait {{.*}}, {{.*}} {num = 0
203+ // CHECK: amdgpu .async_wait {{.*}}, {{.*}} {num_inst = 0
204204 %9 = ttg.async_wait %8#0 , %8#1 , %8#2 , %8#3 {num = 0 : i32 }
205205 tt.return
206206 }
@@ -235,7 +235,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
235235 %cond_load = ttg.async_copy_global_to_local %arg4 , %arg2 : tensor <16 x256 x!tt.ptr <f16 >, #blocked1 > -> <16 x256 xf16 , #shared1 , #smem , mutable >
236236 %cond_load_commit = ttg.async_commit_group tokens %cond_load
237237 // We wait on both tokens (3) and additionally we should count the load inside our block (+2) => 5
238- // CHECK: ttg .async_wait {{.*}}, {{.*}} {num = 5
238+ // CHECK: amdgpu .async_wait {{.*}}, {{.*}} {num_inst = 5
239239 %token1 = ttg.async_wait %arg15 , %arg17 {num = 2 : i32 }
240240 scf.yield %token1 : !ttg.async.token
241241 } else {
@@ -247,7 +247,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
247247 %14 = ttg.async_commit_group tokens %13
248248 scf.yield %arg16 , %12 , %arg18 , %14 : !ttg.async.token , !ttg.async.token , !ttg.async.token , !ttg.async.token
249249 }
250- // CHECK: ttg .async_wait {{.*}}, {{.*}} {num = 0
250+ // CHECK: amdgpu .async_wait {{.*}}, {{.*}} {num_inst = 0
251251 %9 = ttg.async_wait %8#0 , %8#1 , %8#2 , %8#3 {num = 0 : i32 }
252252 tt.return
253253 }
@@ -279,7 +279,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
279279 %7 = ttg.async_commit_group tokens %6
280280 %8:4 = scf.for %arg14 = %c0_i32 to %arg0 step %c1_i32 iter_args (%arg15 = %1 , %arg16 = %5 , %arg17 = %3 , %arg18 = %7 ) -> (!ttg.async.token , !ttg.async.token , !ttg.async.token , !ttg.async.token ) : i32 {
281281 // The then block contains 3 instructions and the else block 1, so we expect the count to be 3 (1 + 2): the else block's 1 plus the 2 instructions outside the scf.if in the loop body
282- // CHECK: ttg .async_wait {{.*}}, {{.*}} {num = 3
282+ // CHECK: amdgpu .async_wait {{.*}}, {{.*}} {num_inst = 3
283283 %token1 = ttg.async_wait %arg15 , %arg17 {num = 2 : i32 }
284284
285285 %103 = scf.if %cond -> (!ttg.async.token ) {
@@ -296,7 +296,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
296296 %14 = ttg.async_commit_group tokens %13
297297 scf.yield %arg16 , %103 , %arg18 , %14 : !ttg.async.token , !ttg.async.token , !ttg.async.token , !ttg.async.token
298298 }
299- // CHECK: ttg .async_wait {{.*}}, {{.*}} {num = 0
299+ // CHECK: amdgpu .async_wait {{.*}}, {{.*}} {num_inst = 0
300300 %9 = ttg.async_wait %8#0 , %8#1 , %8#2 , %8#3 {num = 0 : i32 }
301301 tt.return
302302 }
@@ -323,14 +323,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
323323 %7 = ttg.async_commit_group tokens %6
324324 // Dynamic iteration count so we should not count its body
325325 %30 = scf.for %arg21 = %c0_i32 to %arg0 step %c1_i32 iter_args (%arg30 = %6 ) -> (!ttg.async.token ) : i32 {
326- // CHECK: ttg .async_wait {{.*}} {num = 0
326+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 0
327327 %31 = ttg.async_wait %arg30 {num = 1 : i32 }
328328 // Emits 1 direct to lds instruction
329329 %32 = ttg.async_copy_global_to_local %arg3 , %arg1 : tensor <128 x16 x!tt.ptr <f16 >, #blocked > -> <128 x16 xf16 , #shared , #smem , mutable >
330330 %33 = ttg.async_commit_group tokens %32
331331 scf.yield %33 : !ttg.async.token
332332 }
333- // CHECK: ttg .async_wait {{.*}} {num = 1
333+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 1
334334 %10 = ttg.async_wait %1 {num = 1 : i32 }
335335 tt.return
336336 }
@@ -357,14 +357,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
357357 %7 = ttg.async_commit_group tokens %6
358358 // Loop with 4 iterations => 4 instructions
359359 %30 = scf.for %arg21 = %c0_i32 to %c4_i32 step %c1_i32 iter_args (%arg30 = %6 ) -> (!ttg.async.token ) : i32 {
360- // CHECK: ttg .async_wait {{.*}} {num = 0
360+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 0
361361 %31 = ttg.async_wait %arg30 {num = 1 : i32 }
362362 // Emits 1 direct to lds instruction
363363 %32 = ttg.async_copy_global_to_local %arg3 , %arg1 : tensor <128 x16 x!tt.ptr <f16 >, #blocked > -> <128 x16 xf16 , #shared , #smem , mutable >
364364 %33 = ttg.async_commit_group tokens %32
365365 scf.yield %33 : !ttg.async.token
366366 }
367- // CHECK: ttg .async_wait {{.*}} {num = 5
367+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 5
368368 %10 = ttg.async_wait %1 {num = 1 : i32 }
369369 tt.return
370370 }
@@ -397,10 +397,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
397397
398398 // Check that we do not take other TDM loads into account (they use a different HW counter)
399399
400- // CHECK: ttg .async_wait {{.*}} {num = 2
400+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 2
401401 %cw1 = ttg.async_wait %21 {num = 0 : i32 }
402402
403- // CHECK: ttg .async_wait {{.*}} {num = 0
403+ // CHECK: amdgpu .async_wait {{.*}} {num_inst = 0
404404 %cw2 = ttg.async_wait %51 {num = 0 : i32 }
405405 tt.return
406406 }
0 commit comments