@@ -203,7 +203,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: convert_mma_to_blocked
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @convert_mma_to_blocked(%a: tensor<128x256xf16, #mma>) {
-    // CHECK-COUNT-16: nvgpu.stmatrix
+    // CHECK-COUNT-16: nvvm.stmatrix
     // CHECK: nvvm.barrier0
     %c = ttg.convert_layout %a : tensor<128x256xf16, #mma> -> tensor<128x256xf16, #blocked>
     tt.return
@@ -254,7 +254,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: distribute_to_shared_st_matrix
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @distribute_to_shared_st_matrix(%a: tensor<128x128xf16, #mma>) {
-    // CHECK-COUNT-16: nvgpu.stmatrix
+    // CHECK-COUNT-16: nvvm.stmatrix
     // CHECK: llvm.return
     %b = ttg.local_alloc %a {allocation.offset = 0 : i32} : (tensor<128x128xf16, #mma>) -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
     tt.return
@@ -269,7 +269,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: distribute_to_shared_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @distribute_to_shared_st_matrix_local_store(%a: tensor<128x128xf16, #mma>) {
-    // CHECK-COUNT-16: nvgpu.stmatrix
+    // CHECK-COUNT-16: nvvm.stmatrix
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<128x128xf16, #mma> -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
@@ -285,7 +285,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: distribute_to_shared_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @distribute_to_shared_st_matrix_local_store(%a: tensor<64x128xf16, #linear>) {
-    // CHECK-COUNT-8: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {trans}
+    // CHECK-COUNT-8: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<col>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x128xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x128xf16, #linear> -> !ttg.memdesc<64x128xf16, #shared, #smem, mutable>
@@ -301,7 +301,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: distribute_to_swizzled_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @distribute_to_swizzled_st_matrix_local_store(%a: tensor<8x64xf16, #mma>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix
+    // CHECK-COUNT-2: nvvm.stmatrix
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<8x64xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<8x64xf16, #mma> -> !ttg.memdesc<8x64xf16, #shared, #smem, mutable>
@@ -317,7 +317,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_local_store(%a: tensor<64x32xf16, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+    // CHECK-COUNT-2: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<row>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x32xf16, #linear> -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
@@ -339,7 +339,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_local_store(%a: tensor<32x32xf16, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+    // CHECK-COUNT-2: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<row>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<32x32xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<32x32xf16, #linear> -> !ttg.memdesc<32x32xf16, #shared, #smem, mutable>
@@ -355,7 +355,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_x2_local_store_fp8
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_x2_local_store_fp8(%a: tensor<64x16xf8E4M3FNUZ, #linear>) {
-    // CHECK-COUNT-1: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}} :
+    // CHECK-COUNT-1: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<row>} :
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x16xf8E4M3FNUZ, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x16xf8E4M3FNUZ, #linear> -> !ttg.memdesc<64x16xf8E4M3FNUZ, #shared, #smem, mutable>
@@ -371,7 +371,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_local_store_fp32
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_local_store_fp32(%a: tensor<64x16xf32, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+    // CHECK-COUNT-2: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<row>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x16xf32, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x16xf32, #linear> -> !ttg.memdesc<64x16xf32, #shared, #smem, mutable>
@@ -388,7 +388,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_trans_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_trans_local_store(%a: tensor<64x32xf16, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {trans}
+    // CHECK-COUNT-2: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<col>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x32xf16, #linear> -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
@@ -410,7 +410,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_trans_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_trans_local_store(%a: tensor<16x32xf16, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {trans}
+    // CHECK-COUNT-2: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<col>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<16x32xf16, #linear> -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable>