@@ -203,7 +203,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: convert_mma_to_blocked
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @convert_mma_to_blocked(%a: tensor<128x256xf16, #mma>) {
-    // CHECK-COUNT-16: nvgpu.stmatrix
+    // CHECK-COUNT-16: nvvm.stmatrix
     // CHECK: nvvm.barrier0
     %c = ttg.convert_layout %a : tensor<128x256xf16, #mma> -> tensor<128x256xf16, #blocked>
     tt.return
@@ -254,7 +254,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: distribute_to_shared_st_matrix
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @distribute_to_shared_st_matrix(%a: tensor<128x128xf16, #mma>) {
-    // CHECK-COUNT-16: nvgpu.stmatrix
+    // CHECK-COUNT-16: nvvm.stmatrix
     // CHECK: llvm.return
     %b = ttg.local_alloc %a {allocation.offset = 0 : i32} : (tensor<128x128xf16, #mma>) -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
     tt.return
@@ -269,7 +269,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: distribute_to_shared_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @distribute_to_shared_st_matrix_local_store(%a: tensor<128x128xf16, #mma>) {
-    // CHECK-COUNT-16: nvgpu.stmatrix
+    // CHECK-COUNT-16: nvvm.stmatrix
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<128x128xf16, #mma> -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
@@ -285,7 +285,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: distribute_to_shared_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @distribute_to_shared_st_matrix_local_store(%a: tensor<64x128xf16, #linear>) {
-    // CHECK-COUNT-8: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {trans}
+    // CHECK-COUNT-8: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<col>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x128xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x128xf16, #linear> -> !ttg.memdesc<64x128xf16, #shared, #smem, mutable>
@@ -301,7 +301,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: distribute_to_swizzled_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @distribute_to_swizzled_st_matrix_local_store(%a: tensor<8x64xf16, #mma>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix
+    // CHECK-COUNT-2: nvvm.stmatrix
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<8x64xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<8x64xf16, #mma> -> !ttg.memdesc<8x64xf16, #shared, #smem, mutable>
@@ -317,7 +317,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_local_store(%a: tensor<64x32xf16, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+    // CHECK-COUNT-2: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<row>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x32xf16, #linear> -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
@@ -339,7 +339,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_local_store(%a: tensor<32x32xf16, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+    // CHECK-COUNT-2: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<row>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<32x32xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<32x32xf16, #linear> -> !ttg.memdesc<32x32xf16, #shared, #smem, mutable>
@@ -355,7 +355,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_x2_local_store_fp8
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_x2_local_store_fp8(%a: tensor<64x16xf8E4M3FNUZ, #linear>) {
-    // CHECK-COUNT-1: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}} :
+    // CHECK-COUNT-1: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<row>} :
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x16xf8E4M3FNUZ, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x16xf8E4M3FNUZ, #linear> -> !ttg.memdesc<64x16xf8E4M3FNUZ, #shared, #smem, mutable>
@@ -371,7 +371,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_local_store_fp32
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_local_store_fp32(%a: tensor<64x16xf32, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+    // CHECK-COUNT-2: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<row>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x16xf32, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x16xf32, #linear> -> !ttg.memdesc<64x16xf32, #shared, #smem, mutable>
@@ -388,7 +388,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_trans_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_trans_local_store(%a: tensor<64x32xf16, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {trans}
+    // CHECK-COUNT-2: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<col>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<64x32xf16, #linear> -> !ttg.memdesc<64x32xf16, #shared, #smem, mutable>
@@ -410,7 +410,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 // CHECK-LABEL: linear_to_swizzled_st_matrix_trans_local_store
 module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @linear_to_swizzled_st_matrix_trans_local_store(%a: tensor<16x32xf16, #linear>) {
-    // CHECK-COUNT-2: nvgpu.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {trans}
+    // CHECK-COUNT-2: nvvm.stmatrix %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} {layout = #nvvm.mma_layout<col>}
     // CHECK: llvm.return
     %b = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable>
     ttg.local_store %a, %b : tensor<16x32xf16, #linear> -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable>