@@ -888,11 +888,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
     // CHECK: llvm.mlir.addressof @global_smem
     // CHECK: llvm.store {{.*}} vector<4xi32>
     // CHECK: nvvm.bar.warp.sync
-    // CHECK: nvgpu.ldmatrix %{{.*}} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
+    // CHECK: nvvm.ldmatrix %{{.*}} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
     // CHECK: nvvm.bar.warp.sync
     // CHECK: llvm.store {{.*}} vector<4xi32>
     // CHECK: nvvm.bar.warp.sync
-    // CHECK: nvgpu.ldmatrix %{{.*}} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
+    // CHECK: nvvm.ldmatrix %{{.*}} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
     %0 = ttg.convert_layout %arg0 : tensor<32x32xf32, #blocked0> -> tensor<32x32xf32, #blocked1>
     tt.return
   }
@@ -911,9 +911,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
   tt.func @convert_dot_ldmatrix(%A: tensor<16x16xf16, #blocked0>, %B: tensor<16x16xf16, #blocked0>) {
     %AA = ttg.local_alloc %A : (tensor<16x16xf16, #blocked0>) -> !ttg.memdesc<16x16xf16, #shared0, #smem>
     %BB = ttg.local_alloc %B : (tensor<16x16xf16, #blocked0>) -> !ttg.memdesc<16x16xf16, #shared0, #smem>
-    // CHECK: nvgpu.ldmatrix %{{.*}} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
-    // CHECK: nvgpu.ldmatrix %{{.*}} {trans} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
-    // CHECK-NOT: nvgpu.ldmatrix
+    // CHECK: nvvm.ldmatrix %{{.*}} {eltType = #nvvm.ld_st_matrix_elt_type<b16>, layout = #nvvm.mma_layout<row>, num = 4 : i32, shape = #nvvm.ld_st_matrix_shape<m = 8, n = 8>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
+    // CHECK: nvvm.ldmatrix %{{.*}} {eltType = #nvvm.ld_st_matrix_elt_type<b16>, layout = #nvvm.mma_layout<col>, num = 4 : i32, shape = #nvvm.ld_st_matrix_shape<m = 8, n = 8>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
+    // CHECK-NOT: nvvm.ldmatrix
     %AA_DOT = ttg.local_load %AA : !ttg.memdesc<16x16xf16, #shared0, #smem> -> tensor<16x16xf16, #dot_operand_a>
     %BB_DOT = ttg.local_load %BB : !ttg.memdesc<16x16xf16, #shared0, #smem> -> tensor<16x16xf16, #dot_operand_b>
     %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma0>
@@ -941,9 +941,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
   tt.func @convert_dot_ldmatrix_swizzle(%A: tensor<16x16xf16, #blocked0>, %B: tensor<16x16xf16, #blocked0>) {
     %AA = ttg.local_alloc %A : (tensor<16x16xf16, #blocked0>) -> !ttg.memdesc<16x16xf16, #shared0, #smem>
     %BB = ttg.local_alloc %B : (tensor<16x16xf16, #blocked0>) -> !ttg.memdesc<16x16xf16, #shared0, #smem>
-    // CHECK: nvgpu.ldmatrix %{{.*}}, m8n8, 16 : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
-    // CHECK: nvgpu.ldmatrix %{{.*}}, m8n8, 16 {trans} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
-    // CHECK-NOT: nvgpu.ldmatrix
+    // CHECK: nvvm.ldmatrix %{{.*}} {eltType = #nvvm.ld_st_matrix_elt_type<b16>, layout = #nvvm.mma_layout<row>, num = 4 : i32, shape = #nvvm.ld_st_matrix_shape<m = 8, n = 8>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
+    // CHECK: nvvm.ldmatrix %{{.*}} {eltType = #nvvm.ld_st_matrix_elt_type<b16>, layout = #nvvm.mma_layout<col>, num = 4 : i32, shape = #nvvm.ld_st_matrix_shape<m = 8, n = 8>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
+    // CHECK-NOT: nvvm.ldmatrix
     %AA_DOT = ttg.local_load %AA : !ttg.memdesc<16x16xf16, #shared0, #smem> -> tensor<16x16xf16, #dot_operand_a>
     %BB_DOT = ttg.local_load %BB : !ttg.memdesc<16x16xf16, #shared0, #smem> -> tensor<16x16xf16, #dot_operand_b>
     %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma0>
@@ -971,7 +971,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
   tt.func @convert_dot(%A: tensor<16x16xf16, #blocked0>, %B: tensor<16x16xf16, #blocked0>) {
     %AA = ttg.local_alloc %A : (tensor<16x16xf16, #blocked0>) -> !ttg.memdesc<16x16xf16, #shared0, #smem>
     %BB = ttg.local_alloc %B : (tensor<16x16xf16, #blocked0>) -> !ttg.memdesc<16x16xf16, #shared0, #smem>
-    // CHECK-NOT: nvgpu.ldmatrix
+    // CHECK-NOT: nvvm.ldmatrix
     %AA_DOT = ttg.local_load %AA : !ttg.memdesc<16x16xf16, #shared0, #smem> -> tensor<16x16xf16, #dot_operand_a>
     %BB_DOT = ttg.local_load %BB : !ttg.memdesc<16x16xf16, #shared0, #smem> -> tensor<16x16xf16, #dot_operand_b>
     %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma0>
@@ -999,7 +999,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
   tt.func @convert_dot_mmav3_shared(%A: tensor<64x64xf16, #blocked0>, %B: tensor<64x64xf16, #blocked0>) {
     %AA = ttg.local_alloc %A : (tensor<64x64xf16, #blocked0>) -> !ttg.memdesc<64x64xf16, #shared0, #smem>
     %BB = ttg.local_alloc %B : (tensor<64x64xf16, #blocked0>) -> !ttg.memdesc<64x64xf16, #shared0, #smem>
-    // CHECK-COUNT-32: nvgpu.ldmatrix %{{.*}} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
+    // CHECK-COUNT-32: nvvm.ldmatrix %{{.*}} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
     %AA_DOT = ttg.local_load %AA : !ttg.memdesc<64x64xf16, #shared0, #smem> -> tensor<64x64xf16, #dot_operand_a>
     %BB_DOT = ttg.local_load %BB : !ttg.memdesc<64x64xf16, #shared0, #smem> -> tensor<64x64xf16, #dot_operand_b>
     %cst0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma0>
@@ -1023,8 +1023,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
   tt.func @convert_dot_fp8(%A: tensor<16x16xf8E5M2, #blocked0>, %B: tensor<16x16xf8E5M2, #blocked0>) {
     %AA = ttg.local_alloc %A : (tensor<16x16xf8E5M2, #blocked0>) -> !ttg.memdesc<16x16xf8E5M2, #shared0, #smem>
     %BB = ttg.local_alloc %B : (tensor<16x16xf8E5M2, #blocked0>) -> !ttg.memdesc<16x16xf8E5M2, #shared0, #smem>
-    // CHECK: nvgpu.ldmatrix %{{.*}}, m8n8, 16 : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)>
-    // CHECK-NOT: nvgpu.ldmatrix
+    // CHECK: nvvm.ldmatrix %{{.*}} {eltType = #nvvm.ld_st_matrix_elt_type<b16>, layout = #nvvm.mma_layout<row>, num = 2 : i32, shape = #nvvm.ld_st_matrix_shape<m = 8, n = 8>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)>
+    // CHECK-NOT: nvvm.ldmatrix
     %AA_DOT = ttg.local_load %AA : !ttg.memdesc<16x16xf8E5M2, #shared0, #smem> -> tensor<16x16xf8E5M2, #dot_operand_a>
     %BB_DOT = ttg.local_load %BB : !ttg.memdesc<16x16xf8E5M2, #shared0, #smem> -> tensor<16x16xf8E5M2, #dot_operand_b>
     %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma0>
@@ -1355,7 +1355,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   tt.func @matmul_kernel_dot_operand_layout(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32},
     %a: !ttg.memdesc<128x32xf16, #shared, #smem>, %b: !ttg.memdesc<32x256xf16, #shared, #smem>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma>
-    // CHECK: nvgpu.ldmatrix
+    // CHECK: nvvm.ldmatrix
     %a_mat = ttg.local_load %a : !ttg.memdesc<128x32xf16, #shared, #smem> -> tensor<128x32xf16, #dot_operand_a>
     %b_mat = ttg.local_load %b : !ttg.memdesc<32x256xf16, #shared, #smem> -> tensor<32x256xf16, #dot_operand_b>
 
@@ -1431,9 +1431,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   tt.func @matmul_tf32dot(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32},
     %a: !ttg.memdesc<32x16xf32, #shared, #smem>, %b: !ttg.memdesc<16x32xf32, #shared, #smem>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
-    // CHECK: nvgpu.ldmatrix
+    // CHECK: nvvm.ldmatrix
     // CHECK-SAME: (i32, i32, i32, i32)
-    // CHECK: nvgpu.ldmatrix
+    // CHECK: nvvm.ldmatrix
     // CHECK-SAME: (i32, i32, i32, i32)
     %a_mat = ttg.local_load %a : !ttg.memdesc<32x16xf32, #shared, #smem> -> tensor<32x16xf32, #dot_operand_a>
     %b_mat = ttg.local_load %b : !ttg.memdesc<16x32xf32, #shared, #smem> -> tensor<16x32xf32, #dot_operand_b>
@@ -1936,8 +1936,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     %f16_shared = ttg.local_alloc %f16_inp : (tensor<16x16xf16, #blocked0>) -> !ttg.memdesc<16x16xf16, #shared0, #smem>
     %i16_shared = ttg.local_alloc %i16_inp : (tensor<16x16xi16, #blocked0>) -> !ttg.memdesc<16x16xi16, #shared0, #smem>
 
-    // CHECK: nvgpu.ldmatrix
-    // CHECK: nvgpu.ldmatrix
+    // CHECK: nvvm.ldmatrix
+    // CHECK: nvvm.ldmatrix
 
     %f16_dot = ttg.local_load %f16_shared : !ttg.memdesc<16x16xf16, #shared0, #smem> -> tensor<16x16xf16, #dot_operand_a>
     %i16_dot = ttg.local_load %i16_shared : !ttg.memdesc<16x16xi16, #shared0, #smem> -> tensor<16x16xi16, #dot_operand_b>