@@ -880,9 +880,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
880
880
tt.func @convert_dot_ldmatrix (%A: tensor <16 x16 xf16 , #blocked0 >, %B: tensor <16 x16 xf16 , #blocked0 >) {
881
881
%AA = ttg.local_alloc %A : (tensor <16 x16 xf16 , #blocked0 >) -> !ttg.memdesc <16 x16 xf16 , #shared0 , #smem >
882
882
%BB = ttg.local_alloc %B : (tensor <16 x16 xf16 , #blocked0 >) -> !ttg.memdesc <16 x16 xf16 , #shared0 , #smem >
883
- // CHECK: nvvm .ldmatrix %{{.*}} {layout = #nvvm.mma_layout<row>, num = 4 : i32 } : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
884
- // CHECK: nvvm .ldmatrix %{{.*}} {layout = #nvvm.mma_layout<col>, num = 4 : i32 } : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
885
- // CHECK-NOT: nvvm .ldmatrix
883
+ // CHECK: nvgpu.ldmatrix %{{.*}} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
884
+ // CHECK: nvgpu.ldmatrix %{{.*}} {trans} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
885
+ // CHECK-NOT: nvgpu.ldmatrix
886
886
%AA_DOT = ttg.local_load %AA : !ttg.memdesc <16 x16 xf16 , #shared0 , #smem > -> tensor <16 x16 xf16 , #dot_operand_a >
887
887
%BB_DOT = ttg.local_load %BB : !ttg.memdesc <16 x16 xf16 , #shared0 , #smem > -> tensor <16 x16 xf16 , #dot_operand_b >
888
888
%cst0 = arith.constant dense <0.000000e+00 > : tensor <16 x16 xf32 , #mma0 >
@@ -910,9 +910,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
910
910
tt.func @convert_dot_ldmatrix_swizzle (%A: tensor <16 x16 xf16 , #blocked0 >, %B: tensor <16 x16 xf16 , #blocked0 >) {
911
911
%AA = ttg.local_alloc %A : (tensor <16 x16 xf16 , #blocked0 >) -> !ttg.memdesc <16 x16 xf16 , #shared0 , #smem >
912
912
%BB = ttg.local_alloc %B : (tensor <16 x16 xf16 , #blocked0 >) -> !ttg.memdesc <16 x16 xf16 , #shared0 , #smem >
913
- // CHECK: nvvm .ldmatrix %{{.*}} {layout = #nvvm.mma_layout<row>, num = 4 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
914
- // CHECK: nvvm .ldmatrix %{{.*}} {layout = #nvvm.mma_layout<col>, num = 4 : i32 } : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
915
- // CHECK-NOT: nvvm .ldmatrix
913
+ // CHECK: nvgpu.ldmatrix %{{.*}}, m8n8, 16 : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
914
+ // CHECK: nvgpu.ldmatrix %{{.*}}, m8n8, 16 {trans} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
915
+ // CHECK-NOT: nvgpu.ldmatrix
916
916
%AA_DOT = ttg.local_load %AA : !ttg.memdesc <16 x16 xf16 , #shared0 , #smem > -> tensor <16 x16 xf16 , #dot_operand_a >
917
917
%BB_DOT = ttg.local_load %BB : !ttg.memdesc <16 x16 xf16 , #shared0 , #smem > -> tensor <16 x16 xf16 , #dot_operand_b >
918
918
%cst0 = arith.constant dense <0.000000e+00 > : tensor <16 x16 xf32 , #mma0 >
@@ -940,7 +940,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
940
940
tt.func @convert_dot (%A: tensor <16 x16 xf16 , #blocked0 >, %B: tensor <16 x16 xf16 , #blocked0 >) {
941
941
%AA = ttg.local_alloc %A : (tensor <16 x16 xf16 , #blocked0 >) -> !ttg.memdesc <16 x16 xf16 , #shared0 , #smem >
942
942
%BB = ttg.local_alloc %B : (tensor <16 x16 xf16 , #blocked0 >) -> !ttg.memdesc <16 x16 xf16 , #shared0 , #smem >
943
- // CHECK-NOT: nvvm .ldmatrix
943
+ // CHECK-NOT: nvgpu.ldmatrix
944
944
%AA_DOT = ttg.local_load %AA : !ttg.memdesc <16 x16 xf16 , #shared0 , #smem > -> tensor <16 x16 xf16 , #dot_operand_a >
945
945
%BB_DOT = ttg.local_load %BB : !ttg.memdesc <16 x16 xf16 , #shared0 , #smem > -> tensor <16 x16 xf16 , #dot_operand_b >
946
946
%cst0 = arith.constant dense <0.000000e+00 > : tensor <16 x16 xf32 , #mma0 >
@@ -968,8 +968,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
968
968
tt.func @convert_dot_mmav3_shared (%A: tensor <64 x64 xf16 , #blocked0 >, %B: tensor <64 x64 xf16 , #blocked0 >) {
969
969
%AA = ttg.local_alloc %A : (tensor <64 x64 xf16 , #blocked0 >) -> !ttg.memdesc <64 x64 xf16 , #shared0 , #smem >
970
970
%BB = ttg.local_alloc %B : (tensor <64 x64 xf16 , #blocked0 >) -> !ttg.memdesc <64 x64 xf16 , #shared0 , #smem >
971
- // CHECK-COUNT-16: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout<row>, num = 4 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
972
- // CHECK-COUNT-16: nvvm.ldmatrix %{{.*}} {layout = #nvvm.mma_layout<col>, num = 4 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
971
+ // CHECK-COUNT-32: nvgpu.ldmatrix %{{.*}} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
973
972
%AA_DOT = ttg.local_load %AA : !ttg.memdesc <64 x64 xf16 , #shared0 , #smem > -> tensor <64 x64 xf16 , #dot_operand_a >
974
973
%BB_DOT = ttg.local_load %BB : !ttg.memdesc <64 x64 xf16 , #shared0 , #smem > -> tensor <64 x64 xf16 , #dot_operand_b >
975
974
%cst0 = arith.constant dense <0.000000e+00 > : tensor <64 x64 xf32 , #mma0 >
@@ -993,8 +992,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
993
992
tt.func @convert_dot_fp8 (%A: tensor <16 x16 xf8 E5 M2 , #blocked0 >, %B: tensor <16 x16 xf8 E5 M2 , #blocked0 >) {
994
993
%AA = ttg.local_alloc %A : (tensor <16 x16 xf8 E5 M2 , #blocked0 >) -> !ttg.memdesc <16 x16 xf8 E5 M2 , #shared0 , #smem >
995
994
%BB = ttg.local_alloc %B : (tensor <16 x16 xf8 E5 M2 , #blocked0 >) -> !ttg.memdesc <16 x16 xf8 E5 M2 , #shared0 , #smem >
996
- // CHECK: nvvm .ldmatrix %{{.*}} {layout = #nvvm.mma_layout<row>, num = 2 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)>
997
- // CHECK-NOT: nvvm .ldmatrix
995
+ // CHECK: nvgpu.ldmatrix %{{.*}}, m8n8, 16 : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)>
996
+ // CHECK-NOT: nvgpu.ldmatrix
998
997
%AA_DOT = ttg.local_load %AA : !ttg.memdesc <16 x16 xf8 E5 M2 , #shared0 , #smem > -> tensor <16 x16 xf8 E5 M2 , #dot_operand_a >
999
998
%BB_DOT = ttg.local_load %BB : !ttg.memdesc <16 x16 xf8 E5 M2 , #shared0 , #smem > -> tensor <16 x16 xf8 E5 M2 , #dot_operand_b >
1000
999
%cst0 = arith.constant dense <0.000000e+00 > : tensor <16 x16 xf32 , #mma0 >
@@ -1325,7 +1324,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
1325
1324
tt.func @matmul_kernel_dot_operand_layout (%ptr: !tt.ptr <f32 > {tt.divisibility = 16 : i32 },
1326
1325
%a: !ttg.memdesc <128 x32 xf16 , #shared , #smem >, %b: !ttg.memdesc <32 x256 xf16 , #shared , #smem >) {
1327
1326
%cst = arith.constant dense <0.000000e+00 > : tensor <128 x256 xf32 , #mma >
1328
- // CHECK: nvvm .ldmatrix
1327
+ // CHECK: nvgpu.ldmatrix
1329
1328
%a_mat = ttg.local_load %a : !ttg.memdesc <128 x32 xf16 , #shared , #smem > -> tensor <128 x32 xf16 , #dot_operand_a >
1330
1329
%b_mat = ttg.local_load %b : !ttg.memdesc <32 x256 xf16 , #shared , #smem > -> tensor <32 x256 xf16 , #dot_operand_b >
1331
1330
@@ -1401,9 +1400,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
1401
1400
tt.func @matmul_tf32dot (%ptr: !tt.ptr <f32 > {tt.divisibility = 16 : i32 },
1402
1401
%a: !ttg.memdesc <32 x16 xf32 , #shared , #smem >, %b: !ttg.memdesc <16 x32 xf32 , #shared , #smem >) {
1403
1402
%cst = arith.constant dense <0.000000e+00 > : tensor <32 x32 xf32 , #mma >
1404
- // CHECK: nvvm .ldmatrix
1403
+ // CHECK: nvgpu.ldmatrix
1405
1404
// CHECK-SAME: (i32, i32, i32, i32)
1406
- // CHECK: nvvm .ldmatrix
1405
+ // CHECK: nvgpu.ldmatrix
1407
1406
// CHECK-SAME: (i32, i32, i32, i32)
1408
1407
%a_mat = ttg.local_load %a : !ttg.memdesc <32 x16 xf32 , #shared , #smem > -> tensor <32 x16 xf32 , #dot_operand_a >
1409
1408
%b_mat = ttg.local_load %b : !ttg.memdesc <16 x32 xf32 , #shared , #smem > -> tensor <16 x32 xf32 , #dot_operand_b >
@@ -1876,8 +1875,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
1876
1875
%f16_shared = ttg.local_alloc %f16_inp : (tensor <16 x16 xf16 , #blocked0 >) -> !ttg.memdesc <16 x16 xf16 , #shared0 , #smem >
1877
1876
%i16_shared = ttg.local_alloc %i16_inp : (tensor <16 x16 xi16 , #blocked0 >) -> !ttg.memdesc <16 x16 xi16 , #shared0 , #smem >
1878
1877
1879
- // CHECK: nvvm .ldmatrix
1880
- // CHECK: nvvm .ldmatrix
1878
+ // CHECK: nvgpu.ldmatrix
1879
+ // CHECK: nvgpu.ldmatrix
1881
1880
1882
1881
%f16_dot = ttg.local_load %f16_shared : !ttg.memdesc <16 x16 xf16 , #shared0 , #smem > -> tensor <16 x16 xf16 , #dot_operand_a >
1883
1882
%i16_dot = ttg.local_load %i16_shared : !ttg.memdesc <16 x16 xi16 , #shared0 , #smem > -> tensor <16 x16 xi16 , #dot_operand_b >
0 commit comments