@@ -19,13 +19,13 @@ module attributes {"ttg.num-ctas" = 4 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
   // CHECK-LABEL: @dot_high_precision_acc
   tt.func @dot_high_precision_acc(%a: !ttg.memdesc<128x128xf8E5M2, #shared, #smem>, %b: !ttg.memdesc<128x256xf8E5M2, #shared1, #smem>, %c: tensor<128x256xf32, #mma>) {
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-COUNT-128: llvm.fadd
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-COUNT-128: llvm.fadd
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-COUNT-128: llvm.fadd
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-COUNT-128: llvm.fadd
     %m = ttng.warp_group_dot %a, %b, %c
       {maxNumImpreciseAcc = 32 : i32, inputPrecision = 0 : i32} :
@@ -43,13 +43,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
   // CHECK-LABEL: @dot_low_precision_acc
   tt.func @dot_low_precision_acc(%a: !ttg.memdesc<128x128xf8E5M2, #shared, #smem>, %b: !ttg.memdesc<128x256xf8E5M2, #shared1, #smem>, %c: tensor<128x256xf32, #mma>) {
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-NOT: llvm.fadd
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-NOT: llvm.fadd
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-NOT: llvm.fadd
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-NOT: llvm.fadd
     // CHECK: llvm.return
     %m = ttng.warp_group_dot %a, %b, %c
@@ -68,13 +68,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
   // CHECK-LABEL: @dot_mix_precision_acc
   tt.func @dot_mix_precision_acc(%a: !ttg.memdesc<128x128xf8E5M2, #shared, #smem>, %b: !ttg.memdesc<128x256xf8E5M2, #shared1, #smem>, %c: tensor<128x256xf32, #mma>) {
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-NOT: llvm.fadd
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-COUNT-128: llvm.fadd
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-NOT: llvm.fadd
-    // CHECK: nvgpu.wgmma
+    // CHECK: nvg.wgmma
     // CHECK-COUNT-128: llvm.fadd
     // CHECK: llvm.return
     %m = ttng.warp_group_dot %a, %b, %c
@@ -97,7 +97,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.tar
     %acc: tensor<256x512xf32, #mma>) {
     %res = ttng.warp_group_dot %a, %b, %acc {inputPrecision = 0 : i32, isAsync = true} :
       !ttg.memdesc<256x128xbf16, #shared, #smem> * !ttg.memdesc<128x512xbf16, #shared, #smem> -> tensor<256x512xf32, #mma>
-    // CHECK: nvgpu.wgmma {{.*}} k = 16 : i32, layoutA = 1 : i32, layoutB = 1 : i32, m = 64 : i32, n = 256 : i32}
+    // CHECK: nvg.wgmma {{.*}} k = 16 : i32, layoutA = 1 : i32, layoutB = 1 : i32, m = 64 : i32, n = 256 : i32}
     tt.return
   }
 }
@@ -111,7 +111,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.tar
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: @dot_zero_acc
   // Generate a wgmma with 2 sources.
-  // CHECK: nvgpu.wgmma %{{.*}}, %{{.*}} {
+  // CHECK: nvg.wgmma %{{.*}}, %{{.*}} {
   tt.func @dot_zero_acc(%a: !ttg.memdesc<128x64xf16, #shared, #smem>, %b: !ttg.memdesc<64x64xf16, #shared1, #smem>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
     %m = ttng.warp_group_dot %a, %b, %cst {inputPrecision = 0 : i32, maxNumImpreciseAcc = 0 : i32} :
@@ -120,7 +120,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   }

   // CHECK-LABEL: @wgmma_on_subtile
-  // CHECK: nvgpu.wgmma %{{.*}}, %{{.*}}
+  // CHECK: nvg.wgmma %{{.*}}, %{{.*}}
   tt.func @wgmma_on_subtile(%a: tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %b: !ttg.memdesc<16x256xf16, #shared1, #smem, mutable, 3x64x256>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma>
     %m = ttng.warp_group_dot %a, %b, %cst {inputPrecision = 0 : i32, isAsync = true} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<16x256xf16, #shared1, #smem, mutable, 3x64x256> -> tensor<128x256xf32, #mma>
@@ -136,8 +136,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: @dot_reg_operand_A
   // Generate a wgmma where the first operand is a struct.
-  // CHECK: nvgpu.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
-  // CHECK: nvgpu.wgmma_wait_group %{{.*}} {pendings = 0 : i32} : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+  // CHECK: nvg.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+  // CHECK: nvg.wgmma_wait_group %{{.*}} {pendings = 0 : i32} : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
   tt.func @dot_reg_operand_A(%a: tensor<128x64xf16, #mma>, %b: !ttg.memdesc<64x64xf16, #shared, #smem>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
     %opA = ttg.convert_layout %a : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
@@ -156,8 +156,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
   // CHECK-LABEL: @dot_reg_operand_A_fp8
   // Generate a wgmma where the first operand is a struct.
-  // CHECK: nvgpu.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
-  // CHECK: nvgpu.wgmma_wait_group %{{.*}} {pendings = 0 : i32}
+  // CHECK: nvg.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+  // CHECK: nvg.wgmma_wait_group %{{.*}} {pendings = 0 : i32}
   tt.func @dot_reg_operand_A_fp8(%a: tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %b: !ttg.memdesc<128x256xf8E5M2, #shared, #smem>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma1>
     %m = ttng.warp_group_dot %a, %b, %cst {maxNumImpreciseAcc = 1073741824 : i32, inputPrecision = 0 : i32} :
@@ -606,13 +606,13 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-warps" = 4 : i32} {

 // CHECK-LABEL: @warpgroup_dot_wait_1_input
 tt.func @warpgroup_dot_wait_1_input(%arg0: tensor<128xf32, #blocked>) {
-  // CHECK: nvgpu.wgmma_wait_group
+  // CHECK: nvg.wgmma_wait_group
   ttng.warp_group_dot_wait %arg0 {pendings = 0 : i32} : tensor<128xf32, #blocked>
   tt.return
 }

 tt.func @warpgroup_dot_wait_2_inputs(%arg0: tensor<128xf32, #blocked>, %arg1: tensor<128xf32, #blocked>) {
-  // CHECK: nvgpu.wgmma_wait_group
+  // CHECK: nvg.wgmma_wait_group
   ttng.warp_group_dot_wait %arg0, %arg1 {pendings = 0 : i32} : tensor<128xf32, #blocked>, tensor<128xf32, #blocked>
   tt.return
 }