
Commit ff5c1e7

[NVIDIA] rename nvgpu dialect to nvg (#8666)
Fixes: triton-lang/triton#8348

cc @wsmoses @jeffniu-openai @ThomasRaoux

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [x] This PR does not need a test because: it renames an existing dialect and updates the relevant uses of the dialect in lit tests.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent a0fbcbf commit ff5c1e7
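The practical effect of the rename is the textual prefix used by the dialect's ops in IR and in FileCheck patterns; the C++ namespace is untouched (see the NVGPUDialect.td hunk below). A minimal before/after sketch, using illustrative IR fragments rather than lines copied from the test suite:

```mlir
// Before this commit, ops of the dialect printed with the "nvgpu" prefix:
//   %wid  = nvgpu.warp_id
//   %tmem = nvgpu.tensor_memory_base
// After the rename, the same ops print with the "nvg" prefix:
%wid  = nvg.warp_id
%tmem = nvg.tensor_memory_base
```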

7 files changed (+46, -46 lines changed)


test/Conversion/atomic_ldst.mlir

Lines changed: 3 additions & 3 deletions
@@ -10,17 +10,17 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 %1 = arith.muli %0, %c128_i32 : i32
 %2 = arith.cmpi slt, %1, %c512_i32 : i32

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, gpu
+// CHECK-TTG2NVGPU: nvg.ld_acquire acquire, gpu
 // CHECK-NVGPU2LLVM: ld.global.gpu.acquire.b32
 %3 = tt.atomic_rmw fadd, acquire, gpu, %arg0, %cst, %2 : (!tt.ptr<f32>, f32, i1) -> f32
 tt.store %arg0, %3 : !tt.ptr<f32>

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, cta
+// CHECK-TTG2NVGPU: nvg.ld_acquire acquire, cta
 // CHECK-NVGPU2LLVM: ld.global.cta.acquire.b32
 %4 = tt.atomic_rmw fadd, acquire, cta, %arg0, %cst, %true : (!tt.ptr<f32>, f32, i1) -> f32
 tt.store %arg0, %4 : !tt.ptr<f32>

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, sys
+// CHECK-TTG2NVGPU: nvg.ld_acquire acquire, sys
 // CHECK-NVGPU2LLVM: ld.global.sys.acquire.b32
 %5 = tt.atomic_rmw fadd, acquire, sys, %arg0, %cst, %2 : (!tt.ptr<f32>, f32, i1) -> f32
 tt.store %arg0, %5 : !tt.ptr<f32>

test/Conversion/nvgpu_to_llvm.mlir

Lines changed: 12 additions & 12 deletions
@@ -8,7 +8,7 @@ llvm.func @cluster_id() -> i32 {
 // CHECK-NOT: nvvm.read.ptx.sreg.cluster.ctaid.z
 // CHECK-NOT: nvvm.read.ptx.sreg.cluster.nctaid.x
 // CHECK-NOT: nvvm.read.ptx.sreg.cluster.nctaid.y
-%id = nvgpu.cluster_id
+%id = nvg.cluster_id
 llvm.return %id : i32
 }

@@ -40,7 +40,7 @@ llvm.func @cluster_id() -> i32 {
 llvm.func @wgmma(%desc: i64, %in: !struct_64xf32) {
 // CHECK: wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2
 %false = llvm.mlir.constant(false) : i1
-%acc0 = nvgpu.wgmma %desc, %desc, %false {
+%acc0 = nvg.wgmma %desc, %desc, %false {
 eltTypeA = 3 : i32,
 eltTypeB = 3 : i32,
 eltTypeC = 7 : i32,
@@ -53,7 +53,7 @@ llvm.func @wgmma(%desc: i64, %in: !struct_64xf32) {

 // CHECK: // wait for regs: $0,$1,$2,{{.*}},$127
 // CHECK: wgmma.wait_group.sync.aligned 0;
-%out = nvgpu.wgmma_wait_group %in {pendings = 0 : i32} : !struct_64xf32
+%out = nvg.wgmma_wait_group %in {pendings = 0 : i32} : !struct_64xf32
 llvm.return
 }

@@ -66,7 +66,7 @@ llvm.func @wgmma_wait(%in: !struct) {
 // CHECK: // wait for regs: $0,$1,$2,$3,$4,$5
 // CHECK: wgmma.wait_group.sync.aligned 0;
 // CHECK: "=f,=f,=r,=r,=h,=h,0,1,2,3,4,5"
-%out = nvgpu.wgmma_wait_group %in {pendings = 0 : i32} : !struct
+%out = nvg.wgmma_wait_group %in {pendings = 0 : i32} : !struct
 llvm.return
 }

@@ -87,7 +87,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "@$0 tcgen05.dealloc.cta_group::1.sync.aligned.b32 $1, 128;", "b,r" %[[PRED]], %{{.+}} : (i1, !llvm.ptr<6>) -> !llvm.void
 llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
 llvm.func @tensor_memory_base_lowering() -> i32 attributes {nvvm.kernel = 1 : ui1, nvvm.maxntid = array<i32: 128>} {
-%263 = nvgpu.tensor_memory_base
+%263 = nvg.tensor_memory_base
 %264 = llvm.ptrtoint %263 : !llvm.ptr<6> to i32
 llvm.return %264 : i32
 }
@@ -109,7 +109,7 @@ llvm.func @tensor_memory_base_warpgroup() attributes {nvvm.kernel = 1 : ui1, nvv
 }
 // CHECK: partition0
 partition0() num_warps(1) {
-%0 = nvgpu.tensor_memory_base
+%0 = nvg.tensor_memory_base
 // CHECK-NEXT: "use"(%arg0)
 "use"(%0) : (!llvm.ptr<6>) -> ()
 ttg.warp_return
@@ -129,7 +129,7 @@ llvm.func @warpid_warp_specialize() {
 // CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
 // CHECK: [[ID:%.*]] = llvm.udiv [[TIDX]], [[C32]]
 // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
 // CHECK: "use"([[UNIFORM]])
 "use"(%0) : (i32) -> ()

@@ -140,7 +140,7 @@ llvm.func @warpid_warp_specialize() {
 // CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
 // CHECK: [[ID:%.*]] = llvm.udiv [[TIDX]], [[C32]]
 // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
 // CHECK: "use"([[UNIFORM]])
 "use"(%1) : (i32) -> ()
 ttg.warp_yield
@@ -155,7 +155,7 @@ llvm.func @warpid_warp_specialize() {
 // CHECK: [[REL_TIDX:%.*]] = llvm.sub [[TIDX]], [[C192]]
 // CHECK: [[ID:%.*]] = llvm.udiv [[REL_TIDX]], [[C32]]
 // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
 // CHECK: "use"([[UNIFORM]])
 "use"(%1) : (i32) -> ()
 ttg.warp_return
@@ -169,7 +169,7 @@ llvm.func @warpid_warp_specialize() {
 // CHECK: [[REL_TIDX:%.*]] = llvm.sub [[TIDX]], [[C128]]
 // CHECK: [[ID:%.*]] = llvm.udiv [[REL_TIDX]], [[C32]]
 // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
 // CHECK: "use"([[UNIFORM]])
 "use"(%1) : (i32) -> ()
 ttg.warp_return
@@ -186,7 +186,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32}
 // CHECK-LABEL: @one_warp
 tt.func @one_warp() -> i32 {
 // CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
 // CHECK-NEXT: return [[C0]]
 tt.return %0 : i32
 }
@@ -206,7 +206,7 @@ tt.func @one_contextual_warp() {
 // CHECK: partition0
 partition0() num_warps(1) {
 // CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
 // CHECK-NEXT: "use"([[C0]])
 "use"(%0) : (i32) -> ()
 ttg.warp_return

test/Conversion/tritongpu_to_llvm_blackwell.mlir

Lines changed: 6 additions & 6 deletions
@@ -7,7 +7,7 @@
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 // CHECK-LABEL: @tc_gen5_mma
-// CHECK: %[[WID:.+]] = nvgpu.warp_id
+// CHECK: %[[WID:.+]] = nvg.warp_id
 // CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK: %[[P0:.+]] = llvm.icmp "eq" %[[WID]], %[[C0]] : i32
 // CHECK: %[[P1:.+]] = llvm.and %{{.*}}, %[[P0]] : i1
@@ -105,7 +105,7 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 8 : i32} {
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
 // CHECK-LABEL: @tensor_memory_ld
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
 // CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
 // CHECK: nvvm.tcgen05.wait <store>
 // CHECK: tcgen05.ld.sync.aligned.32x32b.x128.b32
@@ -154,7 +154,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 #tmem = #ttng.tensor_memory_encoding<blockM = 64, blockN = 128, colStride = 1>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
 // CHECK-LABEL: @tensor_memory_ld_m64
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
 // CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
 // CHECK: nvvm.tcgen05.wait <store>
 // CHECK: tcgen05.ld.sync.aligned.32x32b.x128.b32
@@ -174,7 +174,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 2>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
 // CHECK-LABEL: @tensor_memory_unpack_f16
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
 // CHECK: tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32
 // CHECK: nvvm.tcgen05.wait <store>
 // CHECK: tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32
@@ -197,7 +197,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: @tc_gen5_mma_block_scale
 // CHECK: %[[TMEM_BASE:.+]] = llvm.ptrtoint %arg2 : !llvm.ptr<3> to i32
-// CHECK: %[[WID:.+]] = nvgpu.warp_id
+// CHECK: %[[WID:.+]] = nvg.warp_id
 // CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK: %[[P0:.+]] = llvm.icmp "eq" %[[WID]], %[[C0]] : i32
 // CHECK: %[[P1:.+]] = llvm.and %{{.*}}, %[[P0]] : i1
@@ -865,7 +865,7 @@ tt.func private @load_store_16x32bx1_broadcast(%arg0: !ttg.memdesc<16x8xi8, #tme
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
 // CHECK-LABEL: @tensor_memory_st
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
 // CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
 // CHECK: nvvm.tcgen05.wait <store>
 tt.func public @tensor_memory_st(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>) {

test/Conversion/tritongpu_to_llvm_hopper.mlir

Lines changed: 21 additions & 21 deletions
@@ -19,13 +19,13 @@ module attributes {"ttg.num-ctas" = 4 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 // CHECK-LABEL: @dot_high_precision_acc
 tt.func @dot_high_precision_acc(%a: !ttg.memdesc<128x128xf8E5M2, #shared, #smem>, %b: !ttg.memdesc<128x256xf8E5M2, #shared1, #smem>, %c: tensor<128x256xf32, #mma>) {
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
 %m = ttng.warp_group_dot %a, %b, %c
 {maxNumImpreciseAcc = 32 : i32, inputPrecision = 0 : i32} :
@@ -43,13 +43,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 // CHECK-LABEL: @dot_low_precision_acc
 tt.func @dot_low_precision_acc(%a: !ttg.memdesc<128x128xf8E5M2, #shared, #smem>, %b: !ttg.memdesc<128x256xf8E5M2, #shared1, #smem>, %c: tensor<128x256xf32, #mma>) {
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
 // CHECK: llvm.return
 %m = ttng.warp_group_dot %a, %b, %c
@@ -68,13 +68,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 // CHECK-LABEL: @dot_mix_precision_acc
 tt.func @dot_mix_precision_acc(%a: !ttg.memdesc<128x128xf8E5M2, #shared, #smem>, %b: !ttg.memdesc<128x256xf8E5M2, #shared1, #smem>, %c: tensor<128x256xf32, #mma>) {
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
 // CHECK: llvm.return
 %m = ttng.warp_group_dot %a, %b, %c
@@ -97,7 +97,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.tar
 %acc: tensor<256x512xf32, #mma>) {
 %res = ttng.warp_group_dot %a, %b, %acc {inputPrecision = 0 : i32, isAsync = true} :
 !ttg.memdesc<256x128xbf16, #shared, #smem> * !ttg.memdesc<128x512xbf16, #shared, #smem> -> tensor<256x512xf32, #mma>
-// CHECK: nvgpu.wgmma {{.*}} k = 16 : i32, layoutA = 1 : i32, layoutB = 1 : i32, m = 64 : i32, n = 256 : i32}
+// CHECK: nvg.wgmma {{.*}} k = 16 : i32, layoutA = 1 : i32, layoutB = 1 : i32, m = 64 : i32, n = 256 : i32}
 tt.return
 }
 }
@@ -111,7 +111,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.tar
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: @dot_zero_acc
 // Generate a wgmma with 2 sources.
-// CHECK: nvgpu.wgmma %{{.*}}, %{{.*}} {
+// CHECK: nvg.wgmma %{{.*}}, %{{.*}} {
 tt.func @dot_zero_acc(%a: !ttg.memdesc<128x64xf16, #shared, #smem>, %b: !ttg.memdesc<64x64xf16, #shared1, #smem>) {
 %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
 %m = ttng.warp_group_dot %a, %b, %cst {inputPrecision = 0 : i32, maxNumImpreciseAcc = 0 : i32} :
@@ -120,7 +120,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 }

 // CHECK-LABEL: @wgmma_on_subtile
-// CHECK: nvgpu.wgmma %{{.*}}, %{{.*}}
+// CHECK: nvg.wgmma %{{.*}}, %{{.*}}
 tt.func @wgmma_on_subtile(%a: tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %b: !ttg.memdesc<16x256xf16, #shared1, #smem, mutable, 3x64x256>){
 %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma>
 %m = ttng.warp_group_dot %a, %b, %cst {inputPrecision = 0 : i32, isAsync = true} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<16x256xf16, #shared1, #smem, mutable, 3x64x256> -> tensor<128x256xf32, #mma>
@@ -136,8 +136,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: @dot_reg_operand_A
 // Generate a wgmma where the first operand is a struct.
-// CHECK: nvgpu.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
-// CHECK: nvgpu.wgmma_wait_group %{{.*}} {pendings = 0 : i32} : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+// CHECK: nvg.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+// CHECK: nvg.wgmma_wait_group %{{.*}} {pendings = 0 : i32} : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
 tt.func @dot_reg_operand_A(%a: tensor<128x64xf16, #mma>, %b: !ttg.memdesc<64x64xf16, #shared, #smem>) {
 %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
 %opA = ttg.convert_layout %a : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
@@ -156,8 +156,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 // CHECK-LABEL: @dot_reg_operand_A_fp8
 // Generate a wgmma where the first operand is a struct.
-// CHECK: nvgpu.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
-// CHECK: nvgpu.wgmma_wait_group %{{.*}} {pendings = 0 : i32}
+// CHECK: nvg.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+// CHECK: nvg.wgmma_wait_group %{{.*}} {pendings = 0 : i32}
 tt.func @dot_reg_operand_A_fp8(%a: tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %b: !ttg.memdesc<128x256xf8E5M2, #shared, #smem>) {
 %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma1>
 %m = ttng.warp_group_dot %a, %b, %cst { maxNumImpreciseAcc = 1073741824 : i32, inputPrecision = 0 : i32 } :
@@ -606,13 +606,13 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-warps" = 4 : i32} {

 // CHECK-LABEL: @warpgroup_dot_wait_1_input
 tt.func @warpgroup_dot_wait_1_input(%arg0: tensor<128xf32, #blocked>) {
-// CHECK: nvgpu.wgmma_wait_group
+// CHECK: nvg.wgmma_wait_group
 ttng.warp_group_dot_wait %arg0 {pendings = 0 : i32} : tensor<128xf32, #blocked>
 tt.return
 }

 tt.func @warpgroup_dot_wait_2_inputs(%arg0: tensor<128xf32, #blocked>, %arg1: tensor<128xf32, #blocked>) {
-// CHECK: nvgpu.wgmma_wait_group
+// CHECK: nvg.wgmma_wait_group
 ttng.warp_group_dot_wait %arg0, %arg1 {pendings = 0 : i32} : tensor<128xf32, #blocked>, tensor<128xf32, #blocked>
 tt.return
 }

test/Conversion/tritoninstrument_to_llvm.mlir

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@

 module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:90"} {
 // CHECK-LABEL: @experimental_buffer_pointers_tmem
-// CHECK:nvgpu.tensor_memory_base
+// CHECK:nvg.tensor_memory_base
 tt.func private @experimental_buffer_pointers_tmem() {
 tti.experimental_buffer_pointers [0, 42], tensor_mem : tensor<2xi64, #blocked>
 tt.return

third_party/nvidia/include/Dialect/NVGPU/IR/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})

 set(LLVM_TARGET_DEFINITIONS NVGPUOps.td)
-mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvgpu)
-mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvgpu)
+mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvg)
+mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvg)
 mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions)
 mlir_tablegen(Ops.h.inc -gen-op-decls)
 mlir_tablegen(Ops.cpp.inc -gen-op-defs)

third_party/nvidia/include/Dialect/NVGPU/IR/NVGPUDialect.td

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 include "mlir/IR/OpBase.td"

 def NVGPU_Dialect : Dialect {
-let name = "nvgpu";
+let name = "nvg";
 let cppNamespace = "::mlir::triton::nvgpu";

 let description = [{
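
Only the dialect's `name` changes here, which is what determines the `nvg.` prefix in textual IR; `cppNamespace` stays `::mlir::triton::nvgpu`, so existing C++ code does not need to be touched. A sketch of the resulting TableGen definition, abridged to the two relevant fields (the description and the rest of the dialect body are assumed unchanged):

```tablegen
// Abridged sketch of the dialect definition after this commit.
def NVGPU_Dialect : Dialect {
  let name = "nvg";                           // textual IR prefix: nvg.<op-name>
  let cppNamespace = "::mlir::triton::nvgpu"; // C++ namespace is unchanged
}
```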

0 commit comments
