
Commit ff5c1e7

[NVIDIA] rename nvgpu dialect to nvg (#8666)
Fixes: triton-lang/triton#8348

cc @wsmoses @jeffniu-openai @ThomasRaoux

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [x] This PR does not need a test because: it renames an existing dialect and updates the relevant uses of the dialect in lit tests.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent a0fbcbf commit ff5c1e7
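The practical effect of the rename is the textual prefix used by the dialect's ops in IR and in FileCheck patterns; the C++ namespace is untouched (see the NVGPUDialect.td hunk below). A minimal before/after sketch, using illustrative IR fragments rather than lines copied from the test suite:

```mlir
// Before this commit, ops of the dialect printed with the "nvgpu" prefix:
//   %wid  = nvgpu.warp_id
//   %tmem = nvgpu.tensor_memory_base
// After the rename, the same ops print with the "nvg" prefix:
%wid  = nvg.warp_id
%tmem = nvg.tensor_memory_base
```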

7 files changed (+46, -46 lines changed)


test/Conversion/atomic_ldst.mlir

Lines changed: 3 additions & 3 deletions
@@ -10,17 +10,17 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 %1 = arith.muli %0, %c128_i32 : i32
 %2 = arith.cmpi slt, %1, %c512_i32 : i32

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, gpu
+// CHECK-TTG2NVGPU: nvg.ld_acquire acquire, gpu
 // CHECK-NVGPU2LLVM: ld.global.gpu.acquire.b32
 %3 = tt.atomic_rmw fadd, acquire, gpu, %arg0, %cst, %2 : (!tt.ptr<f32>, f32, i1) -> f32
 tt.store %arg0, %3 : !tt.ptr<f32>

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, cta
+// CHECK-TTG2NVGPU: nvg.ld_acquire acquire, cta
 // CHECK-NVGPU2LLVM: ld.global.cta.acquire.b32
 %4 = tt.atomic_rmw fadd, acquire, cta, %arg0, %cst, %true : (!tt.ptr<f32>, f32, i1) -> f32
 tt.store %arg0, %4 : !tt.ptr<f32>

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, sys
+// CHECK-TTG2NVGPU: nvg.ld_acquire acquire, sys
 // CHECK-NVGPU2LLVM: ld.global.sys.acquire.b32
 %5 = tt.atomic_rmw fadd, acquire, sys, %arg0, %cst, %2 : (!tt.ptr<f32>, f32, i1) -> f32
 tt.store %arg0, %5 : !tt.ptr<f32>

test/Conversion/nvgpu_to_llvm.mlir

Lines changed: 12 additions & 12 deletions
@@ -8,7 +8,7 @@ llvm.func @cluster_id() -> i32 {
 // CHECK-NOT: nvvm.read.ptx.sreg.cluster.ctaid.z
 // CHECK-NOT: nvvm.read.ptx.sreg.cluster.nctaid.x
 // CHECK-NOT: nvvm.read.ptx.sreg.cluster.nctaid.y
-%id = nvgpu.cluster_id
+%id = nvg.cluster_id
 llvm.return %id : i32
 }

@@ -40,7 +40,7 @@ llvm.func @cluster_id() -> i32 {
 llvm.func @wgmma(%desc: i64, %in: !struct_64xf32) {
 // CHECK: wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2
 %false = llvm.mlir.constant(false) : i1
-%acc0 = nvgpu.wgmma %desc, %desc, %false {
+%acc0 = nvg.wgmma %desc, %desc, %false {
 eltTypeA = 3 : i32,
 eltTypeB = 3 : i32,
 eltTypeC = 7 : i32,
@@ -53,7 +53,7 @@ llvm.func @wgmma(%desc: i64, %in: !struct_64xf32) {

 // CHECK: // wait for regs: $0,$1,$2,{{.*}},$127
 // CHECK: wgmma.wait_group.sync.aligned 0;
-%out = nvgpu.wgmma_wait_group %in {pendings = 0 : i32} : !struct_64xf32
+%out = nvg.wgmma_wait_group %in {pendings = 0 : i32} : !struct_64xf32
 llvm.return
 }

@@ -66,7 +66,7 @@ llvm.func @wgmma_wait(%in: !struct) {
 // CHECK: // wait for regs: $0,$1,$2,$3,$4,$5
 // CHECK: wgmma.wait_group.sync.aligned 0;
 // CHECK: "=f,=f,=r,=r,=h,=h,0,1,2,3,4,5"
-%out = nvgpu.wgmma_wait_group %in {pendings = 0 : i32} : !struct
+%out = nvg.wgmma_wait_group %in {pendings = 0 : i32} : !struct
 llvm.return
 }

@@ -87,7 +87,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "@$0 tcgen05.dealloc.cta_group::1.sync.aligned.b32 $1, 128;", "b,r" %[[PRED]], %{{.+}} : (i1, !llvm.ptr<6>) -> !llvm.void
 llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
 llvm.func @tensor_memory_base_lowering() -> i32 attributes {nvvm.kernel = 1 : ui1, nvvm.maxntid = array<i32: 128>} {
-%263 = nvgpu.tensor_memory_base
+%263 = nvg.tensor_memory_base
 %264 = llvm.ptrtoint %263 : !llvm.ptr<6> to i32
 llvm.return %264 : i32
 }
@@ -109,7 +109,7 @@ llvm.func @tensor_memory_base_warpgroup() attributes {nvvm.kernel = 1 : ui1, nvv
 }
 // CHECK: partition0
 partition0() num_warps(1) {
-%0 = nvgpu.tensor_memory_base
+%0 = nvg.tensor_memory_base
 // CHECK-NEXT: "use"(%arg0)
 "use"(%0) : (!llvm.ptr<6>) -> ()
 ttg.warp_return
@@ -129,7 +129,7 @@ llvm.func @warpid_warp_specialize() {
 // CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
 // CHECK: [[ID:%.*]] = llvm.udiv [[TIDX]], [[C32]]
 // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
 // CHECK: "use"([[UNIFORM]])
 "use"(%0) : (i32) -> ()

@@ -140,7 +140,7 @@ llvm.func @warpid_warp_specialize() {
 // CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
 // CHECK: [[ID:%.*]] = llvm.udiv [[TIDX]], [[C32]]
 // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
 // CHECK: "use"([[UNIFORM]])
 "use"(%1) : (i32) -> ()
 ttg.warp_yield
@@ -155,7 +155,7 @@ llvm.func @warpid_warp_specialize() {
 // CHECK: [[REL_TIDX:%.*]] = llvm.sub [[TIDX]], [[C192]]
 // CHECK: [[ID:%.*]] = llvm.udiv [[REL_TIDX]], [[C32]]
 // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
 // CHECK: "use"([[UNIFORM]])
 "use"(%1) : (i32) -> ()
 ttg.warp_return
@@ -169,7 +169,7 @@ llvm.func @warpid_warp_specialize() {
 // CHECK: [[REL_TIDX:%.*]] = llvm.sub [[TIDX]], [[C128]]
 // CHECK: [[ID:%.*]] = llvm.udiv [[REL_TIDX]], [[C32]]
 // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
 // CHECK: "use"([[UNIFORM]])
 "use"(%1) : (i32) -> ()
 ttg.warp_return
@@ -186,7 +186,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32}
 // CHECK-LABEL: @one_warp
 tt.func @one_warp() -> i32 {
 // CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
 // CHECK-NEXT: return [[C0]]
 tt.return %0 : i32
 }
@@ -206,7 +206,7 @@ tt.func @one_contextual_warp() {
 // CHECK: partition0
 partition0() num_warps(1) {
 // CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
 // CHECK-NEXT: "use"([[C0]])
 "use"(%0) : (i32) -> ()
 ttg.warp_return

test/Conversion/tritongpu_to_llvm_blackwell.mlir

Lines changed: 6 additions & 6 deletions
@@ -7,7 +7,7 @@
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 // CHECK-LABEL: @tc_gen5_mma
-// CHECK: %[[WID:.+]] = nvgpu.warp_id
+// CHECK: %[[WID:.+]] = nvg.warp_id
 // CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK: %[[P0:.+]] = llvm.icmp "eq" %[[WID]], %[[C0]] : i32
 // CHECK: %[[P1:.+]] = llvm.and %{{.*}}, %[[P0]] : i1
@@ -105,7 +105,7 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 8 : i32} {
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
 // CHECK-LABEL: @tensor_memory_ld
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
 // CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
 // CHECK: nvvm.tcgen05.wait <store>
 // CHECK: tcgen05.ld.sync.aligned.32x32b.x128.b32
@@ -154,7 +154,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 #tmem = #ttng.tensor_memory_encoding<blockM = 64, blockN = 128, colStride = 1>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
 // CHECK-LABEL: @tensor_memory_ld_m64
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
 // CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
 // CHECK: nvvm.tcgen05.wait <store>
 // CHECK: tcgen05.ld.sync.aligned.32x32b.x128.b32
@@ -174,7 +174,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 2>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
 // CHECK-LABEL: @tensor_memory_unpack_f16
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
 // CHECK: tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32
 // CHECK: nvvm.tcgen05.wait <store>
 // CHECK: tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32
@@ -197,7 +197,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: @tc_gen5_mma_block_scale
 // CHECK: %[[TMEM_BASE:.+]] = llvm.ptrtoint %arg2 : !llvm.ptr<3> to i32
-// CHECK: %[[WID:.+]] = nvgpu.warp_id
+// CHECK: %[[WID:.+]] = nvg.warp_id
 // CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK: %[[P0:.+]] = llvm.icmp "eq" %[[WID]], %[[C0]] : i32
 // CHECK: %[[P1:.+]] = llvm.and %{{.*}}, %[[P0]] : i1
@@ -865,7 +865,7 @@ tt.func private @load_store_16x32bx1_broadcast(%arg0: !ttg.memdesc<16x8xi8, #tme
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
 // CHECK-LABEL: @tensor_memory_st
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
 // CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
 // CHECK: nvvm.tcgen05.wait <store>
 tt.func public @tensor_memory_st(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>) {

test/Conversion/tritongpu_to_llvm_hopper.mlir

Lines changed: 21 additions & 21 deletions
@@ -19,13 +19,13 @@ module attributes {"ttg.num-ctas" = 4 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 // CHECK-LABEL: @dot_high_precision_acc
 tt.func @dot_high_precision_acc(%a: !ttg.memdesc<128x128xf8E5M2, #shared, #smem>, %b: !ttg.memdesc<128x256xf8E5M2, #shared1, #smem>, %c: tensor<128x256xf32, #mma>) {
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
 %m = ttng.warp_group_dot %a, %b, %c
 {maxNumImpreciseAcc = 32 : i32, inputPrecision = 0 : i32} :
@@ -43,13 +43,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 // CHECK-LABEL: @dot_low_precision_acc
 tt.func @dot_low_precision_acc(%a: !ttg.memdesc<128x128xf8E5M2, #shared, #smem>, %b: !ttg.memdesc<128x256xf8E5M2, #shared1, #smem>, %c: tensor<128x256xf32, #mma>) {
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
 // CHECK: llvm.return
 %m = ttng.warp_group_dot %a, %b, %c
@@ -68,13 +68,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 // CHECK-LABEL: @dot_mix_precision_acc
 tt.func @dot_mix_precision_acc(%a: !ttg.memdesc<128x128xf8E5M2, #shared, #smem>, %b: !ttg.memdesc<128x256xf8E5M2, #shared1, #smem>, %c: tensor<128x256xf32, #mma>) {
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-NOT: llvm.fadd
-// CHECK: nvgpu.wgmma
+// CHECK: nvg.wgmma
 // CHECK-COUNT-128: llvm.fadd
 // CHECK: llvm.return
 %m = ttng.warp_group_dot %a, %b, %c
@@ -97,7 +97,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.tar
 %acc: tensor<256x512xf32, #mma>) {
 %res = ttng.warp_group_dot %a, %b, %acc {inputPrecision = 0 : i32, isAsync = true} :
 !ttg.memdesc<256x128xbf16, #shared, #smem> * !ttg.memdesc<128x512xbf16, #shared, #smem> -> tensor<256x512xf32, #mma>
-// CHECK: nvgpu.wgmma {{.*}} k = 16 : i32, layoutA = 1 : i32, layoutB = 1 : i32, m = 64 : i32, n = 256 : i32}
+// CHECK: nvg.wgmma {{.*}} k = 16 : i32, layoutA = 1 : i32, layoutB = 1 : i32, m = 64 : i32, n = 256 : i32}
 tt.return
 }
 }
@@ -111,7 +111,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.tar
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: @dot_zero_acc
 // Generate a wgmma with 2 sources.
-// CHECK: nvgpu.wgmma %{{.*}}, %{{.*}} {
+// CHECK: nvg.wgmma %{{.*}}, %{{.*}} {
 tt.func @dot_zero_acc(%a: !ttg.memdesc<128x64xf16, #shared, #smem>, %b: !ttg.memdesc<64x64xf16, #shared1, #smem>) {
 %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
 %m = ttng.warp_group_dot %a, %b, %cst {inputPrecision = 0 : i32, maxNumImpreciseAcc = 0 : i32} :
@@ -120,7 +120,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 }

 // CHECK-LABEL: @wgmma_on_subtile
-// CHECK: nvgpu.wgmma %{{.*}}, %{{.*}}
+// CHECK: nvg.wgmma %{{.*}}, %{{.*}}
 tt.func @wgmma_on_subtile(%a: tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %b: !ttg.memdesc<16x256xf16, #shared1, #smem, mutable, 3x64x256>){
 %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma>
 %m = ttng.warp_group_dot %a, %b, %cst {inputPrecision = 0 : i32, isAsync = true} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<16x256xf16, #shared1, #smem, mutable, 3x64x256> -> tensor<128x256xf32, #mma>
@@ -136,8 +136,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: @dot_reg_operand_A
 // Generate a wgmma where the first operand is a struct.
-// CHECK: nvgpu.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
-// CHECK: nvgpu.wgmma_wait_group %{{.*}} {pendings = 0 : i32} : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+// CHECK: nvg.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+// CHECK: nvg.wgmma_wait_group %{{.*}} {pendings = 0 : i32} : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
 tt.func @dot_reg_operand_A(%a: tensor<128x64xf16, #mma>, %b: !ttg.memdesc<64x64xf16, #shared, #smem>) {
 %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
 %opA = ttg.convert_layout %a : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
@@ -156,8 +156,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 // CHECK-LABEL: @dot_reg_operand_A_fp8
 // Generate a wgmma where the first operand is a struct.
-// CHECK: nvgpu.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
-// CHECK: nvgpu.wgmma_wait_group %{{.*}} {pendings = 0 : i32}
+// CHECK: nvg.wgmma {{.*}} : (!llvm.struct<(i32, i32, i32, i32)>, i64, i1) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+// CHECK: nvg.wgmma_wait_group %{{.*}} {pendings = 0 : i32}
 tt.func @dot_reg_operand_A_fp8(%a: tensor<128x128xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %b: !ttg.memdesc<128x256xf8E5M2, #shared, #smem>) {
 %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma1>
 %m = ttng.warp_group_dot %a, %b, %cst { maxNumImpreciseAcc = 1073741824 : i32, inputPrecision = 0 : i32 } :
@@ -606,13 +606,13 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-warps" = 4 : i32} {

 // CHECK-LABEL: @warpgroup_dot_wait_1_input
 tt.func @warpgroup_dot_wait_1_input(%arg0: tensor<128xf32, #blocked>) {
-// CHECK: nvgpu.wgmma_wait_group
+// CHECK: nvg.wgmma_wait_group
 ttng.warp_group_dot_wait %arg0 {pendings = 0 : i32} : tensor<128xf32, #blocked>
 tt.return
 }

 tt.func @warpgroup_dot_wait_2_inputs(%arg0: tensor<128xf32, #blocked>, %arg1: tensor<128xf32, #blocked>) {
-// CHECK: nvgpu.wgmma_wait_group
+// CHECK: nvg.wgmma_wait_group
 ttng.warp_group_dot_wait %arg0, %arg1 {pendings = 0 : i32} : tensor<128xf32, #blocked>, tensor<128xf32, #blocked>
 tt.return
 }

test/Conversion/tritoninstrument_to_llvm.mlir

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@

 module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:90"} {
 // CHECK-LABEL: @experimental_buffer_pointers_tmem
-// CHECK:nvgpu.tensor_memory_base
+// CHECK:nvg.tensor_memory_base
 tt.func private @experimental_buffer_pointers_tmem() {
 tti.experimental_buffer_pointers [0, 42], tensor_mem : tensor<2xi64, #blocked>
 tt.return

third_party/nvidia/include/Dialect/NVGPU/IR/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})

 set(LLVM_TARGET_DEFINITIONS NVGPUOps.td)
-mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvgpu)
-mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvgpu)
+mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=nvg)
+mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=nvg)
 mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions)
 mlir_tablegen(Ops.h.inc -gen-op-decls)
 mlir_tablegen(Ops.cpp.inc -gen-op-defs)

third_party/nvidia/include/Dialect/NVGPU/IR/NVGPUDialect.td

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 include "mlir/IR/OpBase.td"

 def NVGPU_Dialect : Dialect {
-let name = "nvgpu";
+let name = "nvg";
 let cppNamespace = "::mlir::triton::nvgpu";

 let description = [{
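
Only the dialect's `name` changes here, which is what determines the `nvg.` prefix in textual IR; `cppNamespace` stays `::mlir::triton::nvgpu`, so existing C++ code does not need to be touched. A sketch of the resulting TableGen definition, abridged to the two relevant fields (the description and the rest of the dialect body are assumed unchanged):

```tablegen
// Abridged sketch of the dialect definition after this commit.
def NVGPU_Dialect : Dialect {
  let name = "nvg";                           // textual IR prefix: nvg.<op-name>
  let cppNamespace = "::mlir::triton::nvgpu"; // C++ namespace is unchanged
}
```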

0 commit comments
