10 changes: 5 additions & 5 deletions bin/RegisterTritonDialects.h
@@ -1,7 +1,7 @@
#pragma once
#include "amd/include/Dialect/TritonAMDGPU/IR/Dialect.h"
#include "amd/include/TritonAMDGPUTransforms/Passes.h"
#include "nvidia/include/Dialect/NVGPU/IR/Dialect.h"
#include "nvidia/include/Dialect/NVG/IR/Dialect.h"
#include "nvidia/include/Dialect/NVWS/IR/Dialect.h"
#include "proton/Dialect/include/Conversion/ProtonGPUToLLVM/Passes.h"
#include "proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/Passes.h"
@@ -28,7 +28,7 @@

#include "nvidia/hopper/include/Transforms/Passes.h"
#include "nvidia/include/Dialect/NVWS/Transforms/Passes.h"
#include "nvidia/include/NVGPUToLLVM/Passes.h"
#include "nvidia/include/NVGToLLVM/Passes.h"
#include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h"
#include "triton/Conversion/TritonGPUToLLVM/Passes.h"
#include "triton/Conversion/TritonToTritonGPU/Passes.h"
@@ -83,7 +83,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::triton::gpu::registerTritonGPUGlobalScratchAllocationPass();
mlir::triton::registerConvertWarpSpecializeToLLVM();
mlir::triton::registerConvertTritonGPUToLLVMPass();
-mlir::triton::registerConvertNVGPUToLLVMPass();
+mlir::triton::registerConvertNVGToLLVMPass();
mlir::triton::registerAllocateSharedMemoryNvPass();
mlir::registerLLVMDIScope();
mlir::LLVM::registerInlinerInterface(registry);
@@ -123,7 +123,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
// NVWS passes
mlir::triton::registerNVWSTransformsPasses();

-// NVGPU transform passes
+// NVG transform passes
mlir::registerNVHopperTransformsPasses();

// Proton passes
@@ -143,7 +143,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::triton::instrument::TritonInstrumentDialect,
mlir::math::MathDialect, mlir::arith::ArithDialect, mlir::scf::SCFDialect,
mlir::gpu::GPUDialect, mlir::LLVM::LLVMDialect, mlir::NVVM::NVVMDialect,
-mlir::triton::nvgpu::NVGPUDialect, mlir::triton::nvws::NVWSDialect,
+mlir::triton::nvg::NVGDialect, mlir::triton::nvws::NVWSDialect,
mlir::triton::amdgpu::TritonAMDGPUDialect,
mlir::triton::proton::ProtonDialect,
mlir::triton::proton::gpu::ProtonGPUDialect, mlir::ROCDL::ROCDLDialect,
2 changes: 1 addition & 1 deletion lib/Conversion/TritonInstrumentToLLVM/CMakeLists.txt
@@ -8,5 +8,5 @@ add_triton_library(TritonInstrumentToLLVM
TritonGPUIR
TritonInstrumentIR
TritonNvidiaGPUIR
-NVGPUIR
+NVGIR
)
@@ -1,6 +1,6 @@
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "third_party/nvidia/include/Dialect/NVGPU/IR/Dialect.h"
#include "third_party/nvidia/include/Dialect/NVG/IR/Dialect.h"
#include "third_party/nvidia/include/TritonNVIDIAGPUToLLVM/PTXAsmFormat.h"
#include "third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/Utility.h"
#include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
@@ -313,7 +313,7 @@ struct BufferPointersOpConversion
assert(op.getMemType() == tti::MemType::TENSOR_MEM &&
"Unsupported memory type");
TritonLLVMOpBuilder b(loc, rewriter);
-base = rewriter.create<nvgpu::TensorMemoryBaseAddress>(loc);
+base = rewriter.create<nvg::TensorMemoryBaseAddress>(loc);
base = b.ptrtoint(i32_ty, base);
}
bufPointers = rewriter.create<arith::AddIOp>(
8 changes: 4 additions & 4 deletions python/test/unit/language/test_conversions.py
@@ -334,10 +334,10 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):

if is_cuda():
if src_dtype != 'float32' and torch.cuda.get_device_capability(0) < (9, 0):
pytest.skip("non-float32 downcast tests only supported on NVGPU with compute capability 9.0+")
pytest.skip("non-float32 downcast tests only supported on NVG with compute capability 9.0+")

if dst_dtype in ('float8e5', 'float8e4nv') and rounding == 'rtne' and torch.cuda.get_device_capability(0) < (9, 0):
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on NVGPU with compute capability 9.0+")
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on NVG with compute capability 9.0+")

if dst_dtype in ('float8e5b16', 'float8e4b8') and rounding == 'rtne':
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU CDNA3")
@@ -368,10 +368,10 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):
def test_typeconvert_downcast_clamping(src_dtype, dst_dtype, mode, device, rounding="rtne"):
if is_cuda():
if src_dtype != 'float32' and torch.cuda.get_device_capability(0) < (9, 0):
pytest.skip("non-float32 downcast tests only supported on NVGPU with compute capability 9.0+")
pytest.skip("non-float32 downcast tests only supported on NVG with compute capability 9.0+")

if dst_dtype in ('float8e5', 'float8e4nv') and rounding == 'rtne' and torch.cuda.get_device_capability(0) < (9, 0):
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on NVGPU with compute capability 9.0+")
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on NVG with compute capability 9.0+")

converter = {
tl.float8e4nv: torch.float8_e4m3fn,
4 changes: 2 additions & 2 deletions python/test/unit/language/test_core.py
@@ -121,9 +121,9 @@ def check_type_supported(dtype, device):
if device in ['cuda']:
cc = torch.cuda.get_device_capability()
if cc[0] < 8 and (dtype is tl.bfloat16 or dtype == "bfloat16" or dtype is torch.bfloat16):
pytest.skip("bfloat16 is only supported on NVGPU with cc >= 80")
pytest.skip("bfloat16 is only supported on NVG with cc >= 80")
if cc[0] < 9 and dtype in {tl.float8e4nv, "float8e4nv", "float8_e4m3fn"}:
pytest.skip("float8e4nv is only supported on NVGPU with cc >= 90")
pytest.skip("float8e4nv is only supported on NVG with cc >= 90")
if is_interpreter():
if dtype in [tl.bfloat16, "bfloat16", torch.bfloat16]:
pytest.skip("bfloat16 is not supported in the interpreter")
16 changes: 8 additions & 8 deletions test/Conversion/atomic_ldst.mlir
@@ -1,5 +1,5 @@
-// RUN: triton-opt %s --allocate-shared-memory-nv=compute-capability=90 --convert-triton-gpu-to-llvm=compute-capability=90 2>&1 | FileCheck %s --check-prefix=CHECK-TTG2NVGPU
-// RUN: triton-opt %s --allocate-shared-memory-nv=compute-capability=90 --convert-triton-gpu-to-llvm=compute-capability=90 --convert-nv-gpu-to-llvm 2>&1 | FileCheck %s --check-prefix=CHECK-NVGPU2LLVM
+// RUN: triton-opt %s --allocate-shared-memory-nv=compute-capability=90 --convert-triton-gpu-to-llvm=compute-capability=90 2>&1 | FileCheck %s --check-prefix=CHECK-TTG2NVG
+// RUN: triton-opt %s --allocate-shared-memory-nv=compute-capability=90 --convert-triton-gpu-to-llvm=compute-capability=90 --convert-nv-gpu-to-llvm 2>&1 | FileCheck %s --check-prefix=CHECK-NVG2LLVM
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
tt.func public @kernel_r(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
%cst = arith.constant 0.000000e+00 : f32
@@ -10,18 +10,18 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
%1 = arith.muli %0, %c128_i32 : i32
%2 = arith.cmpi slt, %1, %c512_i32 : i32

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, gpu
-// CHECK-NVGPU2LLVM: ld.global.gpu.acquire.b32
+// CHECK-TTG2NVG: nvg.ld_acquire acquire, gpu
+// CHECK-NVG2LLVM: ld.global.gpu.acquire.b32
%3 = tt.atomic_rmw fadd, acquire, gpu, %arg0, %cst, %2 : (!tt.ptr<f32>, f32, i1) -> f32
tt.store %arg0, %3 : !tt.ptr<f32>

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, cta
-// CHECK-NVGPU2LLVM: ld.global.cta.acquire.b32
+// CHECK-TTG2NVG: nvg.ld_acquire acquire, cta
+// CHECK-NVG2LLVM: ld.global.cta.acquire.b32
%4 = tt.atomic_rmw fadd, acquire, cta, %arg0, %cst, %true : (!tt.ptr<f32>, f32, i1) -> f32
tt.store %arg0, %4 : !tt.ptr<f32>

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, sys
-// CHECK-NVGPU2LLVM: ld.global.sys.acquire.b32
+// CHECK-TTG2NVG: nvg.ld_acquire acquire, sys
+// CHECK-NVG2LLVM: ld.global.sys.acquire.b32
%5 = tt.atomic_rmw fadd, acquire, sys, %arg0, %cst, %2 : (!tt.ptr<f32>, f32, i1) -> f32
tt.store %arg0, %5 : !tt.ptr<f32>
tt.return
22 changes: 11 additions & 11 deletions test/Conversion/nvgpu_to_llvm.mlir
@@ -7,7 +7,7 @@ llvm.func @cluster_id() -> i32 {
// CHECK: nvvm.read.ptx.sreg.cluster.ctaid.z
// CHECK: nvvm.read.ptx.sreg.cluster.nctaid.x
// CHECK: nvvm.read.ptx.sreg.cluster.nctaid.y
-%id = nvgpu.cluster_id
+%id = nvg.cluster_id
llvm.return %id : i32
}

@@ -39,7 +39,7 @@ llvm.func @cluster_id() -> i32 {
llvm.func @wgmma(%desc: i64, %in: !struct_64xf32) {
// CHECK: wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2
%false = llvm.mlir.constant(false) : i1
-%acc0 = nvgpu.wgmma %desc, %desc, %false {
+%acc0 = nvg.wgmma %desc, %desc, %false {
eltTypeA = 3 : i32,
eltTypeB = 3 : i32,
eltTypeC = 7 : i32,
@@ -52,7 +52,7 @@ llvm.func @wgmma(%desc: i64, %in: !struct_64xf32) {

// CHECK: // wait for regs: $0,$1,$2,{{.*}},$127
// CHECK: wgmma.wait_group.sync.aligned 0;
-%out = nvgpu.wgmma_wait_group %in {pendings = 0 : i32} : !struct_64xf32
+%out = nvg.wgmma_wait_group %in {pendings = 0 : i32} : !struct_64xf32
llvm.return
}

@@ -73,7 +73,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "@$0 tcgen05.dealloc.cta_group::1.sync.aligned.b32 $1, 128;", "b,r" %[[PRED]], %{{.+}} : (i1, !llvm.ptr<6>) -> !llvm.void
llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
llvm.func @tensor_memory_base_lowering() -> i32 attributes {nvvm.kernel = 1 : ui1, nvvm.maxntid = array<i32: 128>} {
-%263 = nvgpu.tensor_memory_base
+%263 = nvg.tensor_memory_base
%264 = llvm.ptrtoint %263 : !llvm.ptr<6> to i32
llvm.return %264 : i32
}
@@ -95,7 +95,7 @@ llvm.func @tensor_memory_base_warpgroup() attributes {nvvm.kernel = 1 : ui1, nvv
}
// CHECK: partition0
partition0() num_warps(1) {
-%0 = nvgpu.tensor_memory_base
+%0 = nvg.tensor_memory_base
// CHECK-NEXT: "use"(%arg0)
"use"(%0) : (!llvm.ptr<6>) -> ()
ttg.warp_return
@@ -115,7 +115,7 @@ llvm.func @warpid_warp_specialize() {
// CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
// CHECK: [[ID:%.*]] = llvm.udiv [[TIDX]], [[C32]]
// CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
// CHECK: "use"([[UNIFORM]])
"use"(%0) : (i32) -> ()

@@ -126,7 +126,7 @@ llvm.func @warpid_warp_specialize() {
// CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
// CHECK: [[ID:%.*]] = llvm.udiv [[TIDX]], [[C32]]
// CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
// CHECK: "use"([[UNIFORM]])
"use"(%1) : (i32) -> ()
ttg.warp_yield
@@ -141,7 +141,7 @@ llvm.func @warpid_warp_specialize() {
// CHECK: [[REL_TIDX:%.*]] = llvm.sub [[TIDX]], [[C192]]
// CHECK: [[ID:%.*]] = llvm.udiv [[REL_TIDX]], [[C32]]
// CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
// CHECK: "use"([[UNIFORM]])
"use"(%1) : (i32) -> ()
ttg.warp_return
@@ -155,7 +155,7 @@ llvm.func @warpid_warp_specialize() {
// CHECK: [[REL_TIDX:%.*]] = llvm.sub [[TIDX]], [[C128]]
// CHECK: [[ID:%.*]] = llvm.udiv [[REL_TIDX]], [[C32]]
// CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
// CHECK: "use"([[UNIFORM]])
"use"(%1) : (i32) -> ()
ttg.warp_return
@@ -172,7 +172,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32}
// CHECK-LABEL: @one_warp
tt.func @one_warp() -> i32 {
// CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
// CHECK-NEXT: return [[C0]]
tt.return %0 : i32
}
@@ -192,7 +192,7 @@ tt.func @one_contextual_warp() {
// CHECK: partition0
partition0() num_warps(1) {
// CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
// CHECK-NEXT: "use"([[C0]])
"use"(%0) : (i32) -> ()
ttg.warp_return
12 changes: 6 additions & 6 deletions test/Conversion/tritongpu_to_llvm_blackwell.mlir
@@ -7,7 +7,7 @@
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
// CHECK-LABEL: @tc_gen5_mma
-// CHECK: %[[WID:.+]] = nvgpu.warp_id
+// CHECK: %[[WID:.+]] = nvg.warp_id
// CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: %[[P0:.+]] = llvm.icmp "eq" %[[WID]], %[[C0]] : i32
// CHECK: %[[P1:.+]] = llvm.and %{{.*}}, %[[P0]] : i1
@@ -105,7 +105,7 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 8 : i32} {
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: @tensor_memory_ld
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
// CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
// CHECK: nvvm.tcgen05.wait <store>
// CHECK: tcgen05.ld.sync.aligned.32x32b.x128.b32
@@ -154,7 +154,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
#tmem = #ttng.tensor_memory_encoding<blockM = 64, blockN = 128, colStride = 1>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: @tensor_memory_ld_m64
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
// CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
// CHECK: nvvm.tcgen05.wait <store>
// CHECK: tcgen05.ld.sync.aligned.32x32b.x128.b32
@@ -174,7 +174,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 2>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: @tensor_memory_unpack_f16
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
// CHECK: tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32
// CHECK: nvvm.tcgen05.wait <store>
// CHECK: tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32
@@ -197,7 +197,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
// CHECK-LABEL: @tc_gen5_mma_block_scale
// CHECK: %[[TMEM_BASE:.+]] = llvm.ptrtoint %arg2 : !llvm.ptr<3> to i32
-// CHECK: %[[WID:.+]] = nvgpu.warp_id
+// CHECK: %[[WID:.+]] = nvg.warp_id
// CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: %[[P0:.+]] = llvm.icmp "eq" %[[WID]], %[[C0]] : i32
// CHECK: %[[P1:.+]] = llvm.and %{{.*}}, %[[P0]] : i1
@@ -865,7 +865,7 @@ tt.func private @load_store_16x32bx1_broadcast(%arg0: !ttg.memdesc<16x8xi8, #tme
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: @tensor_memory_st
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
// CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
// CHECK: nvvm.tcgen05.wait <store>
tt.func public @tensor_memory_st(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>) {