10 changes: 5 additions & 5 deletions bin/RegisterTritonDialects.h
@@ -1,7 +1,7 @@
#pragma once
#include "amd/include/Dialect/TritonAMDGPU/IR/Dialect.h"
#include "amd/include/TritonAMDGPUTransforms/Passes.h"
#include "nvidia/include/Dialect/NVGPU/IR/Dialect.h"
#include "nvidia/include/Dialect/NVG/IR/Dialect.h"
#include "nvidia/include/Dialect/NVWS/IR/Dialect.h"
#include "proton/Dialect/include/Conversion/ProtonGPUToLLVM/Passes.h"
#include "proton/Dialect/include/Conversion/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/Passes.h"
@@ -28,7 +28,7 @@

#include "nvidia/hopper/include/Transforms/Passes.h"
#include "nvidia/include/Dialect/NVWS/Transforms/Passes.h"
#include "nvidia/include/NVGPUToLLVM/Passes.h"
#include "nvidia/include/NVGToLLVM/Passes.h"
#include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h"
#include "triton/Conversion/TritonGPUToLLVM/Passes.h"
#include "triton/Conversion/TritonToTritonGPU/Passes.h"
@@ -83,7 +83,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::triton::gpu::registerTritonGPUGlobalScratchAllocationPass();
mlir::triton::registerConvertWarpSpecializeToLLVM();
mlir::triton::registerConvertTritonGPUToLLVMPass();
-mlir::triton::registerConvertNVGPUToLLVMPass();
+mlir::triton::registerConvertNVGToLLVMPass();
mlir::triton::registerAllocateSharedMemoryNvPass();
mlir::registerLLVMDIScope();
mlir::LLVM::registerInlinerInterface(registry);
@@ -123,7 +123,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
// NVWS passes
mlir::triton::registerNVWSTransformsPasses();

-// NVGPU transform passes
+// NVG transform passes
mlir::registerNVHopperTransformsPasses();

// Proton passes
@@ -143,7 +143,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::triton::instrument::TritonInstrumentDialect,
mlir::math::MathDialect, mlir::arith::ArithDialect, mlir::scf::SCFDialect,
mlir::gpu::GPUDialect, mlir::LLVM::LLVMDialect, mlir::NVVM::NVVMDialect,
-mlir::triton::nvgpu::NVGPUDialect, mlir::triton::nvws::NVWSDialect,
+mlir::triton::nvg::NVGDialect, mlir::triton::nvws::NVWSDialect,
mlir::triton::amdgpu::TritonAMDGPUDialect,
mlir::triton::proton::ProtonDialect,
mlir::triton::proton::gpu::ProtonGPUDialect, mlir::ROCDL::ROCDLDialect,
2 changes: 1 addition & 1 deletion lib/Conversion/TritonInstrumentToLLVM/CMakeLists.txt
@@ -8,5 +8,5 @@ add_triton_library(TritonInstrumentToLLVM
TritonGPUIR
TritonInstrumentIR
TritonNvidiaGPUIR
-NVGPUIR
+NVGIR
)
@@ -1,6 +1,6 @@
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "third_party/nvidia/include/Dialect/NVGPU/IR/Dialect.h"
#include "third_party/nvidia/include/Dialect/NVG/IR/Dialect.h"
#include "third_party/nvidia/include/TritonNVIDIAGPUToLLVM/PTXAsmFormat.h"
#include "third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/Utility.h"
#include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
@@ -313,7 +313,7 @@ struct BufferPointersOpConversion
assert(op.getMemType() == tti::MemType::TENSOR_MEM &&
"Unsupported memory type");
TritonLLVMOpBuilder b(loc, rewriter);
-base = rewriter.create<nvgpu::TensorMemoryBaseAddress>(loc);
+base = rewriter.create<nvg::TensorMemoryBaseAddress>(loc);
base = b.ptrtoint(i32_ty, base);
}
bufPointers = rewriter.create<arith::AddIOp>(
8 changes: 4 additions & 4 deletions python/test/unit/language/test_conversions.py
@@ -334,10 +334,10 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):

if is_cuda():
if src_dtype != 'float32' and torch.cuda.get_device_capability(0) < (9, 0):
pytest.skip("non-float32 downcast tests only supported on NVGPU with compute capability 9.0+")
pytest.skip("non-float32 downcast tests only supported on NVG with compute capability 9.0+")

if dst_dtype in ('float8e5', 'float8e4nv') and rounding == 'rtne' and torch.cuda.get_device_capability(0) < (9, 0):
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on NVGPU with compute capability 9.0+")
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on NVG with compute capability 9.0+")

if dst_dtype in ('float8e5b16', 'float8e4b8') and rounding == 'rtne':
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on AMDGPU CDNA3")
@@ -368,10 +368,10 @@ def test_typeconvert_downcast(src_dtype, dst_dtype, rounding, max_repr, device):
def test_typeconvert_downcast_clamping(src_dtype, dst_dtype, mode, device, rounding="rtne"):
if is_cuda():
if src_dtype != 'float32' and torch.cuda.get_device_capability(0) < (9, 0):
pytest.skip("non-float32 downcast tests only supported on NVGPU with compute capability 9.0+")
pytest.skip("non-float32 downcast tests only supported on NVG with compute capability 9.0+")

if dst_dtype in ('float8e5', 'float8e4nv') and rounding == 'rtne' and torch.cuda.get_device_capability(0) < (9, 0):
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on NVGPU with compute capability 9.0+")
pytest.skip(f"{dst_dtype} downcast with RTNE rounding tests only supported on NVG with compute capability 9.0+")

converter = {
tl.float8e4nv: torch.float8_e4m3fn,
4 changes: 2 additions & 2 deletions python/test/unit/language/test_core.py
@@ -121,9 +121,9 @@ def check_type_supported(dtype, device):
if device in ['cuda']:
cc = torch.cuda.get_device_capability()
if cc[0] < 8 and (dtype is tl.bfloat16 or dtype == "bfloat16" or dtype is torch.bfloat16):
pytest.skip("bfloat16 is only supported on NVGPU with cc >= 80")
pytest.skip("bfloat16 is only supported on NVG with cc >= 80")
if cc[0] < 9 and dtype in {tl.float8e4nv, "float8e4nv", "float8_e4m3fn"}:
pytest.skip("float8e4nv is only supported on NVGPU with cc >= 90")
pytest.skip("float8e4nv is only supported on NVG with cc >= 90")
if is_interpreter():
if dtype in [tl.bfloat16, "bfloat16", torch.bfloat16]:
pytest.skip("bfloat16 is not supported in the interpreter")
16 changes: 8 additions & 8 deletions test/Conversion/atomic_ldst.mlir
@@ -1,5 +1,5 @@
-// RUN: triton-opt %s --allocate-shared-memory-nv=compute-capability=90 --convert-triton-gpu-to-llvm=compute-capability=90 2>&1 | FileCheck %s --check-prefix=CHECK-TTG2NVGPU
-// RUN: triton-opt %s --allocate-shared-memory-nv=compute-capability=90 --convert-triton-gpu-to-llvm=compute-capability=90 --convert-nv-gpu-to-llvm 2>&1 | FileCheck %s --check-prefix=CHECK-NVGPU2LLVM
+// RUN: triton-opt %s --allocate-shared-memory-nv=compute-capability=90 --convert-triton-gpu-to-llvm=compute-capability=90 2>&1 | FileCheck %s --check-prefix=CHECK-TTG2NVG
+// RUN: triton-opt %s --allocate-shared-memory-nv=compute-capability=90 --convert-triton-gpu-to-llvm=compute-capability=90 --convert-nv-gpu-to-llvm 2>&1 | FileCheck %s --check-prefix=CHECK-NVG2LLVM
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
tt.func public @kernel_r(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
%cst = arith.constant 0.000000e+00 : f32
@@ -10,18 +10,18 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
%1 = arith.muli %0, %c128_i32 : i32
%2 = arith.cmpi slt, %1, %c512_i32 : i32

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, gpu
-// CHECK-NVGPU2LLVM: ld.global.gpu.acquire.b32
+// CHECK-TTG2NVG: nvg.ld_acquire acquire, gpu
+// CHECK-NVG2LLVM: ld.global.gpu.acquire.b32
%3 = tt.atomic_rmw fadd, acquire, gpu, %arg0, %cst, %2 : (!tt.ptr<f32>, f32, i1) -> f32
tt.store %arg0, %3 : !tt.ptr<f32>

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, cta
-// CHECK-NVGPU2LLVM: ld.global.cta.acquire.b32
+// CHECK-TTG2NVG: nvg.ld_acquire acquire, cta
+// CHECK-NVG2LLVM: ld.global.cta.acquire.b32
%4 = tt.atomic_rmw fadd, acquire, cta, %arg0, %cst, %true : (!tt.ptr<f32>, f32, i1) -> f32
tt.store %arg0, %4 : !tt.ptr<f32>

-// CHECK-TTG2NVGPU: nvgpu.ld_acquire acquire, sys
-// CHECK-NVGPU2LLVM: ld.global.sys.acquire.b32
+// CHECK-TTG2NVG: nvg.ld_acquire acquire, sys
+// CHECK-NVG2LLVM: ld.global.sys.acquire.b32
%5 = tt.atomic_rmw fadd, acquire, sys, %arg0, %cst, %2 : (!tt.ptr<f32>, f32, i1) -> f32
tt.store %arg0, %5 : !tt.ptr<f32>
tt.return
22 changes: 11 additions & 11 deletions test/Conversion/nvgpu_to_llvm.mlir
@@ -7,7 +7,7 @@ llvm.func @cluster_id() -> i32 {
// CHECK: nvvm.read.ptx.sreg.cluster.ctaid.z
// CHECK: nvvm.read.ptx.sreg.cluster.nctaid.x
// CHECK: nvvm.read.ptx.sreg.cluster.nctaid.y
-%id = nvgpu.cluster_id
+%id = nvg.cluster_id
llvm.return %id : i32
}

@@ -39,7 +39,7 @@ llvm.func @cluster_id() -> i32 {
llvm.func @wgmma(%desc: i64, %in: !struct_64xf32) {
// CHECK: wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2
%false = llvm.mlir.constant(false) : i1
-%acc0 = nvgpu.wgmma %desc, %desc, %false {
+%acc0 = nvg.wgmma %desc, %desc, %false {
eltTypeA = 3 : i32,
eltTypeB = 3 : i32,
eltTypeC = 7 : i32,
@@ -52,7 +52,7 @@ llvm.func @wgmma(%desc: i64, %in: !struct_64xf32) {

// CHECK: // wait for regs: $0,$1,$2,{{.*}},$127
// CHECK: wgmma.wait_group.sync.aligned 0;
-%out = nvgpu.wgmma_wait_group %in {pendings = 0 : i32} : !struct_64xf32
+%out = nvg.wgmma_wait_group %in {pendings = 0 : i32} : !struct_64xf32
llvm.return
}

@@ -73,7 +73,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "@$0 tcgen05.dealloc.cta_group::1.sync.aligned.b32 $1, 128;", "b,r" %[[PRED]], %{{.+}} : (i1, !llvm.ptr<6>) -> !llvm.void
llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
llvm.func @tensor_memory_base_lowering() -> i32 attributes {nvvm.kernel = 1 : ui1, nvvm.maxntid = array<i32: 128>} {
-%263 = nvgpu.tensor_memory_base
+%263 = nvg.tensor_memory_base
%264 = llvm.ptrtoint %263 : !llvm.ptr<6> to i32
llvm.return %264 : i32
}
@@ -95,7 +95,7 @@ llvm.func @tensor_memory_base_warpgroup() attributes {nvvm.kernel = 1 : ui1, nvv
}
// CHECK: partition0
partition0() num_warps(1) {
-%0 = nvgpu.tensor_memory_base
+%0 = nvg.tensor_memory_base
// CHECK-NEXT: "use"(%arg0)
"use"(%0) : (!llvm.ptr<6>) -> ()
ttg.warp_return
@@ -115,7 +115,7 @@ llvm.func @warpid_warp_specialize() {
// CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
// CHECK: [[ID:%.*]] = llvm.udiv [[TIDX]], [[C32]]
// CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
// CHECK: "use"([[UNIFORM]])
"use"(%0) : (i32) -> ()

@@ -126,7 +126,7 @@ llvm.func @warpid_warp_specialize() {
// CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
// CHECK: [[ID:%.*]] = llvm.udiv [[TIDX]], [[C32]]
// CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
// CHECK: "use"([[UNIFORM]])
"use"(%1) : (i32) -> ()
ttg.warp_yield
@@ -141,7 +141,7 @@ llvm.func @warpid_warp_specialize() {
// CHECK: [[REL_TIDX:%.*]] = llvm.sub [[TIDX]], [[C192]]
// CHECK: [[ID:%.*]] = llvm.udiv [[REL_TIDX]], [[C32]]
// CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
// CHECK: "use"([[UNIFORM]])
"use"(%1) : (i32) -> ()
ttg.warp_return
@@ -155,7 +155,7 @@ llvm.func @warpid_warp_specialize() {
// CHECK: [[REL_TIDX:%.*]] = llvm.sub [[TIDX]], [[C128]]
// CHECK: [[ID:%.*]] = llvm.udiv [[REL_TIDX]], [[C32]]
// CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-%1 = nvgpu.warp_id
+%1 = nvg.warp_id
// CHECK: "use"([[UNIFORM]])
"use"(%1) : (i32) -> ()
ttg.warp_return
@@ -172,7 +172,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32}
// CHECK-LABEL: @one_warp
tt.func @one_warp() -> i32 {
// CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
// CHECK-NEXT: return [[C0]]
tt.return %0 : i32
}
@@ -192,7 +192,7 @@ tt.func @one_contextual_warp() {
// CHECK: partition0
partition0() num_warps(1) {
// CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
-%0 = nvgpu.warp_id
+%0 = nvg.warp_id
// CHECK-NEXT: "use"([[C0]])
"use"(%0) : (i32) -> ()
ttg.warp_return
12 changes: 6 additions & 6 deletions test/Conversion/tritongpu_to_llvm_blackwell.mlir
@@ -7,7 +7,7 @@
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
// CHECK-LABEL: @tc_gen5_mma
-// CHECK: %[[WID:.+]] = nvgpu.warp_id
+// CHECK: %[[WID:.+]] = nvg.warp_id
// CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: %[[P0:.+]] = llvm.icmp "eq" %[[WID]], %[[C0]] : i32
// CHECK: %[[P1:.+]] = llvm.and %{{.*}}, %[[P0]] : i1
@@ -105,7 +105,7 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 8 : i32} {
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: @tensor_memory_ld
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
// CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
// CHECK: nvvm.tcgen05.wait <store>
// CHECK: tcgen05.ld.sync.aligned.32x32b.x128.b32
@@ -154,7 +154,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
#tmem = #ttng.tensor_memory_encoding<blockM = 64, blockN = 128, colStride = 1>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: @tensor_memory_ld_m64
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
// CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
// CHECK: nvvm.tcgen05.wait <store>
// CHECK: tcgen05.ld.sync.aligned.32x32b.x128.b32
@@ -174,7 +174,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 2>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: @tensor_memory_unpack_f16
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
// CHECK: tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32
// CHECK: nvvm.tcgen05.wait <store>
// CHECK: tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32
@@ -197,7 +197,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
// CHECK-LABEL: @tc_gen5_mma_block_scale
// CHECK: %[[TMEM_BASE:.+]] = llvm.ptrtoint %arg2 : !llvm.ptr<3> to i32
-// CHECK: %[[WID:.+]] = nvgpu.warp_id
+// CHECK: %[[WID:.+]] = nvg.warp_id
// CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: %[[P0:.+]] = llvm.icmp "eq" %[[WID]], %[[C0]] : i32
// CHECK: %[[P1:.+]] = llvm.and %{{.*}}, %[[P0]] : i1
@@ -865,7 +865,7 @@ tt.func private @load_store_16x32bx1_broadcast(%arg0: !ttg.memdesc<16x8xi8, #tme
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: @tensor_memory_st
-// CHECK: nvgpu.tensor_memory_base
+// CHECK: nvg.tensor_memory_base
// CHECK: tcgen05.st.sync.aligned.32x32b.x128.b32
// CHECK: nvvm.tcgen05.wait <store>
tt.func public @tensor_memory_st(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>) {