From f6a8f22b15ae5fb5d3996b002841b53e83000dc1 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Fri, 7 Feb 2025 16:21:09 +0530 Subject: [PATCH] [MLIR][NVVM] Add tcgen05 wait/fence Ops PR #126091 adds intrinsics for tcgen05 wait/fence/commit operations. This patch adds NVVM Dialect Ops for them. Signed-off-by: Durgadoss R --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 109 ++++++++++++++++++ mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 30 +++++ .../Target/LLVMIR/nvvm/tcgen05-barriers.mlir | 56 +++++++++ 3 files changed, 195 insertions(+) create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 11226dae2c3f3..fe15a524ec3b5 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -2617,6 +2617,30 @@ def Tcgen05GroupKindAttr : let assemblyFormat = "`<` $value `>`"; } +def Tcgen05FenceBefore : I32EnumAttrCase<"BEFORE_THREAD_SYNC", 0, "before">; +def Tcgen05FenceAfter : I32EnumAttrCase<"AFTER_THREAD_SYNC", 1, "after">; +def Tcgen05FenceKind : I32EnumAttr<"Tcgen05FenceKind", "NVVM Tcgen05 fence kind", + [Tcgen05FenceBefore, Tcgen05FenceAfter]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def Tcgen05FenceKindAttr : + EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def Tcgen05WaitLoad : I32EnumAttrCase<"LOAD", 0, "load">; +def Tcgen05WaitStore : I32EnumAttrCase<"STORE", 1, "store">; +def Tcgen05WaitKind : I32EnumAttr<"Tcgen05WaitKind", "NVVM Tcgen05 wait kind", + [Tcgen05WaitLoad, Tcgen05WaitStore]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def Tcgen05WaitKindAttr : + EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> { let summary = "Tcgen05 alloc operation"; let description = [{ @@ -2701,6 +2725,91 @@ def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_perm }]; } +def NVVM_Tcgen05FenceOp : NVVM_Op<"tcgen05.fence"> { + let summary = "Tcgen05 fence operations"; + let description = [{ + The `tcgen05.fence` orders all prior async tcgen05 operations + with respect to the subsequent tcgen05 and execution ordering operations. + The `tcgen05.fence` orders all subsequent async tcgen05 operations + with respect to the prior tcgen05 and execution ordering operations. + + [For more information refer to the PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence) + }]; + + let arguments = (ins Tcgen05FenceKindAttr:$kind); + let assemblyFormat = "$kind attr-dict"; + + string llvmBuilder = [{ + auto id = ($kind == NVVM::Tcgen05FenceKind::BEFORE_THREAD_SYNC) + ? llvm::Intrinsic::nvvm_tcgen05_fence_before_thread_sync + : llvm::Intrinsic::nvvm_tcgen05_fence_after_thread_sync; + createIntrinsicCall(builder, id); + }]; +} + +def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait"> { + let summary = "Tcgen05 wait operations"; + let description = [{ + The `tcgen05.wait` causes the executing thread to block until + all prior `tcgen05.ld` operations issued by the executing thread + have completed. Similarly, the `tcgen05.wait` causes the executing + thread to block until all prior `tcgen05.st` operations issued by the + executing thread have completed. + [For more information refer PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-wait) + }]; + + let arguments = (ins Tcgen05WaitKindAttr:$kind); + let assemblyFormat = "$kind attr-dict"; + + string llvmBuilder = [{ + auto id = ($kind == NVVM::Tcgen05WaitKind::LOAD) + ? llvm::Intrinsic::nvvm_tcgen05_wait_ld + : llvm::Intrinsic::nvvm_tcgen05_wait_st; + createIntrinsicCall(builder, id); + }]; +} + +def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit"> { + let summary = "Tcgen05 commit operations"; + let description = [{ + The `tcgen05.commit` makes the mbarrier object, specified by + the operand `addr`, track the completion of all the prior + async-tcgen05 operations initiated by the executing thread. + The multicast variants allow signaling on the mbarrier objects + of multiple CTAs within the cluster. Operand `multicastMask`, + when present, specifies the destination CTAs in the cluster such + that each bit position in the 16-bit `multicastMask` operand + corresponds to the `nvvm.read.ptx.sreg.ctaid` of the destination CTA. + [For more information refer PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen-async-sync-operations-commit) + }]; + + let arguments = (ins + AnyTypeOf<[LLVM_AnyPointer, LLVM_PointerShared]>:$addr, + Optional:$multicastMask, + DefaultValuedAttr:$group); + + let assemblyFormat = [{ + $addr (`,` `multicast_mask` `=` $multicastMask^)? + attr-dict `:` type(operands) + }]; + + let extraClassDeclaration = [{ + static llvm::Intrinsic::ID + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::SmallVector &args); + }]; + + string llvmBuilder = [{ + llvm::SmallVector args; + auto id = NVVM::Tcgen05CommitOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, args); + createIntrinsicCall(builder, id, args); + }]; +} + //===----------------------------------------------------------------------===// // NVVM target attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 241b25c6caf12..62f0c21338111 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1284,6 +1284,36 @@ llvm::Intrinsic::ID Tcgen05DeallocOp::getIntrinsicIDAndArgs( return id; } +#define TCGEN05_COMMIT_IMPL(cg, is_shared, mc) \ + is_shared ? llvm::Intrinsic::nvvm_tcgen05_commit##mc##_shared##_##cg \ + : llvm::Intrinsic::nvvm_tcgen05_commit##mc##_##cg + +#define GET_TCGEN05_COMMIT_ID(cta_group, is_shared, has_mc) \ + has_mc ? TCGEN05_COMMIT_IMPL(cta_group, is_shared, _mc) \ + : TCGEN05_COMMIT_IMPL(cta_group, is_shared, ) + +llvm::Intrinsic::ID +Tcgen05CommitOp::getIntrinsicIDAndArgs(Operation &op, + LLVM::ModuleTranslation &mt, + llvm::SmallVector &args) { + auto curOp = cast(op); + unsigned AS = llvm::cast(curOp.getAddr().getType()) + .getAddressSpace(); + bool isShared = AS == NVVMMemorySpace::kSharedMemorySpace; + bool hasMulticast = curOp.getMulticastMask() ? true : false; + bool is2CTAMode = curOp.getGroup() == Tcgen05GroupKind::CTA_2; + + auto id = is2CTAMode ? GET_TCGEN05_COMMIT_ID(cg2, isShared, hasMulticast) + : GET_TCGEN05_COMMIT_ID(cg1, isShared, hasMulticast); + + // Fill the Intrinsic Args + args.push_back(mt.lookupValue(curOp.getAddr())); + if (hasMulticast) + args.push_back(mt.lookupValue(curOp.getMulticastMask())); + + return id; +} + /// Infer the result ranges for the NVVM SpecialRangeableRegisterOp that might /// have ConstantRangeAttr. static void nvvmInferResultRanges(Operation *op, Value result, diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir new file mode 100644 index 0000000000000..7536a4567e34e --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir @@ -0,0 +1,56 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics %s +// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-LLVM + +// CHECK-LABEL: @llvm_nvvm_tcgen05_fence +llvm.func @llvm_nvvm_tcgen05_fence() { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.before.thread.sync() + nvvm.tcgen05.fence #nvvm.tcgen05_fence + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.after.thread.sync() + nvvm.tcgen05.fence #nvvm.tcgen05_fence + + llvm.return +} + +// CHECK-LABEL: @llvm_nvvm_tcgen05_wait +llvm.func @llvm_nvvm_tcgen05_wait() { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.ld() + nvvm.tcgen05.wait #nvvm.tcgen05_wait + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.st() + nvvm.tcgen05.wait #nvvm.tcgen05_wait + + llvm.return +} + +// CHECK-LABEL: @llvm_nvvm_tcgen05_commit_generic +llvm.func @llvm_nvvm_tcgen05_commit_generic(%barrier : !llvm.ptr, %cta_mask : i16) { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.cg1(ptr %{{.*}}) + nvvm.tcgen05.commit %barrier : !llvm.ptr + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.cg2(ptr %{{.*}}) + nvvm.tcgen05.commit %barrier {group = #nvvm.tcgen05_group} : !llvm.ptr + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.cg1(ptr %{{.*}}, i16 %{{.*}}) + nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask : !llvm.ptr, i16 + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %{{.*}}, i16 %{{.*}}) + nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask {group = #nvvm.tcgen05_group} : !llvm.ptr, i16 + llvm.return +} + +// CHECK-LABEL: @llvm_nvvm_tcgen05_commit_shared +llvm.func @llvm_nvvm_tcgen05_commit_shared(%barrier : !llvm.ptr<3>, %cta_mask : i16) { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %{{.*}}) + nvvm.tcgen05.commit %barrier : !llvm.ptr<3> + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %{{.*}}) + nvvm.tcgen05.commit %barrier {group = #nvvm.tcgen05_group} : !llvm.ptr<3> + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %{{.*}}, i16 %{{.*}}) + nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask : !llvm.ptr<3>, i16 + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %{{.*}}, i16 %{{.*}}) + nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask {group = #nvvm.tcgen05_group} : !llvm.ptr<3>, i16 + llvm.return +}