diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h index d474ba8485d5d..a9270c6f52344 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h @@ -39,7 +39,11 @@ enum NVVMMemorySpace { /// Shared memory space identifier. kSharedMemorySpace = 3, /// Constant memory space identifier. - kConstantMemorySpace = 4 + kConstantMemorySpace = 4, + /// Tensor memory space identifier. + /// Tensor memory is available only in arch-accelerated + /// variants from sm100 onwards. + kTensorMemorySpace = 6 }; /// Return the element type and number of elements associated with a wmma matrix diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 23db9375fbffe..c501b5e7c1001 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -23,6 +23,7 @@ include "mlir/Interfaces/InferIntRangeInterface.td" def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>; def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>; def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>; +def LLVM_PointerTensor : LLVM_PointerInAddressSpace<6>; //===----------------------------------------------------------------------===// // NVVM dialect definitions @@ -2592,6 +2593,110 @@ def NVVM_Breakpoint : NVVM_Op<"breakpoint"> { let assemblyFormat = "attr-dict"; } +//===----------------------------------------------------------------------===// +// NVVM TCGEN05 Ops +//===----------------------------------------------------------------------===// +// Num CTAs in a group participating in the TCGEN05 operation. +// This corresponds to the "cta_group::1", "cta_group::2" +// modifiers in the PTX instructions. +def Tcgen05GroupCTA_1 : I32EnumAttrCase<"CTA_1", 0, "cta_1">; +def Tcgen05GroupCTA_2 : I32EnumAttrCase<"CTA_2", 1, "cta_2">; + +def Tcgen05GroupKind : I32EnumAttr<"Tcgen05GroupKind", + "NVVM Tcgen05 group kind", + [Tcgen05GroupCTA_1, Tcgen05GroupCTA_2]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def Tcgen05GroupKindAttr : + EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> { + let summary = "Tcgen05 alloc operation"; + let description = [{ + The `tcgen05.alloc` Op allocates tensor core memory for + the amount specified by `nCols` and writes the destination + address to the `addr` argument. The `nCols` operand specifies the + number of columns to be allocated and it must be a power-of-two. + [For more information, refer to the PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions) + }]; + + let arguments = (ins + AnyTypeOf<[LLVM_AnyPointer, LLVM_PointerShared]>:$addr, + I32:$nCols, + DefaultValuedAttr:$group); + + let assemblyFormat = "$addr `,` $nCols attr-dict `:` type(operands)"; + + let extraClassDeclaration = [{ + static llvm::Intrinsic::ID + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::SmallVector &args); + }]; + string llvmBuilder = [{ + llvm::SmallVector args; + auto id = NVVM::Tcgen05AllocOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, args); + createIntrinsicCall(builder, id, args); + }]; +} + +def NVVM_Tcgen05DeallocOp : NVVM_Op<"tcgen05.dealloc"> { + let summary = "Tcgen05 dealloc operation"; + let description = [{ + The `tcgen05.dealloc` Op de-allocates the tensor core memory + specified by `tmemAddr`, which must be from a previous tensor + memory allocation. The `nCols` operand specifies the number + of columns to be de-allocated, and it must be a power-of-two. + [For more information, refer to the PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions) + }]; + + let arguments = (ins LLVM_PointerTensor:$taddr, I32:$nCols, + DefaultValuedAttr:$group); + + let assemblyFormat = "$taddr `,` $nCols attr-dict `:` type(operands)"; + + let extraClassDeclaration = [{ + static llvm::Intrinsic::ID + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::SmallVector &args); + }]; + string llvmBuilder = [{ + llvm::SmallVector args; + auto id = NVVM::Tcgen05DeallocOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, args); + createIntrinsicCall(builder, id, args); + }]; +} + +def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_permit"> { + let summary = "Tcgen05 Op to relinquish the right to allocate"; + let description = [{ + The `tcgen05.relinquish_alloc_permit` Op specifies that the CTA + of the executing thread is relinquishing the right to allocate + Tensor Memory. So, it is illegal for a CTA to perform `tcgen05.alloc` + after any of its constituent threads execute `tcgen05.relinquish_alloc_permit`. + [For more information, refer to the PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions) + }]; + + let arguments = (ins + DefaultValuedAttr:$group); + + let assemblyFormat = "attr-dict"; + + string llvmBuilder = [{ + auto id = ($group == NVVM::Tcgen05GroupKind::CTA_1) ? + llvm::Intrinsic::nvvm_tcgen05_relinq_alloc_permit_cg1 : + llvm::Intrinsic::nvvm_tcgen05_relinq_alloc_permit_cg2; + createIntrinsicCall(builder, id); + }]; +} + //===----------------------------------------------------------------------===// // NVVM target attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index a5d09eaa34eb5..241b25c6caf12 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1243,6 +1243,47 @@ llvm::Intrinsic::ID CvtFloatToTF32Op::getIntrinsicID(NVVM::FPRoundingMode rnd, } } +llvm::Intrinsic::ID +Tcgen05AllocOp::getIntrinsicIDAndArgs(Operation &op, + LLVM::ModuleTranslation &mt, + llvm::SmallVector &args) { + auto curOp = cast(op); + unsigned AS = llvm::cast(curOp.getAddr().getType()) + .getAddressSpace(); + bool isShared = AS == NVVMMemorySpace::kSharedMemorySpace; + bool is2CTAMode = curOp.getGroup() == Tcgen05GroupKind::CTA_2; + + llvm::Intrinsic::ID id; + if (isShared) { + id = is2CTAMode ? llvm::Intrinsic::nvvm_tcgen05_alloc_shared_cg2 + : llvm::Intrinsic::nvvm_tcgen05_alloc_shared_cg1; + } else { + id = is2CTAMode ? llvm::Intrinsic::nvvm_tcgen05_alloc_cg2 + : llvm::Intrinsic::nvvm_tcgen05_alloc_cg1; + } + + // Fill the Intrinsic Args + args.push_back(mt.lookupValue(curOp.getAddr())); + args.push_back(mt.lookupValue(curOp.getNCols())); + + return id; +} + +llvm::Intrinsic::ID Tcgen05DeallocOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, + llvm::SmallVector &args) { + auto curOp = cast(op); + auto id = (curOp.getGroup() == Tcgen05GroupKind::CTA_1) + ? llvm::Intrinsic::nvvm_tcgen05_dealloc_cg1 + : llvm::Intrinsic::nvvm_tcgen05_dealloc_cg2; + + // Fill the Intrinsic Args + args.push_back(mt.lookupValue(curOp.getTaddr())); + args.push_back(mt.lookupValue(curOp.getNCols())); + + return id; +} + /// Infer the result ranges for the NVVM SpecialRangeableRegisterOp that might /// have ConstantRangeAttr. static void nvvmInferResultRanges(Operation *op, Value result, diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir new file mode 100644 index 0000000000000..781efa2567111 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir @@ -0,0 +1,42 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics %s +// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-LLVM + +// CHECK-LABEL: @llvm_nvvm_tcgen05_alloc +llvm.func @llvm_nvvm_tcgen05_alloc(%addr : !llvm.ptr, %ncols : i32) { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.cg1(ptr %{{.*}}, i32 %{{.*}}) + nvvm.tcgen05.alloc %addr, %ncols : !llvm.ptr, i32 + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.cg2(ptr %{{.*}}, i32 %{{.*}}) + nvvm.tcgen05.alloc %addr, %ncols {group = #nvvm.tcgen05_group} : !llvm.ptr, i32 + llvm.return +} + +// CHECK-LABEL: @llvm_nvvm_tcgen05_alloc_shared +llvm.func @llvm_nvvm_tcgen05_alloc_shared(%addr : !llvm.ptr<3>, %ncols : i32) { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %{{.*}}, i32 %{{.*}}) + nvvm.tcgen05.alloc %addr, %ncols : !llvm.ptr<3>, i32 + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %{{.*}}, i32 %{{.*}}) + nvvm.tcgen05.alloc %addr, %ncols {group = #nvvm.tcgen05_group} : !llvm.ptr<3>, i32 + llvm.return +} + +// CHECK-LABEL: @llvm_nvvm_tcgen05_dealloc +llvm.func @llvm_nvvm_tcgen05_dealloc(%addr : !llvm.ptr<6>, %ncols : i32) { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %{{.*}}, i32 %{{.*}}) + nvvm.tcgen05.dealloc %addr, %ncols : !llvm.ptr<6>, i32 + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %{{.*}}, i32 %{{.*}}) + nvvm.tcgen05.dealloc %addr, %ncols {group = #nvvm.tcgen05_group} : !llvm.ptr<6>, i32 + llvm.return +} + +// CHECK-LABEL: @llvm_nvvm_tcgen05_relinquish_alloc_permit +llvm.func @llvm_nvvm_tcgen05_relinquish_alloc_permit() { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1() + nvvm.tcgen05.relinquish_alloc_permit + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2() + nvvm.tcgen05.relinquish_alloc_permit {group = #nvvm.tcgen05_group} + llvm.return +}