-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[mlir][amdgpu] Add lowering for make_dma_descriptor #169955
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[mlir][amdgpu] Add lowering for make_dma_descriptor #169955
Conversation
Initial lowering for make_dma_descriptor. At the moment it only supports tensors of rank 2.
|
@llvm/pr-subscribers-mlir-amdgpu @llvm/pr-subscribers-mlir-gpu Author: Erick Ochoa Lopez (amd-eochoalo) Changes
Patch is 63.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169955.diff 8 Files Affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index e07c72b839e7c..c072ebdfa5d26 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
let assemblyFormat = "`<` $value `>`";
}
+//===----------------------------------------------------------------------===//
+// AMDGPU Type definitions
+//===----------------------------------------------------------------------===//
+
class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
: TypeDef<AMDGPU_Dialect, name, traits> {
let mnemonic = typeMnemonic;
}
-//===----------------------------------------------------------------------===//
-// AMDGPU Type definitions
-//===----------------------------------------------------------------------===//
-
def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let summary = "Pair of base addresses that move data between LDS and global storage.";
let description = [{
@@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let assemblyFormat = "`<` $elementType `>`";
}
+def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
+ let summary = "Descriptors used in tensor store/load operations.";
+ let description = [{
+ This type is opaque and corresponds to the two or four descriptor groups
+ used in tensor_load_to_lds or tensor_store_from_lds.
+ }];
+
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
@@ -1219,17 +1228,15 @@ def AMDGPU_ScaledMFMAOp :
}
def AMDGPU_MakeDmaBaseOp :
- AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
+ AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments, AllElementTypesMatch<["src", "dst"]>]>,
Arguments<(ins
Arg<AnyMemRef, "buffer to read from">:$src,
- Variadic<Index>:$srcIndices,
+ Variadic<Index>:$src_indices,
Arg<AnyMemRef, "buffer to write to">:$dst,
- Variadic<Index>:$dstIndices)>,
+ Variadic<Index>:$dst_indices)>,
Results<(outs AMDGPU_TDMBaseType: $base)> {
// TODO:
- // * Add verifiers such that one of the memrefs is from LDS and the other global.
- // * Add verifiers to make sure that the type is in the correct direction.
// * Add verifiers to make sure that the number of indices do not exceed the number of dimensions.
let summary = "Pair of base addresses used when moving tiles between LDS and global memory.";
@@ -1240,11 +1247,184 @@ def AMDGPU_MakeDmaBaseOp :
This operation creates a value corresponding to the tensor descriptor (D#) group 0
found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.
+ For example:
+
+ ```mlir
+ %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+ ```
+
+ to
+
+ ```mlir
+ // pseudo-code
+ %global_base = llvm.extractvalue %global_memref[1]
+ %global_address = llvm.get_element_ptr ...
+
+ %lds_base = llvm.extractvalue %lds_memref[1]
+ %lds_address = llvm.get_element_ptr ...
+
+ // Definition of %base
+ %undef = llvm.mlir.undef : vector<4xi32>
+ %v0 = llvm.insertelement %15, %undef[0] : vector<4xi32>
+ %v1 = llvm.insertelement %lds_address, %v0[1] : vector<4xi32>
+ %v2 = llvm.insertelement %global_address_low, %v1[2] : vector<4xi32>
+ %base = llvm.insertelement %global_address_high, %v2[3] : vector<4xi32>
+
+ rocdl.tensor.load.to.lds %base, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ ```
+
These tensor DMA operations were introduced in gfx1250.
}];
let assemblyFormat = [{
- $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results)
+ $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
+ }];
+
+ let hasVerifier = 1;
+}
+
+def AMDGPU_MakeDmaDescriptorOp :
+ AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
+ Arguments<(ins
+ AMDGPU_TDMBaseType: $base,
+ Variadic<Index>: $global_dynamic_sizes,
+ DenseI64ArrayAttr: $global_static_sizes,
+ Variadic<Index>: $global_dynamic_strides,
+ DenseI64ArrayAttr: $global_static_strides,
+ Variadic<Index>: $shared_dynamic_sizes,
+ DenseI64ArrayAttr: $shared_static_sizes,
+ Optional<Index>: $pad_amount,
+ Optional<Index>: $pad_interval,
+ Optional<AnyMemRef>: $atomic_barrier_address,
+ Variadic<Index>: $atomic_barrier_indices,
+ Optional<Index>: $global_increment,
+ Optional<Index>: $lds_increment,
+ Optional<Index>: $iteration_count)>,
+ Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
+
+ let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
+ let description = [{
+ Make all descriptor groups needed by tensor memory operations.
+
+ The $base operand corresponds to the base pair addresses, one must be an address in LDS
+ while the other must be a global memory location.
+
+ $global_{static/dynamic}_sizes determine the size of the tensor.
+ $global_{static/dynamic}_strides determine the strides of the tensor.
+ $shared_{static/dynamic}_sizes determine the size of the tile.
+
+ Padding can be applied to the LDS address when copying from memory to LDS,
+ but not when copying from LDS to memory.
+ The values in the padded target addresses remain the same as before the operation was applied.
+
+ 2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
+ $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
+ $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
+ $iteration_count determines how many times to iterate.
+
+ ```mlir
+ // Example of moving a two-dimensional tensor to LDS.
+ %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+
+ // Example of moving a two-dimensional tensor to LDS where padding is applied after every integer.
+ %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+ ```
+ }];
+
+ let assemblyFormat = [{
+ $base
+ `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
+ `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
+ `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
+ ( `padShared` `(` $pad_amount^ `every` $pad_interval`)` )?
+ ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
+ `:` type($atomic_barrier_address) `)`)?
+ ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
+ attr-dict `:` qualified(type($base)) `->` type(results)
+ }];
+
+ let extraClassDeclaration = [{
+ int getRank() {
+ return getGlobalStaticSizes().size();
+ }
+
+ int getElementTypeWidth() {
+ Type elementType = getBase().getType().getElementType();
+ int width;
+ if (auto floatType = dyn_cast<FloatType>(elementType)) {
+ width = floatType.getWidth();
+ } else if (auto intType = dyn_cast<IntegerType>(elementType)) {
+ width = intType.getWidth();
+ } else {
+ llvm_unreachable("element type must have getWidth interface");
+ }
+ return width;
+ }
+
+ SmallVector<OpFoldResult> getMixedList(SmallVector<Value> dynamics, ArrayRef<int64_t> statics) {
+ SmallVector<OpFoldResult> result;
+ unsigned ctr = 0;
+ OpBuilder b(getContext());
+ for (int64_t static_elem : statics) {
+ if (ShapedType::isDynamic(static_elem)) {
+ result.push_back(dynamics[ctr++]);
+ } else {
+ result.push_back(b.getIndexAttr(static_elem));
+ }
+ }
+ return result;
+ }
+
+ SmallVector<OpFoldResult> getMixedGlobalSizes() {
+ return getMixedList(getGlobalDynamicSizes(), getGlobalStaticSizes());
+ }
+
+ SmallVector<OpFoldResult> getMixedGlobalStrides() {
+ return getMixedList(getGlobalDynamicStrides(), getGlobalStaticStrides());
+ }
+
+ SmallVector<OpFoldResult> getMixedSharedSizes() {
+ return getMixedList(getSharedDynamicSizes(), getSharedStaticSizes());
+ }
+ }];
+
+ let hasVerifier = 1;
+ let hasFolder = 1;
+}
+
+def AMDGPU_TensorLoadToLDSOp :
+ AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
+ Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
+ let summary = "Load tensors from global memory to LDS.";
+ let description = [{
+ Load tensors of up to five dimensions from global memory to LDS.
+
+ The operation is fully described by the descriptor operand.
+ }];
+
+ let assemblyFormat = [{
+ $desc attr-dict `:` qualified(type($desc))
+ }];
+}
+
+def AMDGPU_TensorStoreFromLDSOp :
+ AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
+ Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
+ let summary = "Store tensors from LDS to global memory.";
+ let description = [{
+ Store tensors of up to five dimensions from LDS to global memory.
+
+ The operation is fully described by the descriptor operand.
+ }];
+
+ let assemblyFormat = [{
+ $desc attr-dict `:` qualified(type($desc))
}];
}
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
index a7680fb5c3191..958757da0933e 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
@@ -48,6 +48,11 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *,
IntegerAttr m, IntegerAttr n, IntegerAttr k) {
printMNKDimensionList(printer, m, n, k);
}
+
+// Utility functions for querying the address space.
+bool hasGlobalMemorySpace(Attribute memorySpace);
+bool hasWorkgroupMemorySpace(Attribute memorySpace);
+bool hasFatRawBufferMemorySpace(Attribute memorySpace);
} // namespace mlir::amdgpu
#define GET_ATTRDEF_CLASSES
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index b9a5e7d7f6eac..1e81d339b0ddc 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2264,6 +2264,451 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneSwapOp> {
}
};
+struct AMDGPUMakeDmaBaseLowering
+ : public ConvertOpToLLVMPattern<MakeDmaBaseOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ AMDGPUMakeDmaBaseLowering(const LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<MakeDmaBaseOp>(converter), chipset(chipset) {}
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(MakeDmaBaseOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset < kGfx1250) {
+ return op->emitOpError("make_dma_base is only supported on gfx1250");
+ }
+
+ Location loc = op.getLoc();
+
+ ValueRange srcIndices = adaptor.getSrcIndices();
+ Value src = adaptor.getSrc();
+ auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+
+ Value srcPtr =
+ getStridedElementPtr(rewriter, loc, srcMemRefType, src, srcIndices);
+
+ ValueRange dstIndices = adaptor.getDstIndices();
+ Value dst = adaptor.getDst();
+ auto dstMemRefType = cast<MemRefType>(op.getDst().getType());
+
+ Value dstPtr =
+ getStridedElementPtr(rewriter, loc, dstMemRefType, dst, dstIndices);
+
+ bool storeFrom = hasWorkgroupMemorySpace(srcMemRefType.getMemorySpace());
+ Value ldsAddr = storeFrom ? srcPtr : dstPtr;
+ Value globalAddr = storeFrom ? dstPtr : srcPtr;
+
+ Type i32 = rewriter.getI32Type();
+ Type i64 = rewriter.getI64Type();
+
+ Value castForLdsAddr =
+ LLVM::PtrToIntOp::create(rewriter, loc, i32, ldsAddr);
+ Value castForGlobalAddr =
+ LLVM::PtrToIntOp::create(rewriter, loc, i64, globalAddr);
+
+ Value mask = createI64Constant(rewriter, loc, 0x1FFFFFFFFFFFFFF);
+ Value first57BitsOfGlobalAddr =
+ LLVM::AndOp::create(rewriter, loc, castForGlobalAddr, mask);
+ Value shift = LLVM::LShrOp::create(rewriter, loc, first57BitsOfGlobalAddr,
+ createI64Constant(rewriter, loc, 32));
+
+ Value lowHalf =
+ LLVM::TruncOp::create(rewriter, loc, i32, first57BitsOfGlobalAddr);
+ Value highHalf = LLVM::TruncOp::create(rewriter, loc, i32, shift);
+
+ Value typeMask = createI32Constant(rewriter, loc, 2 << 30);
+ Value highHalfPlusType =
+ LLVM::OrOp::create(rewriter, loc, highHalf, typeMask);
+
+ Value c0 = createI32Constant(rewriter, loc, 0);
+ Value c1 = createI32Constant(rewriter, loc, 1);
+ Value c2 = createI32Constant(rewriter, loc, 2);
+ Value c3 = createI32Constant(rewriter, loc, 3);
+
+ Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32));
+ Value result = LLVM::UndefOp::create(rewriter, loc, v4i32);
+ result = LLVM::InsertElementOp::create(rewriter, loc, result, c0, c0);
+ result = LLVM::InsertElementOp::create(rewriter, loc, result,
+ castForLdsAddr, c1);
+ result = LLVM::InsertElementOp::create(rewriter, loc, result, lowHalf, c2);
+ result = LLVM::InsertElementOp::create(rewriter, loc, result,
+ highHalfPlusType, c3);
+
+ rewriter.replaceOp(op, result);
+ return success();
+ }
+};
+
+struct AMDGPUMakeDmaDescriptorLowering
+ : public ConvertOpToLLVMPattern<MakeDmaDescriptorOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ AMDGPUMakeDmaDescriptorLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<MakeDmaDescriptorOp>(converter),
+ chipset(chipset) {}
+ Chipset chipset;
+
+ Value getDGroup0(OpAdaptor adaptor) const { return adaptor.getBase(); }
+
+ Value setValueAtOffset(ConversionPatternRewriter &rewriter, Location loc,
+ Value accumulator, Value value, int shift) const {
+ shift = shift % 32;
+ Value shiftAmount;
+ if (shift != 0) {
+ shiftAmount = createI32Constant(rewriter, loc, shift % 32);
+ value = LLVM::ShlOp::create(rewriter, loc, value, shiftAmount);
+ }
+ return LLVM::OrOp::create(rewriter, loc, accumulator, value);
+ }
+
+ Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, const SmallVector<Value> consts) const {
+ // Compute data_size.
+ int elementTypeWidthInBytes = op.getElementTypeWidth() / 8;
+
+ Value dataSize;
+ switch (elementTypeWidthInBytes) {
+ case 1:
+ dataSize = consts[0];
+ break;
+ case 2:
+ dataSize = consts[1];
+ break;
+ case 4:
+ dataSize = consts[2];
+ break;
+ case 8:
+ dataSize = consts[3];
+ break;
+ default:
+ llvm_unreachable("Invalid element size.");
+ }
+ return setValueAtOffset(rewriter, loc, sgpr0, dataSize, 16);
+ }
+
+ Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, const SmallVector<Value> &consts) const {
+ bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
+ if (!atomic_barrier_enable)
+ return sgpr0;
+
+ return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 18);
+ }
+
+ Value setIterateEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, const SmallVector<Value> &consts) const {
+ bool iterate_enable = adaptor.getGlobalIncrement() != nullptr;
+ if (!iterate_enable)
+ return sgpr0;
+
+ return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 19);
+ }
+
+ Value setPadEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, const SmallVector<Value> &consts) const {
+ bool pad_enable = op.getPadAmount() != nullptr;
+ if (!pad_enable)
+ return sgpr0;
+
+ return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 20);
+ }
+
+ Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, const SmallVector<Value> &consts) const {
+ bool pad_enable = op.getPadAmount() != nullptr;
+ if (!pad_enable)
+ return sgpr0;
+
+ IntegerType i32 = rewriter.getI32Type();
+ Value padInterval = adaptor.getPadInterval();
+ // pre-condition: padInterval can be a power of two between 2 and 256
+ padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32,
+ padInterval, false);
+ padInterval = LLVM::SubOp::create(rewriter, loc, padInterval, consts[1]);
+ // post-condition: padInterval can be a value between 0 and 7
+ return setValueAtOffset(rewriter, loc, sgpr0, padInterval, 22);
+ }
+
+ Value setPadAmount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, const SmallVector<Value> &consts) const {
+ bool pad_enable = op.getPadAmount() != nullptr;
+ if (!pad_enable)
+ return sgpr0;
+
+ Value padAmount = adaptor.getPadAmount();
+ // pre-condition: padAmount is a value between 1-128
+ padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]);
+ // post-condition: padAmount is a value between 0-127
+ return setValueAtOffset(rewriter, loc, sgpr0, padAmount, 25);
+ }
+
+ Value setAtomicBarrierAddress(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter,
+ Location loc, Value sgpr1,
+ const SmallVector<Value> &consts) const {
+ bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
+ if (!atomic_barrier_enable)
+ return sgpr1;
+
+ Value atomicBarrierAddress = adaptor.getAtomicBarrierAddress();
+ IntegerType i32 = rewriter.getI32Type();
+ atomicBarrierAddress =
+ LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress);
+ atomicBarrierAddress =
+ LLVM::LShrOp::create(rewriter, loc, atomicBarrierAddress, consts[3]);
+ Value mask = createI32Constant(rewriter, loc, 0xFFFF);
+ atomicBarrierAddress =
+ LLVM::AndOp::create(rewriter, loc, atomicBarrierAddress, mask);
+ return setValueAtOffset(rewriter, loc, sgpr1, atomicBarrierAddress, 32);
+ }
+
+ std::pair<Value, Value>
+ ...
[truncated]
|
|
@llvm/pr-subscribers-backend-amdgpu Author: Erick Ochoa Lopez (amd-eochoalo) Changes
Patch is 63.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169955.diff 8 Files Affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index e07c72b839e7c..c072ebdfa5d26 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
let assemblyFormat = "`<` $value `>`";
}
+//===----------------------------------------------------------------------===//
+// AMDGPU Type definitions
+//===----------------------------------------------------------------------===//
+
class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
: TypeDef<AMDGPU_Dialect, name, traits> {
let mnemonic = typeMnemonic;
}
-//===----------------------------------------------------------------------===//
-// AMDGPU Type definitions
-//===----------------------------------------------------------------------===//
-
def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let summary = "Pair of base addresses that move data between LDS and global storage.";
let description = [{
@@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let assemblyFormat = "`<` $elementType `>`";
}
+def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
+ let summary = "Descriptors used in tensor store/load operations.";
+ let description = [{
+ This type is opaque and corresponds to the two or four descriptor groups
+ used in tensor_load_to_lds or tensor_store_from_lds.
+ }];
+
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
@@ -1219,17 +1228,15 @@ def AMDGPU_ScaledMFMAOp :
}
def AMDGPU_MakeDmaBaseOp :
- AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
+ AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments, AllElementTypesMatch<["src", "dst"]>]>,
Arguments<(ins
Arg<AnyMemRef, "buffer to read from">:$src,
- Variadic<Index>:$srcIndices,
+ Variadic<Index>:$src_indices,
Arg<AnyMemRef, "buffer to write to">:$dst,
- Variadic<Index>:$dstIndices)>,
+ Variadic<Index>:$dst_indices)>,
Results<(outs AMDGPU_TDMBaseType: $base)> {
// TODO:
- // * Add verifiers such that one of the memrefs is from LDS and the other global.
- // * Add verifiers to make sure that the type is in the correct direction.
// * Add verifiers to make sure that the number of indices do not exceed the number of dimensions.
let summary = "Pair of base addresses used when moving tiles between LDS and global memory.";
@@ -1240,11 +1247,184 @@ def AMDGPU_MakeDmaBaseOp :
This operation creates a value corresponding to the tensor descriptor (D#) group 0
found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.
+ For example:
+
+ ```mlir
+ %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+ ```
+
+ to
+
+ ```mlir
+ // pseudo-code
+ %global_base = llvm.extractvalue %global_memref[1]
+ %global_address = llvm.get_element_ptr ...
+
+ %lds_base = llvm.extractvalue %lds_memref[1]
+ %lds_address = llvm.get_element_ptr ...
+
+ // Definition of %base
+ %undef = llvm.mlir.undef : vector<4xi32>
+ %v0 = llvm.insertelement %15, %undef[0] : vector<4xi32>
+ %v1 = llvm.insertelement %lds_address, %v0[1] : vector<4xi32>
+ %v2 = llvm.insertelement %global_address_low, %v1[2] : vector<4xi32>
+ %base = llvm.insertelement %global_address_high, %v2[3] : vector<4xi32>
+
+ rocdl.tensor.load.to.lds %base, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ ```
+
These tensor DMA operations were introduced in gfx1250.
}];
let assemblyFormat = [{
- $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results)
+ $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
+ }];
+
+ let hasVerifier = 1;
+}
+
+def AMDGPU_MakeDmaDescriptorOp :
+ AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
+ Arguments<(ins
+ AMDGPU_TDMBaseType: $base,
+ Variadic<Index>: $global_dynamic_sizes,
+ DenseI64ArrayAttr: $global_static_sizes,
+ Variadic<Index>: $global_dynamic_strides,
+ DenseI64ArrayAttr: $global_static_strides,
+ Variadic<Index>: $shared_dynamic_sizes,
+ DenseI64ArrayAttr: $shared_static_sizes,
+ Optional<Index>: $pad_amount,
+ Optional<Index>: $pad_interval,
+ Optional<AnyMemRef>: $atomic_barrier_address,
+ Variadic<Index>: $atomic_barrier_indices,
+ Optional<Index>: $global_increment,
+ Optional<Index>: $lds_increment,
+ Optional<Index>: $iteration_count)>,
+ Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
+
+ let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
+ let description = [{
+ Make all descriptor groups needed by tensor memory operations.
+
+ The $base operand corresponds to the base pair addresses, one must be an address in LDS
+ while the other must be a global memory location.
+
+ $global_{static/dynamic}_sizes determine the size of the tensor.
+ $global_{static/dynamic}_strides determine the strides of the tensor.
+ $shared_{static/dynamic}_sizes determine the size of the tile.
+
+ Padding can be applied to the LDS address when copying from memory to LDS,
+ but not when copying from LDS to memory.
+ The values in the padded target addresses remain the same as before the operation was applied.
+
+ 2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
+ $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
+ $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
+ $iteration_count determines how many times to iterate.
+
+ ```mlir
+ // Example of moving a two-dimensional tensor to LDS.
+ %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+
+ // Example of moving a two-dimensional tensor to LDS where padding is applied after every integer.
+ %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+ ```
+ }];
+
+ let assemblyFormat = [{
+ $base
+ `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
+ `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
+ `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
+ ( `padShared` `(` $pad_amount^ `every` $pad_interval`)` )?
+ ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
+ `:` type($atomic_barrier_address) `)`)?
+ ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
+ attr-dict `:` qualified(type($base)) `->` type(results)
+ }];
+
+ let extraClassDeclaration = [{
+ int getRank() {
+ return getGlobalStaticSizes().size();
+ }
+
+ int getElementTypeWidth() {
+ Type elementType = getBase().getType().getElementType();
+ int width;
+ if (auto floatType = dyn_cast<FloatType>(elementType)) {
+ width = floatType.getWidth();
+ } else if (auto intType = dyn_cast<IntegerType>(elementType)) {
+ width = intType.getWidth();
+ } else {
+ llvm_unreachable("element type must have getWidth interface");
+ }
+ return width;
+ }
+
+ SmallVector<OpFoldResult> getMixedList(SmallVector<Value> dynamics, ArrayRef<int64_t> statics) {
+ SmallVector<OpFoldResult> result;
+ unsigned ctr = 0;
+ OpBuilder b(getContext());
+ for (int64_t static_elem : statics) {
+ if (ShapedType::isDynamic(static_elem)) {
+ result.push_back(dynamics[ctr++]);
+ } else {
+ result.push_back(b.getIndexAttr(static_elem));
+ }
+ }
+ return result;
+ }
+
+ SmallVector<OpFoldResult> getMixedGlobalSizes() {
+ return getMixedList(getGlobalDynamicSizes(), getGlobalStaticSizes());
+ }
+
+ SmallVector<OpFoldResult> getMixedGlobalStrides() {
+ return getMixedList(getGlobalDynamicStrides(), getGlobalStaticStrides());
+ }
+
+ SmallVector<OpFoldResult> getMixedSharedSizes() {
+ return getMixedList(getSharedDynamicSizes(), getSharedStaticSizes());
+ }
+ }];
+
+ let hasVerifier = 1;
+ let hasFolder = 1;
+}
+
+def AMDGPU_TensorLoadToLDSOp :
+ AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
+ Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
+ let summary = "Load tensors from global memory to LDS.";
+ let description = [{
+ Load tensors of up to five dimensions from global memory to LDS.
+
+ The operation is fully described by the descriptor operand.
+ }];
+
+ let assemblyFormat = [{
+ $desc attr-dict `:` qualified(type($desc))
+ }];
+}
+
+def AMDGPU_TensorStoreFromLDSOp :
+ AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
+ Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
+ let summary = "Store tensors from LDS to global memory.";
+ let description = [{
+ Store tensors of up to five dimensions from LDS to global memory.
+
+ The operation is fully described by the descriptor operand.
+ }];
+
+ let assemblyFormat = [{
+ $desc attr-dict `:` qualified(type($desc))
}];
}
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
index a7680fb5c3191..958757da0933e 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
@@ -48,6 +48,11 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *,
IntegerAttr m, IntegerAttr n, IntegerAttr k) {
printMNKDimensionList(printer, m, n, k);
}
+
+// Utility functions for querying the address space.
+bool hasGlobalMemorySpace(Attribute memorySpace);
+bool hasWorkgroupMemorySpace(Attribute memorySpace);
+bool hasFatRawBufferMemorySpace(Attribute memorySpace);
} // namespace mlir::amdgpu
#define GET_ATTRDEF_CLASSES
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index b9a5e7d7f6eac..1e81d339b0ddc 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2264,6 +2264,451 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneSwapOp> {
}
};
+/// Lowers amdgpu.make_dma_base to the 128-bit (vector<4xi32>) TDM base
+/// descriptor used by gfx1250 tensor-DMA instructions. As packed below:
+///   word 0: 0
+///   word 1: LDS address (ptrtoint to i32)
+///   word 2: global address bits 31:0
+///   word 3: global address bits 56:32 | type tag (bits 31:30 = 0b10)
+/// NOTE(review): layout inferred from the packing code — confirm against the
+/// ISA documentation.
+struct AMDGPUMakeDmaBaseLowering
+    : public ConvertOpToLLVMPattern<MakeDmaBaseOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  AMDGPUMakeDmaBaseLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<MakeDmaBaseOp>(converter), chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(MakeDmaBaseOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset < kGfx1250) {
+      return op->emitOpError("make_dma_base is only supported on gfx1250");
+    }
+
+    Location loc = op.getLoc();
+
+    // Resolve the source and destination element pointers.
+    ValueRange srcIndices = adaptor.getSrcIndices();
+    Value src = adaptor.getSrc();
+    auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+
+    Value srcPtr =
+        getStridedElementPtr(rewriter, loc, srcMemRefType, src, srcIndices);
+
+    ValueRange dstIndices = adaptor.getDstIndices();
+    Value dst = adaptor.getDst();
+    auto dstMemRefType = cast<MemRefType>(op.getDst().getType());
+
+    Value dstPtr =
+        getStridedElementPtr(rewriter, loc, dstMemRefType, dst, dstIndices);
+
+    // If the source lives in workgroup memory this is a store out of LDS;
+    // otherwise it is a load into LDS.
+    bool storeFrom = hasWorkgroupMemorySpace(srcMemRefType.getMemorySpace());
+    Value ldsAddr = storeFrom ? srcPtr : dstPtr;
+    Value globalAddr = storeFrom ? dstPtr : srcPtr;
+
+    Type i32 = rewriter.getI32Type();
+    Type i64 = rewriter.getI64Type();
+
+    Value castForLdsAddr =
+        LLVM::PtrToIntOp::create(rewriter, loc, i32, ldsAddr);
+    Value castForGlobalAddr =
+        LLVM::PtrToIntOp::create(rewriter, loc, i64, globalAddr);
+
+    // Keep only the low 57 bits of the global address; the top bits of the
+    // high word carry descriptor metadata instead.
+    Value mask = createI64Constant(rewriter, loc, 0x1FFFFFFFFFFFFFF);
+    Value first57BitsOfGlobalAddr =
+        LLVM::AndOp::create(rewriter, loc, castForGlobalAddr, mask);
+    Value shift = LLVM::LShrOp::create(rewriter, loc, first57BitsOfGlobalAddr,
+                                       createI64Constant(rewriter, loc, 32));
+
+    Value lowHalf =
+        LLVM::TruncOp::create(rewriter, loc, i32, first57BitsOfGlobalAddr);
+    Value highHalf = LLVM::TruncOp::create(rewriter, loc, i32, shift);
+
+    // Tag the high word with the descriptor type (2 << 30 sets bits 31:30 to
+    // 0b10).
+    Value typeMask = createI32Constant(rewriter, loc, 2 << 30);
+    Value highHalfPlusType =
+        LLVM::OrOp::create(rewriter, loc, highHalf, typeMask);
+
+    Value c0 = createI32Constant(rewriter, loc, 0);
+    Value c1 = createI32Constant(rewriter, loc, 1);
+    Value c2 = createI32Constant(rewriter, loc, 2);
+    Value c3 = createI32Constant(rewriter, loc, 3);
+
+    Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32));
+    // All four lanes are written below, so start from poison rather than the
+    // deprecated llvm.mlir.undef.
+    Value result = LLVM::PoisonOp::create(rewriter, loc, v4i32);
+    result = LLVM::InsertElementOp::create(rewriter, loc, result, c0, c0);
+    result = LLVM::InsertElementOp::create(rewriter, loc, result,
+                                           castForLdsAddr, c1);
+    result = LLVM::InsertElementOp::create(rewriter, loc, result, lowHalf, c2);
+    result = LLVM::InsertElementOp::create(rewriter, loc, result,
+                                           highHalfPlusType, c3);
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+
+struct AMDGPUMakeDmaDescriptorLowering
+ : public ConvertOpToLLVMPattern<MakeDmaDescriptorOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ AMDGPUMakeDmaDescriptorLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<MakeDmaDescriptorOp>(converter),
+ chipset(chipset) {}
+ Chipset chipset;
+
+ Value getDGroup0(OpAdaptor adaptor) const { return adaptor.getBase(); }
+
+  /// OR `value` into `accumulator` at bit offset `shift`. Offsets are given
+  /// as absolute descriptor bit positions, so they are reduced modulo 32 to
+  /// the position inside the 32-bit word being assembled.
+  Value setValueAtOffset(ConversionPatternRewriter &rewriter, Location loc,
+                         Value accumulator, Value value, int shift) const {
+    shift = shift % 32;
+    if (shift != 0) {
+      // `shift` is already reduced; the original applied `% 32` a second
+      // time here redundantly. Scope the constant to its only use.
+      Value shiftAmount = createI32Constant(rewriter, loc, shift);
+      value = LLVM::ShlOp::create(rewriter, loc, value, shiftAmount);
+    }
+    return LLVM::OrOp::create(rewriter, loc, accumulator, value);
+  }
+
+  /// Encode the element width into the data_size field at bit 16 of sgpr0.
+  /// The field holds log2 of the element size in bytes, taken from the
+  /// pre-materialized constants in `consts` (presumably consts[i] is the i32
+  /// constant i, as the sibling helpers also assume).
+  Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                    ConversionPatternRewriter &rewriter, Location loc,
+                    Value sgpr0, const SmallVector<Value> &consts) const {
+    // Take `consts` by const reference: every other set* helper does, and
+    // passing by value copied the vector on each call.
+    int elementTypeWidthInBytes = op.getElementTypeWidth() / 8;
+
+    Value dataSize;
+    switch (elementTypeWidthInBytes) {
+    case 1:
+      dataSize = consts[0];
+      break;
+    case 2:
+      dataSize = consts[1];
+      break;
+    case 4:
+      dataSize = consts[2];
+      break;
+    case 8:
+      dataSize = consts[3];
+      break;
+    default:
+      llvm_unreachable("Invalid element size.");
+    }
+    return setValueAtOffset(rewriter, loc, sgpr0, dataSize, 16);
+  }
+
+  /// Set the atomic-barrier-enable bit (bit 18 of sgpr0) when an atomic
+  /// barrier address operand is present on the op.
+  Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                         ConversionPatternRewriter &rewriter, Location loc,
+                         Value sgpr0, const SmallVector<Value> &consts) const {
+    // LLVM style: camelCase for locals (was `atomic_barrier_enable`).
+    bool atomicBarrierEnable = adaptor.getAtomicBarrierAddress() != nullptr;
+    if (!atomicBarrierEnable)
+      return sgpr0;
+
+    return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 18);
+  }
+
+  /// Set the iterate-enable bit (bit 19 of sgpr0) when a global-increment
+  /// operand is present on the op.
+  Value setIterateEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                         ConversionPatternRewriter &rewriter, Location loc,
+                         Value sgpr0, const SmallVector<Value> &consts) const {
+    // LLVM style: camelCase for locals (was `iterate_enable`).
+    bool iterateEnable = adaptor.getGlobalIncrement() != nullptr;
+    if (!iterateEnable)
+      return sgpr0;
+
+    return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 19);
+  }
+
+  /// Set the pad-enable bit (bit 20 of sgpr0) when a pad amount is present
+  /// on the op.
+  Value setPadEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                     ConversionPatternRewriter &rewriter, Location loc,
+                     Value sgpr0, const SmallVector<Value> &consts) const {
+    // LLVM style: camelCase for locals (was `pad_enable`).
+    bool padEnable = op.getPadAmount() != nullptr;
+    if (!padEnable)
+      return sgpr0;
+
+    return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 20);
+  }
+
+  /// Encode the pad interval at bit 22 of sgpr0 as log2(interval) - 1.
+  /// No-op when padding is disabled (no pad amount on the op).
+  Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                       ConversionPatternRewriter &rewriter, Location loc,
+                       Value sgpr0, const SmallVector<Value> &consts) const {
+    // LLVM style: camelCase for locals (was `pad_enable`).
+    bool padEnable = op.getPadAmount() != nullptr;
+    if (!padEnable)
+      return sgpr0;
+
+    IntegerType i32 = rewriter.getI32Type();
+    Value padInterval = adaptor.getPadInterval();
+    // Pre-condition: padInterval is a power of two between 2 and 256, so
+    // cttz(padInterval) - 1 yields the encoded value in [0, 7].
+    padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32,
+                                                     padInterval, false);
+    padInterval = LLVM::SubOp::create(rewriter, loc, padInterval, consts[1]);
+    return setValueAtOffset(rewriter, loc, sgpr0, padInterval, 22);
+  }
+
+  /// Encode the pad amount at bit 25 of sgpr0, biased by one: the operand is
+  /// in [1, 128] and the field stores [0, 127].
+  Value setPadAmount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                     ConversionPatternRewriter &rewriter, Location loc,
+                     Value sgpr0, const SmallVector<Value> &consts) const {
+    // LLVM style: camelCase for locals (was `pad_enable`).
+    bool padEnable = op.getPadAmount() != nullptr;
+    if (!padEnable)
+      return sgpr0;
+
+    Value padAmount = adaptor.getPadAmount();
+    padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]);
+    return setValueAtOffset(rewriter, loc, sgpr0, padAmount, 25);
+  }
+
+  /// Insert the atomic barrier address into sgpr1 (descriptor bit 32, i.e.
+  /// bit 0 of the word). The pointer is converted to i32, shifted right by
+  /// consts[3] (3, i.e. 8-byte units — presumably the required alignment;
+  /// confirm against the ISA docs) and masked to 16 bits.
+  Value setAtomicBarrierAddress(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter,
+                                Location loc, Value sgpr1,
+                                const SmallVector<Value> &consts) const {
+    // LLVM style: camelCase for locals (was `atomic_barrier_enable`).
+    bool atomicBarrierEnable = adaptor.getAtomicBarrierAddress() != nullptr;
+    if (!atomicBarrierEnable)
+      return sgpr1;
+
+    Value atomicBarrierAddress = adaptor.getAtomicBarrierAddress();
+    IntegerType i32 = rewriter.getI32Type();
+    atomicBarrierAddress =
+        LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress);
+    atomicBarrierAddress =
+        LLVM::LShrOp::create(rewriter, loc, atomicBarrierAddress, consts[3]);
+    Value mask = createI32Constant(rewriter, loc, 0xFFFF);
+    atomicBarrierAddress =
+        LLVM::AndOp::create(rewriter, loc, atomicBarrierAddress, mask);
+    return setValueAtOffset(rewriter, loc, sgpr1, atomicBarrierAddress, 32);
+  }
+
+ std::pair<Value, Value>
+ ...
[truncated]
|
Depends on #169817