Skip to content

Commit 4ed2006

Browse files
committed
update lowering
1 parent a17e854 commit 4ed2006

File tree

4 files changed

+78
-66
lines changed

4 files changed

+78
-66
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -769,9 +769,10 @@ def AMDGPU_GatherToLDSOp :
769769
AMDGPU_Op<"gather_to_lds", [SameVariadicOperandSize]>,
770770
Arguments<(ins
771771
Arg<AnyMemRef, "buffer to gather from", [MemRead]>:$src,
772-
Variadic<I32>:$srcIndices,
772+
Variadic<Index>:$srcIndices,
773773
Arg<AnyMemRef, "buffer to write to", [MemWrite]>:$dst,
774-
Variadic<I32>:$dstIndices
774+
Variadic<Index>:$dstIndices,
775+
TypeAttr:$transferType
775776
)>,
776777
Results<(outs)> {
777778
let summary = "MLIR wrapper for CDNA mfma instructions";
@@ -784,7 +785,10 @@ def AMDGPU_GatherToLDSOp :
784785
* `$dst`: LDS memory memref to write to.
785786
* `$dstIndices`: base indices into `$dst` to write to for the subgroup of this thread.
786787
number of subgroup size of elements will be written contiguously to `$dst[$dstIndices]`.
787-
788+
* `$transferType`: type of the data to be transferred by each thread. This is used to determine
789+
the size of the data to be transferred and the number of threads in the subgroup.
790+
The transfer type must be a scalar type or a vector type with a scalar element type.
791+
788792
The `$dst`, along with its indices, points to the memory location the subgroup of this thread
789793
will write to.
790794

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 31 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -913,60 +913,49 @@ struct GatherToLDSOpLowering
913913
LogicalResult
914914
matchAndRewrite(GatherToLDSOp op, GatherToLDSOpAdaptor adaptor,
915915
ConversionPatternRewriter &rewriter) const override {
916+
if (chipset < kGfx942)
917+
return op.emitOpError("chipset not supported");
918+
916919
Location loc = op.getLoc();
917920

918-
auto elemType = cast<MemRefType>(op.getDst().getType()).getElementType();
919-
size_t elemSizeInBits = elemType.getIntOrFloatBitWidth();
920-
if (elemSizeInBits % 8 != 0)
921-
return op.emitOpError("element size must be a multiple of 8");
921+
auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
922+
auto dstMemRefType = cast<MemRefType>(op.getDst().getType());
922923

923924
// TODO: instead of only transferring one element per thread, we could
924925
// augment it to transfer multiple elements per thread by issuing multiple
925926
// `global_load_lds` instructions.
926-
auto loadWidth = elemSizeInBits / 8;
927-
928-
if (chipset < kGfx942)
929-
return op.emitOpError("chipset not supported");
927+
size_t loadWidth;
928+
Type transferType = op.getTransferType();
929+
if (auto transferVectorType = dyn_cast<VectorType>(transferType))
930+
loadWidth = transferVectorType.getNumElements() *
931+
transferVectorType.getElementTypeBitWidth() / 8;
932+
else
933+
loadWidth = transferType.getIntOrFloatBitWidth() / 8;
930934

931935
// Currently only 1, 2, and 4 byte loads are supported.
932-
if (!(loadWidth == 1 || loadWidth == 2 || loadWidth == 4))
936+
if (loadWidth != 1 && loadWidth != 2 && loadWidth != 4)
933937
return op.emitOpError("chipset unsupported element size");
934938

935-
// Return pair of {base pointer, linearized index}.
936-
auto getBasePtrAndLinearizedIndex =
937-
[&](Value memref, MemRefType memrefType,
938-
ValueRange indices) -> std::optional<std::pair<Value, Value>> {
939-
MemRefDescriptor memRefDescriptor(memref);
940-
int64_t offset = 0;
941-
SmallVector<int64_t, 5> strides;
942-
if (failed(memrefType.getStridesAndOffset(strides, offset)))
943-
return {};
944-
return std::make_pair(
945-
memRefDescriptor.bufferPtr(rewriter, loc, *getTypeConverter(),
946-
memrefType),
947-
getLinearIndexI32(rewriter, loc, memRefDescriptor, indices, strides));
939+
auto convertIndices =
940+
[&](ValueRange indices) -> SmallVector<Value, 4> {
941+
SmallVector<Value, 4> convertedIndices;
942+
943+
for (Value index : indices) {
944+
Type convertedType = getTypeConverter()->convertType(index.getType());
945+
auto convertedIndex = rewriter.create<LLVM::ConstantOp>(
946+
loc, convertedType,
947+
rewriter.getIntegerAttr(convertedType, 0));
948+
convertedIndices.push_back(convertedIndex);
949+
}
950+
return convertedIndices;
948951
};
949952

950-
auto optSrcBuffer = getBasePtrAndLinearizedIndex(
951-
adaptor.getSrc(), cast<MemRefType>(op.getSrc().getType()),
952-
op.getSrcIndices());
953-
if (!optSrcBuffer)
954-
return op.emitOpError("failed to flatten source memref indices");
955-
auto optDstBuffer = getBasePtrAndLinearizedIndex(
956-
adaptor.getDst(), cast<MemRefType>(op.getDst().getType()),
957-
op.getDstIndices());
958-
if (!optDstBuffer)
959-
return op.emitOpError("failed to flatten destination memref indices");
960-
961-
Type srcPtrType = LLVM::LLVMPointerType::get(rewriter.getContext(), 1);
962-
Type dstPtrType = LLVM::LLVMPointerType::get(rewriter.getContext(), 3);
963-
Value srcPtr = rewriter.create<LLVM::GEPOp>(
964-
loc, srcPtrType, elemType, optSrcBuffer->first,
965-
ArrayRef<Value>({optSrcBuffer->second}));
966-
967-
Value dstPtr = rewriter.create<LLVM::GEPOp>(
968-
loc, dstPtrType, elemType, optDstBuffer->first,
969-
ArrayRef<Value>({optDstBuffer->second}));
953+
Value srcPtr =
954+
getStridedElementPtr(loc, srcMemRefType, adaptor.getSrc(),
955+
convertIndices(op.getSrcIndices()), rewriter);
956+
Value dstPtr =
957+
getStridedElementPtr(loc, dstMemRefType, adaptor.getDst(),
958+
convertIndices(op.getDstIndices()), rewriter);
970959

971960
rewriter.replaceOpWithNewOp<ROCDL::GlobalLoadLDSOp>(
972961
op, srcPtr, dstPtr, createI32Constant(rewriter, loc, loadWidth),

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "mlir/IR/PatternMatch.h"
2626
#include "mlir/IR/TypeUtilities.h"
2727
#include "llvm/ADT/TypeSwitch.h"
28+
#include "llvm/IR/DerivedTypes.h"
2829

2930
#include <limits>
3031
#include <optional>
@@ -113,21 +114,30 @@ LogicalResult FatRawBufferCastOp::verify() {
113114
return success();
114115
}
115116

117+
static bool hasGlobalMemorySpace(Attribute memorySpace) {
118+
if (!memorySpace)
119+
return true;
120+
else if (auto intMemorySpace = llvm::dyn_cast<IntegerAttr>(memorySpace))
121+
return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1;
122+
else if (auto gpuMemorySpace =
123+
llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
124+
return gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
125+
return false;
126+
}
127+
128+
static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
129+
if (auto intMemorySpace = llvm::dyn_cast<IntegerAttr>(memorySpace))
130+
return intMemorySpace.getInt() == 3;
131+
return false;
132+
}
133+
116134
//===----------------------------------------------------------------------===//
117135
// RawBuffer*Op
118136
//===----------------------------------------------------------------------===//
119137
template <typename T>
120138
static LogicalResult verifyRawBufferOp(T &op) {
121139
MemRefType bufferType = llvm::cast<MemRefType>(op.getMemref().getType());
122-
Attribute memorySpace = bufferType.getMemorySpace();
123-
bool isGlobal = false;
124-
if (!memorySpace)
125-
isGlobal = true;
126-
else if (auto intMemorySpace = llvm::dyn_cast<IntegerAttr>(memorySpace))
127-
isGlobal = intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1;
128-
else if (auto gpuMemorySpace =
129-
llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
130-
isGlobal = gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
140+
bool isGlobal = hasGlobalMemorySpace(bufferType.getMemorySpace());
131141

132142
if (!isGlobal)
133143
return op.emitOpError(
@@ -473,13 +483,22 @@ LogicalResult GatherToLDSOp::verify() {
473483
if (elemType != dstType.getElementType())
474484
return emitOpError("source and destination element types must match");
475485

476-
// Element type sizes should be 1, 2, or 4 bytes.
477-
auto elemSize = elemType.getIntOrFloatBitWidth();
478-
if (elemSize != 8 && elemSize != 16 && elemSize != 32)
479-
return emitOpError("source and destination element types must be 8, 16, "
480-
"or 32 bits");
486+
// Copy type sizes should be 1, 2, or 4 bytes.
487+
auto transferType = getTransferType();
488+
size_t transferSize;
489+
if (auto vectorTransfer = dyn_cast<VectorType>(transferType)) {
490+
transferSize = vectorTransfer.getNumElements() *
491+
vectorTransfer.getElementTypeBitWidth();
492+
} else {
493+
transferSize = transferType.getIntOrFloatBitWidth();
494+
}
495+
if (transferSize != 8 && transferSize != 16 && transferSize != 32)
496+
return emitOpError("Transfering type size must be 8, 16, or 32 bits");
497+
498+
if (!hasGlobalMemorySpace(srcType.getMemorySpace()))
499+
return emitOpError("source memory address space must be Global");
481500

482-
if (!gpu::GPUDialect::hasWorkgroupMemoryAddressSpace(dstType))
501+
if (!hasWorkgroupMemorySpace(dstType.getMemorySpace()))
483502
return emitOpError("destination memory address space must be Workgroup");
484503

485504
return success();

mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -466,21 +466,21 @@ func.func @sched_barrier() {
466466
// CHECK-LABEL: func @global_load_to_rocdl_f32
467467
// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xf32, 1>)
468468
func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_addrspace>) {
469-
%c0 = arith.constant 0 : i32
470-
%c12 = arith.constant 12 : i32
471-
%c32 = arith.constant 32 : i32
469+
%c0 = arith.constant 0 : index
470+
%c12 = arith.constant 12 : index
471+
%c32 = arith.constant 32 : index
472472
%alloc = memref.alloc() : memref<64x64xf32, #gpu_lds_addrspace>
473473
// GFX942: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<128x72xf32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
474474
// GFX942: %[[ALLOC:.*]] = memref.alloc() : memref<64x64xf32, 3>
475475
// GFX942: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] : memref<64x64xf32, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
476476
// GFX942: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
477477
// GFX942: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
478-
// GFX942: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[GLOBAL_OFFSET:.*]]] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, f32
479-
// GFX942: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[LDS_OFFSET:.*]]] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f32
478+
// GFX942: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]]
479+
// GFX942: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]]
480480
// GFX942: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
481481
// GFX942: %[[C0:.*]] = llvm.mlir.constant(0 : i32) : i32
482482
// GFX942: %[[C0_2:.*]] = llvm.mlir.constant(0 : i32) : i32
483483
// GFX942: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]], %[[C0]], %[[C0_2]]
484-
amdgpu.global_load %global[%c12, %c0], %alloc[%c32, %c0] : memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace>
484+
amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] {transferType = f32} : memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace>
485485
func.return
486486
}

0 commit comments

Comments
 (0)