
Commit ff001af
Add amdgpu.async_load_to_lds
1 parent: 19dffaf

File tree
4 files changed: +196 -1 lines changed


external/llvm-project/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 41 additions & 0 deletions
@@ -967,6 +967,47 @@ def AMDGPU_GatherToLDSOp :
   let hasCanonicalizer = 1;
 }
 
+def AMDGPU_AsyncLoadToLDSOp :
+    AMDGPU_Op<"async_load_to_lds", [AttrSizedOperandSegments]>,
+    Arguments<(ins Arg<AnyMemRef, "buffer to load from", [MemRead]>:$src,
+                   Variadic<Index>:$srcIndices,
+                   Arg<AnyMemRef, "buffer to write to", [MemWrite]>:$dst,
+                   Variadic<Index>:$dstIndices,
+                   TypeAttr:$transferType)>,
+    Results<(outs)> {
+  let summary = "MLIR wrapper for gfx1250 async load to LDS instructions";
+  let description = [{
+    The `amdgpu.async_load_to_lds` op is a wrapper around the
+    `global_load_async_to_lds` instructions. Compared to the `gather_to_lds`
+    instruction, this instruction is asynchronous and also does not behave
+    like a gather, since each thread can have its own LDS address.
+
+    Operands:
+    * `$src`: global memory memref to read from.
+    * `$srcIndices`: indices into `$src` to read from for this thread.
+    * `$dst`: LDS memory memref to write to.
+    * `$dstIndices`: base indices into `$dst` to write to for the subgroup of
+      this thread. The elements gathered by the subgroup will be written
+      contiguously in order of lane ID starting at `$dst[$dstIndices]`.
+      Byte-sized (e.g. i8) or short-sized (e.g. i16) types will be
+      zero-padded/extended to 32 bits before being written. 96-bit types
+      (e.g. vector<3xf32>) will be zero-padded to 128 bits before being
+      written. Only the offsets held by lane 0 are used.
+    * `$transferType`: type of the data to be transferred by each thread.
+      This is used to determine the size of the data to be transferred and
+      the number of threads in the subgroup. The transfer type must be a
+      scalar type or a vector type with a single element type.
+
+    The `$dst`, along with its indices, points to the memory location the
+    subgroup of this thread will write to.
+
+    Note: only supported on gfx1250+.
+  }];
+  let assemblyFormat = [{
+    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` $transferType `,` type($src) `,` type($dst)
+  }];
+  let hasVerifier = 1;
+}
+
 def AMDGPU_TransposeLoadOp :
     AMDGPU_Op<"transpose_load", [SameVariadicOperandSize]>,
     Arguments<(ins Arg<AnyMemRef, "buffer to transpose load from", [MemRead]>:$src, Variadic<Index>:$srcIndices)>,
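To make the assembly format concrete, a minimal usage sketch (the SSA names %global, %i, and %row and the memref shapes are illustrative, not from this commit):

// Illustrative only: each lane asynchronously copies one f32 (a 32-bit
// transfer type) from global memory (address space 1) into LDS (address
// space 3).
%c0 = arith.constant 0 : index
%tile = memref.alloc() : memref<64x64xf32, 3>
amdgpu.async_load_to_lds %global[%i, %c0], %tile[%row, %c0]
  : f32, memref<128x64xf32, 1>, memref<64x64xf32, 3>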

external/llvm-project/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 75 additions & 1 deletion
@@ -41,6 +41,7 @@ constexpr Chipset kGfx908 = Chipset(9, 0, 8);
 constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
 constexpr Chipset kGfx942 = Chipset(9, 4, 2);
 constexpr Chipset kGfx950 = Chipset(9, 5, 0);
+constexpr Chipset kGfx1250 = Chipset(12, 5, 0);
 
 /// Convert an unsigned number `val` to i32.
 static Value convertUnsignedToI32(ConversionPatternRewriter &rewriter,
@@ -1384,6 +1385,78 @@ struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
   }
 };
 
+struct AsyncLoadToLDSOpLowering
+    : public ConvertOpToLLVMPattern<AsyncLoadToLDSOp> {
+  AsyncLoadToLDSOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<AsyncLoadToLDSOp>(converter), chipset(chipset) {}
+
+  Chipset chipset;
+
+  template <typename OpTy>
+  static void emitLoadOp(mlir::PatternRewriter &rewriter, mlir::Operation *op,
+                         mlir::Value srcPtr, mlir::Value dstPtr) {
+    auto zero = rewriter.getI32IntegerAttr(0);
+    rewriter.replaceOpWithNewOp<OpTy>(op, srcPtr, dstPtr, zero, zero,
+                                      mlir::ArrayAttr{}, mlir::ArrayAttr{},
+                                      mlir::ArrayAttr{});
+  }
+
+  LogicalResult
+  matchAndRewrite(AsyncLoadToLDSOp op, AsyncLoadToLDSOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset != kGfx1250)
+      return op.emitOpError("only gfx1250 is supported");
+
+    Location loc = op.getLoc();
+
+    auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+    auto dstMemRefType = cast<MemRefType>(op.getDst().getType());
+
+    // TODO: instead of only transferring one element per thread, we could
+    // augment it to transfer multiple elements per thread by issuing multiple
+    // `global_load_lds` instructions.
+    Type transferType = op.getTransferType();
+    int loadWidth = [&]() -> int {
+      if (auto transferVectorType = dyn_cast<VectorType>(transferType)) {
+        return (transferVectorType.getNumElements() *
+                transferVectorType.getElementTypeBitWidth()) /
+               8;
+      }
+      return transferType.getIntOrFloatBitWidth() / 8;
+    }();
+
+    // Currently only 1, 4, 8 and 16 byte loads are supported.
+    if (!llvm::is_contained({1, 4, 8, 16}, loadWidth))
+      return op.emitOpError("unsupported element size: ") << loadWidth;
+
+    Value srcPtr =
+        getStridedElementPtr(rewriter, loc, srcMemRefType, adaptor.getSrc(),
+                             adaptor.getSrcIndices());
+    Value dstPtr =
+        getStridedElementPtr(rewriter, loc, dstMemRefType, adaptor.getDst(),
+                             adaptor.getDstIndices());
+
+    switch (loadWidth) {
+    case 1:
+      emitLoadOp<ROCDL::GlobalLoadAsyncToLDSB8Op>(rewriter, op, srcPtr, dstPtr);
+      break;
+    case 4:
+      emitLoadOp<ROCDL::GlobalLoadAsyncToLDSB32Op>(rewriter, op, srcPtr,
+                                                   dstPtr);
+      break;
+    case 8:
+      emitLoadOp<ROCDL::GlobalLoadAsyncToLDSB64Op>(rewriter, op, srcPtr,
+                                                   dstPtr);
+      break;
+    case 16:
+      emitLoadOp<ROCDL::GlobalLoadAsyncToLDSB128Op>(rewriter, op, srcPtr,
+                                                    dstPtr);
+      break;
+    }
+    return success();
+  }
+};
+
 namespace {
 struct ExtPackedFp8OpLowering final
     : public ConvertOpToLLVMPattern<ExtPackedFp8Op> {
@@ -2054,7 +2127,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
                PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
                PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
-               TransposeLoadOpLowering, AMDGPUPermlaneLowering>(converter, chipset);
+               AsyncLoadToLDSOpLowering, TransposeLoadOpLowering,
+               AMDGPUPermlaneLowering>(converter, chipset);
   patterns.add<LDSBarrierOpLowering>(converter, chipset, hackForDirectToLDS);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
 }
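For reference, a sketch of the rewrite this pattern performs; the width-to-intrinsic mapping is read off the switch above, with the b8/b64/b128 mnemonics inferred from the pattern's template arguments (only the b32 form appears verbatim in the test below), and the operand names are illustrative:

// Input: a 16-byte transfer (vector<4xf32>), i.e. the b128 case.
amdgpu.async_load_to_lds %src[%i, %j], %dst[%k, %c0]
  : vector<4xf32>, memref<128x72xf32, 1>, memref<64x64xf32, 3>
// After -convert-amdgpu-to-rocdl=chipset=gfx1250, the indices are folded
// into raw pointers via getStridedElementPtr, and the op is replaced by
// one intrinsic chosen by loadWidth:
//    1 byte  -> rocdl.global.load.async.to.lds.b8
//    4 bytes -> rocdl.global.load.async.to.lds.b32
//    8 bytes -> rocdl.global.load.async.to.lds.b64
//   16 bytes -> rocdl.global.load.async.to.lds.b128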

external/llvm-project/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 38 additions & 0 deletions
@@ -566,6 +566,44 @@ LogicalResult GatherToLDSOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// AsyncLoadToLDSOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult AsyncLoadToLDSOp::verify() {
+  MemRefType srcType = cast<MemRefType>(getSrc().getType());
+  MemRefType dstType = cast<MemRefType>(getDst().getType());
+
+  if (!dstType.areTrailingDimsContiguous(1))
+    return emitOpError("destination type innermost dim must be contiguous");
+
+  auto elemType = srcType.getElementType();
+  // Check that the $src and $dst element types are the same.
+  if (elemType != dstType.getElementType())
+    return emitOpError("source and destination element types must match");
+
+  auto transferType = getTransferType();
+  int transferSize;
+  if (auto vectorTransfer = dyn_cast<VectorType>(transferType)) {
+    transferSize = vectorTransfer.getNumElements() *
+                   vectorTransfer.getElementTypeBitWidth();
+  } else {
+    transferSize = transferType.getIntOrFloatBitWidth();
+  }
+  if (!llvm::is_contained({8, 32, 64, 128}, transferSize))
+    return emitOpError("transfer type size must be 8, 32, 64 or 128 bits");
+
+  if (!hasGlobalMemorySpace(srcType.getMemorySpace()) &&
+      !hasFatRawBufferMemorySpace(srcType.getMemorySpace()))
+    return emitOpError(
+        "source memory address space must be global or fat raw buffer");
+
+  if (!hasWorkgroupMemorySpace(dstType.getMemorySpace()))
+    return emitOpError("destination memory address space must be Workgroup");
+
+  return success();
+}
+
 namespace {
 /// If the source/target of a GatherToLDSOp is a CastOp that only removes static
 /// information or changes layout, the cast can be skipped.
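A couple of hedged examples of what this verifier enforces (shapes and SSA names are illustrative):

// Accepted: f32 is a 32-bit transfer, the source is global memory (address
// space 1), and the destination is workgroup/LDS memory (address space 3)
// with a contiguous innermost dim.
amdgpu.async_load_to_lds %global[%i, %j], %lds[%k, %c0]
  : f32, memref<128x72xf32, 1>, memref<64x64xf32, 3>

// Rejected: f16 is a 16-bit transfer, but the transfer type size must be
// one of 8, 32, 64 or 128 bits.
amdgpu.async_load_to_lds %global16[%i, %j], %lds16[%k, %c0]
  : f16, memref<128x72xf16, 1>, memref<64x64xf16, 3>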
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1250 | FileCheck %s
+
+#gpu_global_addrspace = 1
+#gpu_lds_addrspace = 3
+
+// CHECK-LABEL: func @global_load_to_rocdl_f32
+// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xf32, 1>)
+func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_addrspace>) {
+  %c0 = arith.constant 0 : index
+  %c12 = arith.constant 12 : index
+  %c32 = arith.constant 32 : index
+  %alloc = memref.alloc() : memref<64x64xf32, #gpu_lds_addrspace>
+  // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
+
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64
+  // CHECK: %[[C12:.*]] = arith.constant 12 : index
+  // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
+  // CHECK: %[[C32:.*]] = arith.constant 32 : index
+  // CHECK: %[[IC32:.*]] = builtin.unrealized_conversion_cast %[[C32]]
+
+  // CHECK: %[[ALLOC:.*]] = memref.alloc()
+  // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast
+  // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
+
+  // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
+  // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
+  // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
+
+  // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
+  // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
+
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
+  // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
+
+  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
+  // CHECK: rocdl.global.load.async.to.lds.b32 %[[GLOBAL_PTR]], %[[LDS_PTR]]
+  amdgpu.async_load_to_lds %global[%c12, %c0], %alloc[%c32, %c0]
+    : f32, memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace>
+  func.return
+}
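A natural follow-on case, not part of this commit, would exercise the 16-byte path under the same RUN line; a sketch, with a hypothetical function name and a CHECK pattern inferred from the lowering's b128 template argument:

// CHECK-LABEL: func @global_load_to_rocdl_vec4_f32
func.func @global_load_to_rocdl_vec4_f32(%global : memref<128x72xf32, #gpu_global_addrspace>) {
  %c0 = arith.constant 0 : index
  %alloc = memref.alloc() : memref<64x64xf32, #gpu_lds_addrspace>
  // CHECK: rocdl.global.load.async.to.lds.b128
  amdgpu.async_load_to_lds %global[%c0, %c0], %alloc[%c0, %c0]
    : vector<4xf32>, memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace>
  func.return
}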
