Commit 5612a26

Lower linalg.copy to direct global load (#20568)
## Summary

This PR sets the foundation for using the `global_load_lds` instruction to load values from global to LDS memory. The pipeline is as follows:

* Only convert `linalg.copy` ops emitted by `GPUPromoteMatmulOperands`. When it sees fit, that pass attaches a different attribute (`#iree_gpu.use_global_load_dma`) to the `linalg.copy` to tag it along the pipeline.
* A tagged `linalg.copy` is not decomposed/tiled until bufferization.
* After distribution to threads and bufferization, the tagged `linalg.copy` is lowered to a sequence of code built around the subgroup-coalesced load op `iree_gpu.global_load_dma`.
* `iree_gpu.global_load_dma` is mapped to the `amdgpu.gather_to_lds` op, which in turn maps to the corresponding ROCDL op.
* The pass that pads workgroup allocations to reduce bank conflicts is disabled, because the destination workgroup memory has to be contiguous.

## Lowering `linalg.copy`

After bufferization and distribution to threads, the tagged `linalg.copy` still exists in the IR:

```mlir
linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
  ins(%subview_12 : memref<64x128xi8, strided<[256, 1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>)
  outs(%alloc_4 : memref<64x128xi8, #gpu.address_space<workgroup>>)
```

Note that this `linalg.copy` is kept in the thread-level code. The op itself is then converted into a `for` loop in which each subgroup of threads loads a coalesced chunk of values. For example, assume there are N subgroups loading from `tensor<a x b x c>`:

* The `i`-th subgroup loads a subtensor of size `[a/N, b, c]`, so each slice is consecutive.
  * At this moment, assume row-major layout and only tile the outermost dim.
  * The reason we currently only deal with `linalg.copy` ops emitted by `GPUPromoteMatmulOperands` is that we know their destination is allocated contiguously.
  * TODO: expand to arbitrary memref slices.
* Given `gpu.subgroup_id` and `gpu.lane_id`, each thread calculates the consecutive data chunk that its subgroup is responsible for loading:
  * The chunk indices are the delinearized indices of the input tensor, ranging from
    * `affine.delinearize_index[gpu.subgroup_id * (num_elems_of(tensor) / num_subgroups)]` to
    * `affine.delinearize_index[(gpu.subgroup_id + 1) * (num_elems_of(tensor) / num_subgroups) - 1]`.
  * Assume each subgroup loads `n` values starting at linearized index `N_f`; then the thread with lane id `i` (0-based) loads, for `iter = 0` to `n / subgroup_size - 1`, the element at linearized index `N_f + subgroup_size * iter + i` (see the worked numbers right after this list).
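As a quick sanity check of the index formula, using the numbers from the example below: a `64x128xi8` copy has `64 * 128 = 8192` elements; a 256-thread workgroup with `subgroup_size = 64` has 4 subgroups, so each subgroup covers `8192 / 4 = 2048` consecutive elements and each of its 64 lanes performs `2048 / 64 = 32` iterations. In iteration `iter`, lane `i` of subgroup `s` then reads linearized index `s * 2048 + iter * 64 + i`, which is exactly the affine map that appears in the generated loop.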
Then it is converted to something like the following (in this example, assume `workgroup size = 256`, `subgroup_size = 64`, loading `64x128xi8`):

```mlir
scf.for %indvar = %c0 to %c32 step %c1 {
  // thread-specific gathering address from the global address
  %17 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 2048 + s2 * 64)>()[%lane_id, %subgroup_id, %indvar]
  %18:2 = affine.delinearize_index %17 into (128, 64) : index, index
  // this iteration's base storing index
  %19 = affine.apply affine_map<()[s0, s1] -> (s0 * 2048 + s1 * 64)>()[%subgroup_id, %indvar]
  %20:2 = affine.delinearize_index %19 into (128, 64) : index, index
  iree_gpu.global_load_dma %subview_13[%18#0, %18#1] -> %alloc_5[%20#0, %20#1] : memref<128x64xi8, strided<[256, 1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>> -> memref<128x64xi8, #gpu.address_space<workgroup>>
}
// if there are residual elements (subgroup_copy_region_size % subgroup_size != 0), copy residual elements here
gpu.barrier
```

## Dependent PRs

* Design doc: https://hackmd.io/N0RitxPzT9GPhM0jEPtOCg?view
* Upstream changes required:
  * llvm/llvm-project#133498
  * llvm/llvm-project#136405
  * llvm/llvm-project#137671
  * llvm/llvm-project#137425
* #20800 (review)

---------

Signed-off-by: Alan Li <[email protected]>
1 parent bb906c1 · commit 5612a26

26 files changed: +672, -10 lines

compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel

Lines changed: 2 additions & 0 deletions
```diff
@@ -74,6 +74,7 @@ iree_compiler_cc_library(
         "GPUGeneralizeNamedOps.cpp",
         "GPUGreedilyDistributeToThreads.cpp",
         "GPUInferMemorySpace.cpp",
+        "GPULowerToGlobalLoads.cpp",
         "GPULowerToUKernels.cpp",
         "GPUMultiBuffering.cpp",
         "GPUNestedLayoutDistributionPatterns.cpp",
@@ -146,6 +147,7 @@ iree_compiler_cc_library(
         "@llvm-project//mlir:LoopLikeInterface",
         "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:MemRefTransforms",
+        "@llvm-project//mlir:MemRefUtils",
         "@llvm-project//mlir:NVGPUDialect",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Rewrite",
```

compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
```diff
@@ -67,6 +67,7 @@ iree_cc_library(
     "GPUGeneralizeNamedOps.cpp"
     "GPUGreedilyDistributeToThreads.cpp"
     "GPUInferMemorySpace.cpp"
+    "GPULowerToGlobalLoads.cpp"
     "GPULowerToUKernels.cpp"
     "GPUMultiBuffering.cpp"
     "GPUNestedLayoutDistributionPatterns.cpp"
@@ -116,6 +117,7 @@ iree_cc_library(
     MLIRLoopLikeInterface
     MLIRMemRefDialect
     MLIRMemRefTransforms
+    MLIRMemRefUtils
     MLIRNVGPUDialect
     MLIRPass
     MLIRRewrite
```

compiler/src/iree/compiler/Codegen/Common/GPU/GPUGreedilyDistributeToThreads.cpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
 #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h"
@@ -135,6 +136,10 @@ static void processRegion(RewriterBase &rewriter, Region *region) {
 
     // If an op implements the tiling interface, try to greedily tile + fuse.
     if (auto tilableOp = dyn_cast<TilingInterface>(op)) {
+      // Do not distribute to threads if an op wants to use DMA.
+      if (auto useDMAConfig =
+              getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(op))
+        continue;
      tileToThreads(rewriter, tilableOp);
      continue;
    }
```

compiler/src/iree/compiler/Codegen/Common/GPU/GPUInferMemorySpace.cpp

Lines changed: 6 additions & 0 deletions
```diff
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "llvm/ADT/STLExtras.h"
@@ -38,6 +39,11 @@ bool isDefinitelyShared(bufferization::AllocTensorOp alloc) {
   // thread distributed `scf.forall` op. All other shared allocations are
   // expected to be properly indicated in advance.
   for (auto user : alloc->getUsers()) {
+    if (isa<linalg::CopyOp>(user) &&
+        getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(user)) {
+      continue;
+    }
+
    auto forallOp = dyn_cast<scf::ForallOp>(user);
    if (!forallOp ||
        !forallOpHasMappingType<gpu::GPUThreadMappingAttr,
```

compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToGlobalLoads.cpp

Lines changed: 225 additions & 0 deletions (new file)

```cpp
// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <cstdint>
#include <numeric>
#include <optional>
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#define DEBUG_TYPE "iree-codegen-gpu-lower-to-global-loads"
#define LDBG(X) LLVM_DEBUG(llvm::dbgs() << X << "\n")

namespace mlir::iree_compiler {

#define GEN_PASS_DEF_GPULOWERTOGLOBALLOADSPASS
#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

static constexpr int kNumBitsPerCopy = 32;

static LogicalResult
distributeLinalgCopyToThreads(RewriterBase &rewriter, linalg::CopyOp copy,
                              ArrayRef<int64_t> workgroupSize,
                              int64_t subgroupSize) {
  LDBG("==== distributing op: ");
  LDBG(*copy);
  Location loc = copy.getLoc();

  // The linalg.copy we are dealing with represents a region we need to copy to
  // workgroup memory. Assume there are N threads in the workgroup, then there
  // are `num_subgroups = N / gpu.subgroup_size` subgroups in the workgroup.
  //
  // So we are slicing up the target memref into `num_subgroups` consecutive
  // slices, and threads in the same subgroup will copy their slice to workgroup
  // memory slice.

  // Get the copy size:
  auto copyMemRefType = cast<MemRefType>(copy.getOperand(1).getType());
  if (!memref::isStaticShapeAndContiguousRowMajor(copyMemRefType)) {
    return rewriter.notifyMatchFailure(copy,
                                       "Copy to non-static or non-contiguous, "
                                       "non-row major memref.");
  }
  int64_t rank = copyMemRefType.getRank();
  SmallVector<OpFoldResult> tileSize(rank - 1, rewriter.getIndexAttr(1));

  int64_t elementBitWidth = copyMemRefType.getElementTypeBitWidth();
  if (kNumBitsPerCopy % elementBitWidth != 0) {
    return rewriter.notifyMatchFailure(copy, "Copy size is not a multiple of "
                                             "element bit width.");
  }
  int64_t elementsPerCopy = kNumBitsPerCopy / elementBitWidth;

  // Divide the copy by subgroup, and load linearly.
  assert(workgroupSize[0] % subgroupSize == 0);

  int64_t numSubgroups = workgroupSize[0] / subgroupSize;
  int64_t totalCopySize = copyMemRefType.getNumElements();
  int64_t totalCopySizePerSubgroup = totalCopySize / numSubgroups;
  int64_t numCopiesPerThread =
      (totalCopySizePerSubgroup / elementsPerCopy) / subgroupSize;
  int64_t residualElements =
      totalCopySizePerSubgroup % (subgroupSize * elementsPerCopy);

  LDBG("-- elementsPerCopy: " << elementsPerCopy);
  LDBG("-- workgroupSize: " << workgroupSize[0]);
  LDBG("-- numSubgroups: " << numSubgroups);
  LDBG("-- totalCopySize: " << totalCopySize);
  LDBG("-- totalCopySizePerSubgroup: " << totalCopySizePerSubgroup);
  LDBG("-- numCopiesPerThread: " << numCopiesPerThread);
  LDBG("-- residualElements: " << residualElements);

  if (residualElements != 0) {
    return rewriter.notifyMatchFailure(
        copy, "Cannot proceed: cannot handle copying residual elements.");
  }

  Value subgroupId = rewriter.create<gpu::SubgroupIdOp>(loc, nullptr);
  Value laneId = rewriter.create<gpu::LaneIdOp>(loc, nullptr);

  auto sourceType = cast<MemRefType>(copy.getOperand(0).getType());
  auto localType = cast<MemRefType>(copy.getOutputs().front().getType());

  auto getGlobalGatherIndex = [&](Value sgIdVal, Value lIdVal,
                                  Value indVar) -> Value {
    auto zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
    return rewriter.create<affine::AffineLinearizeIndexOp>(
        loc, ValueRange{sgIdVal, indVar, lIdVal, zero},
        ArrayRef<int64_t>{numSubgroups, numCopiesPerThread, subgroupSize,
                          elementsPerCopy},
        /*disjoint=*/true);
  };

  auto getSubgroupStoreBaseIndex = [&](Value sgIdVal, Value indVar) -> Value {
    auto zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
    return getGlobalGatherIndex(sgIdVal, zero, indVar);
  };

  // Build a for loop skeleton:
  scf::ForOp forOp = rewriter.create<scf::ForOp>(
      loc, /*lb=*/rewriter.create<arith::ConstantIndexOp>(loc, 0),
      /*ub=*/rewriter.create<arith::ConstantIndexOp>(loc, numCopiesPerThread),
      /*steps=*/rewriter.create<arith::ConstantIndexOp>(loc, 1));

  auto delinearizeIndex = [&](Value index, ArrayRef<int64_t> shape) {
    return rewriter.create<affine::AffineDelinearizeIndexOp>(loc, index, shape)
        .getMultiIndex();
  };

  // For loop body:
  {
    OpBuilder::InsertionGuard guard(rewriter);
    rewriter.setInsertionPointToStart(forOp.getBody());
    auto inductionVar = forOp.getInductionVar();
    Value linearizedGatherIndices =
        getGlobalGatherIndex(subgroupId, laneId, inductionVar);
    ValueRange delinearizedGlobalIndices =
        delinearizeIndex(linearizedGatherIndices, sourceType.getShape());
    Value linearizedBaseIndices =
        getSubgroupStoreBaseIndex(subgroupId, inductionVar);
    ValueRange delinearizedLocalIndices =
        delinearizeIndex(linearizedBaseIndices, localType.getShape());
    rewriter.create<IREE::GPU::GlobalLoadDMAOp>(
        loc, copy.getOperand(0), delinearizedGlobalIndices,
        copy.getOutputs()[0], delinearizedLocalIndices);
  }

  // Sync at the end of the loop across threads.
  rewriter.replaceOpWithNewOp<gpu::BarrierOp>(copy);
  return success();
}

static LogicalResult isEligibleForGlobalDMA(linalg::CopyOp copy) {
  // Source must be global address and target must be workgroup address.
  auto sourceType = cast<MemRefType>(copy.getOperand(0).getType());
  auto targetType = cast<MemRefType>(copy.getOutputs().front().getType());

  if (!getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(copy)) {
    LDBG("-- Op: " << *copy);
    LDBG("-- does not have `use_global_load_dma` attribute, skipping.");
    return failure();
  }

  if (!hasGlobalMemoryAddressSpace(sourceType) ||
      !hasSharedMemoryAddressSpace(targetType)) {
    LDBG("-- Op: " << *copy);
    LDBG("-- incompatible source or target memory address space.");
    return failure();
  }

  // TODO: check that the copy's target memref is not a subview: a subview
  // cannot guarantee contiguity of dest memory region.
  return success();
}

struct LowerToDMAPattern : public OpRewritePattern<linalg::CopyOp> {
  LowerToDMAPattern(MLIRContext *context, ArrayRef<int64_t> workgroupSize,
                    int64_t subgroupSize)
      : OpRewritePattern<linalg::CopyOp>(context), workgroupSize(workgroupSize),
        subgroupSize(subgroupSize) {}

  LogicalResult matchAndRewrite(linalg::CopyOp copy,
                                PatternRewriter &rewriter) const override {
    if (failed(isEligibleForGlobalDMA(copy))) {
      return failure();
    }
    return distributeLinalgCopyToThreads(rewriter, copy, workgroupSize,
                                         subgroupSize);
  }

private:
  ArrayRef<int64_t> workgroupSize;
  int64_t subgroupSize;
};

namespace {
struct GPULowerToGlobalLoadsPass final
    : impl::GPULowerToGlobalLoadsPassBase<GPULowerToGlobalLoadsPass> {

  void runOnOperation() override {
    MLIRContext *context = &getContext();
    auto funcOp = getOperation();

    std::optional<SmallVector<int64_t>> workgroupSize =
        mlir::iree_compiler::getWorkgroupSize(funcOp);
    if (!workgroupSize) {
      funcOp.emitOpError(
          "unimplemented: Distribution with dynamic workgroup size.");
      return signalPassFailure();
    }
    auto subgroupSize = mlir::iree_compiler::getSubgroupSize(funcOp);
    if (!subgroupSize) {
      funcOp.emitOpError(
          "unimplemented: Distribution with dynamic subgroup size.");
      return signalPassFailure();
    }

    RewritePatternSet patterns(context);
    patterns.add<LowerToDMAPattern>(context, *workgroupSize, *subgroupSize);
    (void)applyPatternsGreedily(funcOp, std::move(patterns));
  }
};
} // namespace
} // namespace mlir::iree_compiler
```
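For orientation, the loop-bound arithmetic in `distributeLinalgCopyToThreads` works out as follows for the `64x128xi8` copy with a 256-thread workgroup and subgroup size 64 used in the description: `elementsPerCopy = 32 / 8 = 4`, `numSubgroups = 256 / 64 = 4`, `totalCopySizePerSubgroup = 8192 / 4 = 2048`, `numCopiesPerThread = (2048 / 4) / 64 = 8`, `residualElements = 0`. In other words, each lane moves one 32-bit packet (four `i8` values) per iteration, so the loop runs 8 times rather than the 32 single-element iterations shown in the hand-written example in the description, and copies with non-zero `residualElements` are currently rejected by the pattern rather than handled with a tail copy. Below is a rough sketch of the resulting loop, written by hand in the style of the earlier example (SSA names are illustrative, and the index arithmetic is shown as `affine.apply` for readability even though the pass emits `affine.linearize_index` ops):

```mlir
// Hand-written sketch, not compiler output: 64x128xi8 copy,
// workgroup size 256, subgroup size 64 => 8 iterations of 4xi8 per lane.
scf.for %indvar = %c0 to %c8 step %c1 {
  // Per-lane gather index: lane_id * 4 + subgroup_id * 2048 + indvar * 256.
  %src_lin = affine.apply affine_map<()[s0, s1, s2] -> (s0 * 4 + s1 * 2048 + s2 * 256)>()[%lane_id, %subgroup_id, %indvar]
  %src:2 = affine.delinearize_index %src_lin into (64, 128) : index, index
  // Subgroup-uniform store base: subgroup_id * 2048 + indvar * 256.
  %dst_lin = affine.apply affine_map<()[s0, s1] -> (s0 * 2048 + s1 * 256)>()[%subgroup_id, %indvar]
  %dst:2 = affine.delinearize_index %dst_lin into (64, 128) : index, index
  iree_gpu.global_load_dma %source[%src#0, %src#1] -> %dest[%dst#0, %dst#1]
      : memref<64x128xi8, strided<[256, 1], offset: ?>, #amdgpu.address_space<fat_raw_buffer>>
        -> memref<64x128xi8, #gpu.address_space<workgroup>>
}
gpu.barrier
```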

compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp

Lines changed: 24 additions & 8 deletions
```diff
@@ -29,14 +29,22 @@ namespace mlir::iree_compiler {
 
 namespace {
 /// Helper to insert copy with derived thread config.
-Value promoteValue(OpBuilder &builder, Location loc, Value v) {
+Value promoteValue(OpBuilder &builder, Location loc, Value v,
+                   bool useDirectLoad) {
   auto tensorType = cast<RankedTensorType>(v.getType());
   SmallVector<OpFoldResult> mixedSizes = tensor::getMixedSizes(builder, loc, v);
+
   Value empty = builder.create<tensor::EmptyOp>(loc, mixedSizes,
                                                 tensorType.getElementType());
   auto copy = builder.create<linalg::CopyOp>(loc, v, empty);
-  setLoweringConfig(
-      copy, IREE::GPU::DerivedThreadConfigAttr::get(builder.getContext()));
+
+  if (useDirectLoad) {
+    setLoweringConfig(
+        copy, IREE::GPU::UseGlobalLoadDMAAttr::get(builder.getContext()));
+  } else {
+    setLoweringConfig(
+        copy, IREE::GPU::DerivedThreadConfigAttr::get(builder.getContext()));
+  }
   return copy.getResult(0);
 }
 
@@ -95,7 +103,8 @@ void promoteResult(OpBuilder &builder, Operation *op, Value valToMakeShared) {
   }
 
   rewriter.setInsertionPointAfterValue(replacement);
-  replacement = promoteValue(rewriter, loc, replacement);
+  replacement =
+      promoteValue(rewriter, loc, replacement, /*useDirectLoad=*/false);
   valueToReplace.replaceUsesWithIf(replacement, [&](OpOperand &use) {
     return opsToReplaceUseIn.contains(use.getOwner());
   });
@@ -110,7 +119,7 @@ void promoteResult(OpBuilder &builder, Operation *op, Value valToMakeShared) {
 ///
 /// %empty = tensor.empty()
 /// %copy = linalg.copy %1 to %empty {
-///   lowering_config = #iree_gpu.derived_thread_config}
+///   lowering_config = #iree_gpu.{derived_thread_config|use_global_dma}}
 /// linalg.matmul ins(%0, %copy)
 ///
 /// If the producer is already a tilable op, the producer is just annotated with
@@ -122,7 +131,8 @@ void promoteResult(OpBuilder &builder, Operation *op, Value valToMakeShared) {
 /// %copy1 = linalg.copy %2 to %out_buffer
 /// %copy2 = linalg.copy %copy1 to %empty {
 ///   lowering_config = #iree_gpu.derived_thread_config}
-void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) {
+void promoteOperand(OpBuilder &builder, Operation *op, unsigned index,
+                    bool useDirectLoad) {
   auto dpsOp = dyn_cast<DestinationStyleOpInterface>(op);
   if (!dpsOp)
     return;
@@ -162,12 +172,15 @@ void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) {
     return;
   }
 
-  auto replacement = promoteValue(builder, op->getLoc(), operand);
+  auto replacement =
+      promoteValue(builder, op->getLoc(), operand, useDirectLoad);
   op->setOperand(index, replacement);
 }
 
 struct GPUPromoteMatmulOperandsPass final
     : impl::GPUPromoteMatmulOperandsPassBase<GPUPromoteMatmulOperandsPass> {
+  using GPUPromoteMatmulOperandsPassBase::GPUPromoteMatmulOperandsPassBase;
+
   void runOnOperation() override {
     FunctionOpInterface funcOp = getOperation();
 
@@ -187,7 +200,10 @@ struct GPUPromoteMatmulOperandsPass final
 
       builder.setInsertionPoint(op);
       for (auto operand : promotedOperands.value()) {
-        promoteOperand(builder, op, operand);
+        // TODO: move switch `useDirectLoad` to the promotion attr list.
+        // Here using a command line option should be only a temporary
+        // solution.
+        promoteOperand(builder, op, operand, useDirectLoad);
       }
     });
   }
```
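When `useDirectLoad` is set, the only difference from the existing promotion path is the lowering config attached to the inserted copy. A minimal sketch of the promoted-operand IR under that flag, following the doc comment above (shapes and value names are illustrative):

```mlir
%empty = tensor.empty() : tensor<64x128xi8>
%copy = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
          ins(%operand : tensor<64x128xi8>)
          outs(%empty : tensor<64x128xi8>) -> tensor<64x128xi8>
// %copy replaces the original matmul operand; the tagged copy is then left
// untouched until GPULowerToGlobalLoads lowers it after bufferization.
```

The `DerivedThreadConfigAttr` branch is unchanged, so pipelines that do not opt into DMA keep their current behavior.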

compiler/src/iree/compiler/Codegen/Common/GPU/GPUVerifyDistribution.cpp

Lines changed: 6 additions & 0 deletions
```diff
@@ -76,6 +76,12 @@ struct GPUVerifyDistributionPass final
         continue;
       }
 
+      // Allow DMA copies.
+      if (isa<linalg::CopyOp>(op) &&
+          getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(op)) {
+        continue;
+      }
+
       op->emitOpError(
           "write affecting operations on shared resources are restricted "
           "to lane or thread distributed contexts.");
```
