
Commit 344d4bd (parent 7956470)

Use another approach to work.

4 files changed: +66 -52 lines

compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp
Lines changed: 55 additions & 27 deletions

@@ -59,17 +59,38 @@ static SmallVector<Attribute> getThreadMapping(MLIRContext *ctx) {
   return mapping;
 }
 
+/// Check if the source of an operation comes directly from global memory.
+/// Returns false if the source goes through tensor.pad or other local
+/// computation that would prevent using global load DMA.
+static bool sourceIsFromGlobalMemory(Operation *op) {
+  Value source;
+  if (auto copyOp = dyn_cast<linalg::CopyOp>(op)) {
+    source = copyOp.getInputs()[0];
+  } else if (auto gatherOp = dyn_cast<IREE::LinalgExt::GatherOp>(op)) {
+    source = gatherOp.getSource();
+  } else {
+    return false;
+  }
+
+  // Trace through extract_slice operations to find the origin.
+  while (auto extractOp = source.getDefiningOp<tensor::ExtractSliceOp>()) {
+    source = extractOp.getSource();
+  }
+
+  // If the source comes from tensor.pad, it's not directly from global memory.
+  if (source.getDefiningOp<tensor::PadOp>()) {
+    return false;
+  }
+
+  // Otherwise, assume it's from global memory (e.g., dispatch tensor load).
+  return true;
+}
+
 /// Helper to compute thread number of threads based on translation_info.
 /// Uses the subgroup_size from translation_info for thread-level tiling.
 static SmallVector<OpFoldResult>
 computeThreadNumThreadsImpl(OpBuilder &builder, Operation *op,
                             RankedTensorType outputType) {
-  // Check that this operation has the use_global_load_dma config.
-  auto dmaConfig = getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(op);
-  if (!dmaConfig) {
-    return {};
-  }
-
   // Get the function containing this operation.
   auto funcOp = op->getParentOfType<FunctionOpInterface>();
   if (!funcOp) {

@@ -341,6 +362,11 @@ struct ConvertToCoalescedDMABase : public OpRewritePattern<OpTy> {
       return failure();
     }
 
+    // Check that source comes from global memory (not tensor.pad).
+    if (!sourceIsFromGlobalMemory(op)) {
+      return failure();
+    }
+
     SmallVector<OpFoldResult> threadNumThreads =
         computeThreadNumThreads(rewriter, op);
     if (threadNumThreads.empty()) {

@@ -386,11 +412,8 @@ struct ConvertGatherToCoalescedDMA
       return failure();
     }
 
-    // For gather ops, tile only the innermost dimension to distribute across
-    // threads.
-    auto dmaConfig =
-        getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(gatherOp);
-    if (!dmaConfig) {
+    // Check that source comes from global memory (not tensor.pad).
+    if (!sourceIsFromGlobalMemory(gatherOp)) {
       return failure();
     }

@@ -400,6 +423,12 @@ struct ConvertGatherToCoalescedDMA
       return failure();
     }
 
+    // Check target supports global load DMA.
+    IREE::GPU::TargetAttr target = getGPUTargetAttr(funcOp);
+    if (!target || !IREE::GPU::targetSupportsGlobalLoadDMA(target)) {
+      return failure();
+    }
+
     // Get subgroup size from translation_info.
     std::optional<int64_t> subgroupSize = getSubgroupSize(funcOp);
     if (!subgroupSize) {

@@ -418,11 +447,6 @@ struct ConvertGatherToCoalescedDMA
     Type elementType = outputType.getElementType();
     int64_t elementBits = elementType.getIntOrFloatBitWidth();
 
-    IREE::GPU::TargetAttr target = getGPUTargetAttr(funcOp);
-    if (!target) {
-      return failure();
-    }
-
     ArrayRef<int64_t> dmaSizes;
     if (DenseI64ArrayAttr dmaSizesAttr = target.getWgp().getDmaSizes()) {
       dmaSizes = dmaSizesAttr.asArrayRef();

@@ -617,10 +641,6 @@ struct GPUConvertToCoalescedDMAPass final
   FailureOr<scf::SCFTilingResult> tileAtSubgroupLevel(IRRewriter &rewriter,
                                                       OpTy op) {
     MLIRContext *context = &getContext();
-    auto dmaConfig = getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(op);
-    if (!dmaConfig) {
-      return failure();
-    }
 
     // Get the function containing this operation.
     auto funcOp = op->template getParentOfType<FunctionOpInterface>();

@@ -718,18 +738,26 @@ struct GPUConvertToCoalescedDMAPass final
 
   LogicalResult applySubgroupTiling(FunctionOpInterface funcOp) {
     MLIRContext *context = &getContext();
+
+    // Check if target supports global load DMA.
+    IREE::GPU::TargetAttr target = getGPUTargetAttr(funcOp);
+    if (!target || !IREE::GPU::targetSupportsGlobalLoadDMA(target)) {
+      return success();
+    }
+
     SmallVector<Operation *> opsToTile;
 
-    // Collect all ops with iree_gpu.use_global_load_dma lowering config.
+    // Collect copy/gather ops that are eligible for coalesced DMA.
     // Skip ops that are already inside a warp-mapped forall.
     funcOp->walk([&](Operation *op) {
      if (isa<linalg::CopyOp, IREE::LinalgExt::GatherOp>(op)) {
-        auto config = getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(op);
-        if (config) {
-          auto parentForall = op->getParentOfType<scf::ForallOp>();
-          if (!hasWarpMapping(parentForall)) {
-            opsToTile.push_back(op);
-          }
+        // Check that source comes from global memory (not tensor.pad).
+        if (!sourceIsFromGlobalMemory(op)) {
+          return;
+        }
+        auto parentForall = op->getParentOfType<scf::ForallOp>();
+        if (!hasWarpMapping(parentForall)) {
+          opsToTile.push_back(op);
        }
      }
    });
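The new sourceIsFromGlobalMemory helper decides DMA eligibility by peeling tensor.extract_slice producers off the copy/gather source and rejecting chains rooted in tensor.pad. Here is a rough MLIR sketch of the two cases (values, shapes, and the producer of %global are hypothetical and not taken from this commit; %global stands for a value loaded straight from global memory, e.g. a dispatch tensor load result):

  %cst = arith.constant 0.0 : f16

  // Accepted: the extract_slice chain bottoms out at global memory.
  %slice = tensor.extract_slice %global[0, 0] [64, 64] [1, 1]
      : tensor<128x128xf16> to tensor<64x64xf16>
  %copy = linalg.copy ins(%slice : tensor<64x64xf16>)
      outs(%init : tensor<64x64xf16>) -> tensor<64x64xf16>
  // sourceIsFromGlobalMemory(copy) == true

  // Rejected: the chain reaches a tensor.pad, so the data is produced
  // locally and cannot be fetched with a global load DMA.
  %padded = tensor.pad %global low[0, 0] high[2, 2] {
  ^bb0(%i: index, %j: index):
    tensor.yield %cst : f16
  } : tensor<128x128xf16> to tensor<130x130xf16>
  %pslice = tensor.extract_slice %padded[0, 0] [64, 64] [1, 1]
      : tensor<130x130xf16> to tensor<64x64xf16>
  %pcopy = linalg.copy ins(%pslice : tensor<64x64xf16>)
      outs(%init : tensor<64x64xf16>) -> tensor<64x64xf16>
  // sourceIsFromGlobalMemory(pcopy) == false

Note that only tensor.extract_slice is traced through; any other origin falls into the final "assume global" branch, so the check is permissive by design.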

compiler/src/iree/compiler/Codegen/Common/GPU/GPUReduceBankConflicts.cpp
Lines changed: 6 additions & 6 deletions

@@ -5,9 +5,9 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
-#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"

@@ -166,16 +166,16 @@ struct GPUReduceBankConflictsPass final
   void runOnOperation() override {
     FunctionOpInterface funcOp = getOperation();
 
-    // Skip bank conflict reduction if coalesced DMA ops are present.
+    // Skip bank conflict reduction if gather_to_lds DMA ops are present.
     // DMA operations have their own optimized memory access patterns that
     // write directly to LDS with hardware-controlled coalescing. Padding
     // shared memory would interfere with the expected DMA memory layout.
-    bool hasCoalescedDMA = false;
-    funcOp.walk([&](IREE::GPU::CoalescedGatherDMAOp) {
-      hasCoalescedDMA = true;
+    bool hasGatherToLDS = false;
+    funcOp.walk([&](amdgpu::GatherToLDSOp) {
+      hasGatherToLDS = true;
       return WalkResult::interrupt();
     });
-    if (hasCoalescedDMA) {
+    if (hasGatherToLDS) {
       return;
     }
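The detection now keys on the upstream amdgpu.gather_to_lds op rather than IREE's CoalescedGatherDMAOp. A rough sketch of the op the walk matches (indices, shapes, and transfer width are hypothetical; assembly syntax approximate):

  %c0 = arith.constant 0 : index
  // Transfers data from the global %src memref directly into the LDS
  // (workgroup) allocation %lds, bypassing registers.
  amdgpu.gather_to_lds %src[%c0, %c0], %lds[%c0, %c0]
      : vector<8xf16>, memref<64x64xf16>,
        memref<64x64xf16, #gpu.address_space<workgroup>>

Because the hardware DMA writes to LDS at addresses computed from the unpadded allocation, padding that allocation to reduce bank conflicts would break the expected layout, hence the early return.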

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
Lines changed: 4 additions & 18 deletions

@@ -940,19 +940,11 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
       {"subgroup", b.getI64ArrayAttr(subgroupTileSizes)},
       {"mma_kind", kind}};
 
-  // Use global load DMA attribute (subgroup sizes will be derived from
-  // translation_info) only on gfx950+.
-  SmallVector<Attribute> promotionArray;
-  if (targetSupportsGlobalLoadDMA(target)) {
-    Attribute useGlobalDma = IREE::GPU::UseGlobalLoadDMAAttr::get(context);
-    promotionArray = {useGlobalDma, useGlobalDma};
-  }
+  // Build promotion list - global load DMA eligibility is determined
+  // dynamically in the GPUConvertToCoalescedDMA pass based on whether
+  // the source comes directly from global memory.
   SmallVector<int64_t> promotionList = {0, 1};
   if (scaled) {
-    // TODO(#22119): We don't use global load DMA for scaled matmuls, because
-    // compilation doesn't support it. Once this is fixed, we should use global
-    // load DMA here when possible.
-    promotionArray = {};
     promotionList.append({2, 3});
   }
   bool cWasPromoted = (!mustBeAligned || couldNeedPadding) && cPromoteIfPadding;

@@ -961,14 +953,8 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
     // and scaled GEMM respectively.
     promotionList.push_back(promotionList.size());
   }
-  // Do not use direct load DMA when padding is needed, as the source will
-  // go through tensor.pad and won't be directly from global memory. Also don't
-  // use DMA types when C is promoted since C is output, not loaded from global.
-  ArrayRef<Attribute> promotionTypes =
-      (couldNeedPadding || cWasPromoted) ? ArrayRef<Attribute>{}
-                                         : ArrayRef<Attribute>(promotionArray);
   GPU::appendPromotedOperandsList(context, attrs, promotionList,
-                                  promotionTypes);
+                                  /*promotionTypes=*/{});
   if (!mustBeAligned || couldNeedPadding) {
     SmallVector<int64_t> paddingTileSizes = workgroupTileSizes;
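Since promotion types are no longer attached at configuration time, the emitted lowering config carries just the operand indices, which is what the gfx950 test below checks (promote_operands = [0, 1]). A hypothetical before/after sketch of the attribute (the promotion_types key spelling is approximate and not verified against this commit):

  // Before: DMA intent baked into the config on gfx950+.
  #iree_gpu.lowering_config<{..., promote_operands = [0, 1],
      promotion_types = [#iree_gpu.use_global_load_dma,
                         #iree_gpu.use_global_load_dma]}>

  // After: indices only; the GPUConvertToCoalescedDMA pass decides DMA
  // eligibility from the IR.
  #iree_gpu.lowering_config<{..., promote_operands = [0, 1]}>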

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir
Lines changed: 1 addition & 1 deletion

@@ -211,7 +211,7 @@ module {
 
 // CHECK-LABEL: func.func @data_tiled_scaled_mma_inner_tiled
 // CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
-// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
+// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}
 // CHECK: iree_codegen.inner_tiled {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 1, 1]
