Commit cb55700

Update
1 parent 0917afe commit cb55700

3 files changed: +46 -3 lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir

Lines changed: 32 additions & 0 deletions
@@ -392,3 +392,35 @@ func.func @swizzle_operand_no_promote_fill(%b: tensor<128x128xf32>) -> tensor<4x
 // CHECK-NOT: tensor.expand_shape
 // CHECK: linalg.matmul
 // CHECK: return
+
+// -----
+
+// Verify that when use_global_load_dma is requested but the input comes from
+// tensor.pad, it falls back to derived_thread_config since the padded data
+// is not from global memory. Non-padded inputs should still use DMA.
+
+#lowering_config_dma_with_pad = #iree_gpu.lowering_config<{
+  promote_operands = [0, 1],
+  promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma]}>
+
+func.func @no_dma_for_padded_input(%a : tensor<4x127xf32>, %b: tensor<128x128xf32>) -> tensor<4x128xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty() : tensor<4x128xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<4x128xf32>) -> tensor<4x128xf32>
+  %padded = tensor.pad %a low[0, 0] high[0, 1] {
+  ^bb0(%arg0: index, %arg1: index):
+    tensor.yield %cst : f32
+  } : tensor<4x127xf32> to tensor<4x128xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config_dma_with_pad}
+      ins(%padded, %b : tensor<4x128xf32>, tensor<128x128xf32>) outs(%fill : tensor<4x128xf32>) -> tensor<4x128xf32>
+  return %mm : tensor<4x128xf32>
+}
+
+// Padded input falls back to derived_thread_config, non-padded uses DMA.
+// CHECK-LABEL: func.func @no_dma_for_padded_input
+// CHECK: tensor.pad
+// CHECK: linalg.copy
+// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
+// CHECK: linalg.copy
+// CHECK-SAME: lowering_config = #iree_gpu.use_global_load_dma
+// CHECK: linalg.matmul
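
For reference, a rough, hand-written sketch of the IR shape that the CHECK lines above describe. This is not actual pass output: the tensor.empty destinations and the value names (%lhs_dest, %rhs_dest, %lhs, %rhs) are illustrative assumptions; only the pairing of each linalg.copy with its lowering_config is taken from the CHECK lines.

func.func @promoted_ir_sketch(%a: tensor<4x127xf32>, %b: tensor<128x128xf32>) -> tensor<4x128xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %empty = tensor.empty() : tensor<4x128xf32>
  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<4x128xf32>) -> tensor<4x128xf32>
  %padded = tensor.pad %a low[0, 0] high[0, 1] {
  ^bb0(%i: index, %j: index):
    tensor.yield %cst : f32
  } : tensor<4x127xf32> to tensor<4x128xf32>
  // Padded operand: the promotion copy falls back to derived_thread_config.
  %lhs_dest = tensor.empty() : tensor<4x128xf32>
  %lhs = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
      ins(%padded : tensor<4x128xf32>) outs(%lhs_dest : tensor<4x128xf32>) -> tensor<4x128xf32>
  // Non-padded operand: the promotion copy keeps the global-load-DMA config.
  %rhs_dest = tensor.empty() : tensor<128x128xf32>
  %rhs = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
      ins(%b : tensor<128x128xf32>) outs(%rhs_dest : tensor<128x128xf32>) -> tensor<128x128xf32>
  %mm = linalg.matmul ins(%lhs, %rhs : tensor<4x128xf32>, tensor<128x128xf32>)
      outs(%fill : tensor<4x128xf32>) -> tensor<4x128xf32>
  return %mm : tensor<4x128xf32>
}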

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/PromotionImpls.cpp

Lines changed: 12 additions & 1 deletion
@@ -9,6 +9,7 @@
 
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtInterfaces.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
@@ -124,8 +125,18 @@ Value defaultPromotionImpl(OpBuilder &builder, OpOperand &operand,
   if (promotedValue.has_value()) {
     return promotedValue.value();
   }
+
+  // Global load DMA requires the source to come from global memory. If the
+  // source comes from tensor.pad, the data is not in global memory, so fall
+  // back to derived thread config.
+  Attribute effectiveAttr = attr;
+  if (isa<UseGlobalLoadDMAAttr>(attr) &&
+      operand.get().getDefiningOp<tensor::PadOp>()) {
+    effectiveAttr = DerivedThreadConfigAttr::get(builder.getContext());
+  }
+
   return promoteValue(builder, operand.getOwner()->getLoc(), operand.get(),
-                      attr);
+                      effectiveAttr);
 }
 
 /// Inserts a `linalg.copy` directly before the given operation on the

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 2 additions & 2 deletions
@@ -924,8 +924,8 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
   }
   // Do not use direct load DMA when padding is needed, as the source will
   // go through tensor.pad and won't be directly from global memory.
-  ArrayRef<Attribute> promotionTypes =
-      (useDirectLoad && !couldNeedPadding) ? ArrayRef<Attribute>(promotionArray)
+  ArrayRef<Attribute> promotionTypes = (useDirectLoad && !couldNeedPadding)
+                                           ? ArrayRef<Attribute>(promotionArray)
                                            : ArrayRef<Attribute>{};
   GPU::appendPromotedOperandsList(context, attrs, promotionList,
                                   promotionTypes);
