[LDS] Do not use DMA in the presence of tensor.pad

lialan · lialan · commit 0917afe0f07c · 2026-01-21T15:48:36.000-08:00
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -922,8 +922,10 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
     // and scaled GEMM respectively.
     promotionList.push_back(promotionList.size());
   }
-  ArrayRef<Attribute> promotionTypes = useDirectLoad
-                                           ? ArrayRef<Attribute>(promotionArray)
+  // Do not use direct load DMA when padding could be needed: the source would
+  // then go through tensor.pad and not come directly from global memory.
+  ArrayRef<Attribute> promotionTypes =
+      (useDirectLoad && !couldNeedPadding) ? ArrayRef<Attribute>(promotionArray)
                                            : ArrayRef<Attribute>{};
   GPU::appendPromotedOperandsList(context, attrs, promotionList,
                                   promotionTypes);