@@ -630,9 +630,9 @@ static bool checkForElementwiseUsersWithNewOperands(linalg::LinalgOp linalgOp) {
 static FailureOr<std::pair<LoweringConfigAttr, int64_t>>
 getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
     ArrayRef<int64_t> bounds, ArrayRef<AffineMap> maps,
-    ArrayRef<Value> operands, IREE::GPU::TargetAttr target, bool useDirectLoad,
-    bool isGemm, bool scaled, int64_t splitReductionTripCnt,
-    bool CPromoteIfPadding, bool hasExistingAccumulator = false,
+    ArrayRef<Value> operands, IREE::GPU::TargetAttr target, bool isGemm,
+    bool scaled, int64_t splitReductionTripCnt, bool CPromoteIfPadding,
+    bool hasExistingAccumulator = false,
     std::optional<ConvToIgemmInfo> convToIgemmInfo = std::nullopt) {
   if (target.getWgp().getMma().empty()) {
     return failure();
@@ -924,9 +924,9 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
   }
   // Do not use direct load DMA when padding is needed, as the source will
   // go through tensor.pad and won't be directly from global memory.
-  ArrayRef<Attribute> promotionTypes = (useDirectLoad && !couldNeedPadding)
-                                           ? ArrayRef<Attribute>(promotionArray)
-                                           : ArrayRef<Attribute>{};
+  ArrayRef<Attribute> promotionTypes =
+      couldNeedPadding ? ArrayRef<Attribute>{}
+                       : ArrayRef<Attribute>(promotionArray);
   GPU::appendPromotedOperandsList(context, attrs, promotionList,
                                   promotionTypes);
   if (!mustBeAligned || couldNeedPadding) {
@@ -966,9 +966,10 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
   return std::pair{loweringConfig, flatWorkgroupSize};
 }
 
-LogicalResult setIGEMMConvolutionLoweringConfig(
-    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
-    Operation *op, bool useDirectLoad, bool padConv) {
+LogicalResult
+setIGEMMConvolutionLoweringConfig(IREE::GPU::TargetAttr target,
+                                  mlir::FunctionOpInterface entryPoint,
+                                  Operation *op, bool padConv) {
   auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
   if (!linalgOp || !linalg::isaConvolutionOpInterface(linalgOp)) {
     return failure();
@@ -1042,7 +1043,7 @@ LogicalResult setIGEMMConvolutionLoweringConfig(
   FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
       getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
           igemmLoopBounds, igemmContractionMaps, igemmOperands, target,
-          useDirectLoad, /*isGemm=*/false,
+          /*isGemm=*/false,
           /*scaled=*/false, splitReductionTripCnt,
           /*CPromoteIfPadding=*/CPromoteIfPadding, hasExistingAccumulator,
           convToIgemmInfo);
@@ -1055,7 +1056,7 @@ LogicalResult setIGEMMConvolutionLoweringConfig(
   SmallVector<NamedAttribute, 1> pipelineAttrs;
   auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
       linalgOp->getContext(), /*prefetchNumStages=*/2,
-      /*no_reduce_shared_memory_bank_conflicts=*/useDirectLoad,
+      /*no_reduce_shared_memory_bank_conflicts=*/true,
       /*use_igemm_convolution=*/true,
       /*reorder_workgroups_strategy=*/std::nullopt);
   pipelineAttrs.emplace_back(
@@ -1073,7 +1074,7 @@ LogicalResult setIGEMMConvolutionLoweringConfig(
 
 LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
                                       mlir::FunctionOpInterface entryPoint,
-                                      Operation *op, bool useDirectLoad) {
+                                      Operation *op) {
   auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
   if (!linalgOp ||
       (!linalg::isaContractionOpInterface(linalgOp) &&
@@ -1100,18 +1101,15 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
 
   FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
       getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
-          bounds, maps, operands, target, useDirectLoad, /*isGemm=*/true,
+          bounds, maps, operands, target, /*isGemm=*/true,
           /*scaled=*/false, splitReductionTripCnt, CPromoteIfPadding,
           hasExistingAccumulator);
 
   // TODO(muzasyed): add generalization for scaled and nonscaled versions of
   // matmul lowering.
   if (failed(configAndWgSize)) {
-    // TODO(muzasyed): Perform padding appropriately for minimizing bank
-    // conflicts when dealing with scaled matmuls. For now it is disabled.
-    useDirectLoad = true;
     configAndWgSize = getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
-        bounds, maps, operands, target, useDirectLoad, /*isGemm=*/true,
+        bounds, maps, operands, target, /*isGemm=*/true,
         /*scaled=*/true, splitReductionTripCnt, CPromoteIfPadding,
         hasExistingAccumulator);
   }
@@ -1125,7 +1123,7 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
   SmallVector<NamedAttribute, 1> pipelineAttrs;
   auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
       linalgOp->getContext(), /*prefetchNumStages=*/2,
-      /*no_reduce_shared_memory_bank_conflicts=*/useDirectLoad,
+      /*no_reduce_shared_memory_bank_conflicts=*/true,
       /*use_igemm_convolution=*/false,
       /*reorder_workgroups_strategy=*/std::nullopt);
   pipelineAttrs.emplace_back(
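
For context, a minimal call-site sketch under the new signatures; the wrapper function, `funcOp`, and `computeOp` are assumptions for illustration and are not part of this commit. Callers simply drop the `useDirectLoad` argument:

  // Hypothetical dispatch over the two updated entry points; every name
  // except the two set*LoweringConfig functions is illustrative only.
  LogicalResult setGemmLikeConfig(IREE::GPU::TargetAttr target,
                                  mlir::FunctionOpInterface funcOp,
                                  Operation *computeOp, bool padConv) {
    // Matmul path: `useDirectLoad` is gone from the signature.
    if (succeeded(setMatmulLoweringConfig(target, funcOp, computeOp))) {
      return success();
    }
    // IGEMM convolution path: only `padConv` remains as a tuning knob.
    return setIGEMMConvolutionLoweringConfig(target, funcOp, computeOp,
                                             padConv);
  }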