Skip to content

Commit c18957b

Browse files
authored
[Codegen][GPU] Clean up prefetch pipeline stages flag to support integer values (#23568)
Replaces the boolean `--iree-llvmgpu-enable-prefetch` flag with an integer `--iree-llvmgpu-prefetch-num-stages` flag backed by std::optional<uint64_t>. When unset (default), each code path uses its own heuristic default. 0 or 1 disables pipelining, and 2+ enables pipelining with the specified number of stages. ci-extra: test_torch --------- Signed-off-by: Yu-Zhewen <zhewenyu@amd.com>
1 parent 4ac7074 commit c18957b

26 files changed

+142
-114
lines changed

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ constexpr int64_t kPreferredCopyNumBits = 128;
5252

5353
LogicalResult setDataTiledMmaInnerTiledLoweringConfig(
5454
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
55-
Operation *op, IREE::Codegen::UKernelDescriptorAttr ukernelConfig) {
55+
Operation *op, IREE::Codegen::UKernelDescriptorAttr ukernelConfig,
56+
std::optional<uint64_t> prefetchNumStages) {
5657
auto multiMmaOp = dyn_cast<IREE::Codegen::InnerTiledOp>(op);
5758
if (!multiMmaOp) {
5859
return failure();
@@ -109,11 +110,12 @@ LogicalResult setDataTiledMmaInnerTiledLoweringConfig(
109110
DictionaryAttr configDict = b.getDictionaryAttr(attrs);
110111
auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict);
111112

112-
// Don't add any special padding or prefetching, since the data-tiled layout
113-
// is already what we want.
113+
// By default, don't add any special padding or prefetching, since the
114+
// data-tiled layout is already what we want.
114115
SmallVector<NamedAttribute, 1> pipelineAttrs;
116+
int64_t prefetchStages = prefetchNumStages.value_or(0);
115117
auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
116-
context, /*prefetchNumStages=*/0,
118+
context, /*prefetchNumStages=*/prefetchStages,
117119
/*no_reduce_shared_memory_bank_conflicts=*/true,
118120
/*use_igemm_convolution=*/false,
119121
/*reorder_workgroups_strategy=*/std::nullopt);
@@ -1014,7 +1016,8 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
10141016

10151017
LogicalResult setIGEMMConvolutionLoweringConfig(
10161018
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
1017-
Operation *op, bool useDirectLoad, bool padConv) {
1019+
Operation *op, bool useDirectLoad, bool padConv,
1020+
std::optional<uint64_t> prefetchNumStages) {
10181021
auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
10191022
if (!linalgOp || !linalg::isaConvolutionOpInterface(linalgOp)) {
10201023
return failure();
@@ -1100,9 +1103,11 @@ LogicalResult setIGEMMConvolutionLoweringConfig(
11001103
LoweringConfigAttr loweringConfig = configAndWgSize->first;
11011104

11021105
SmallVector<NamedAttribute, 1> pipelineAttrs;
1106+
// Default to 2 stages if not specified.
1107+
int64_t prefetchStages = prefetchNumStages.value_or(2);
11031108
auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
11041109
linalgOp->getContext(),
1105-
/*prefetchNumStages=*/useDirectLoad ? 0 : 2,
1110+
/*prefetchNumStages=*/prefetchStages,
11061111
/*no_reduce_shared_memory_bank_conflicts=*/useDirectLoad,
11071112
/*use_igemm_convolution=*/true,
11081113
/*reorder_workgroups_strategy=*/std::nullopt);
@@ -1119,9 +1124,11 @@ LogicalResult setIGEMMConvolutionLoweringConfig(
11191124
workgroupSize, targetSubgroupSize, pipelineConfig);
11201125
}
11211126

1122-
LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
1123-
mlir::FunctionOpInterface entryPoint,
1124-
Operation *op, bool useDirectLoad) {
1127+
LogicalResult
1128+
setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
1129+
mlir::FunctionOpInterface entryPoint, Operation *op,
1130+
bool useDirectLoad,
1131+
std::optional<uint64_t> prefetchNumStages) {
11251132
auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
11261133
if (!linalgOp ||
11271134
(!linalg::isaContractionOpInterface(linalgOp) &&
@@ -1172,9 +1179,11 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
11721179
LoweringConfigAttr loweringConfig = configAndWgSize->first;
11731180

11741181
SmallVector<NamedAttribute, 1> pipelineAttrs;
1182+
// Default to 2 stages if not specified.
1183+
int64_t prefetchStages = prefetchNumStages.value_or(2);
11751184
auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
11761185
linalgOp->getContext(),
1177-
/*prefetchNumStages=*/useDirectLoad ? 0 : 2,
1186+
/*prefetchNumStages=*/prefetchStages,
11781187
/*no_reduce_shared_memory_bank_conflicts=*/useDirectLoad,
11791188
/*use_igemm_convolution=*/false,
11801189
/*reorder_workgroups_strategy=*/std::nullopt);
@@ -1781,10 +1790,9 @@ LogicalResult setScatterLoweringConfig(IREE::GPU::TargetAttr target,
17811790
{flatWorkgroupSize, 1, 1}, flatWorkgroupSize, DictionaryAttr());
17821791
}
17831792

1784-
LogicalResult
1785-
setDirectConvolutionLoweringConfig(IREE::GPU::TargetAttr target,
1786-
mlir::FunctionOpInterface entryPoint,
1787-
Operation *op) {
1793+
LogicalResult setDirectConvolutionLoweringConfig(
1794+
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
1795+
Operation *op, std::optional<uint64_t> prefetchNumStages) {
17881796
auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
17891797
if (!linalgOp || !linalg::isaConvolutionOpInterface(linalgOp)) {
17901798
return failure();
@@ -2029,9 +2037,10 @@ setDirectConvolutionLoweringConfig(IREE::GPU::TargetAttr target,
20292037
auto configDict = DictionaryAttr::get(context, attrs);
20302038
auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict);
20312039

2032-
// Prefetch shared memory is kept off.
2040+
// By default, prefetch shared memory is kept off.
2041+
int64_t prefetchStages = prefetchNumStages.value_or(0);
20332042
auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
2034-
context, /*prefetchNumStages=*/0,
2043+
context, /*prefetchNumStages=*/prefetchStages,
20352044
/*no_reduce_shared_memory_bank_conflicts=*/false,
20362045
/*use_igemm_convolution=*/false,
20372046
/*reorder_workgroups_strategy=*/std::nullopt);

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,30 +18,32 @@ namespace mlir::iree_compiler::IREE::GPU {
1818
/// specified target.
1919
LogicalResult setDataTiledMmaInnerTiledLoweringConfig(
2020
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
21-
Operation *op, IREE::Codegen::UKernelDescriptorAttr ukernelConfig);
21+
Operation *op, IREE::Codegen::UKernelDescriptorAttr ukernelConfig,
22+
std::optional<uint64_t> prefetchNumStages);
2223

2324
/// Helper for setting up a convolution config using direct convolution based on
2425
/// the specified target.
25-
LogicalResult
26-
setDirectConvolutionLoweringConfig(IREE::GPU::TargetAttr target,
27-
mlir::FunctionOpInterface entryPoint,
28-
Operation *op);
26+
LogicalResult setDirectConvolutionLoweringConfig(
27+
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
28+
Operation *op, std::optional<uint64_t> prefetchNumStages);
2929

3030
/// Helper for setting up a convolution config using IGEMM based on the
3131
/// specified target.
3232
/// TODO: Currently this only succeeds if the target supports an mma
3333
/// kind. Add support for a fallback direct lowering path.
3434
LogicalResult setIGEMMConvolutionLoweringConfig(
3535
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
36-
Operation *op, bool useDirectLoad = false, bool padConv = false);
36+
Operation *op, bool useDirectLoad, bool padConv,
37+
std::optional<uint64_t> prefetchNumStages);
3738

3839
/// Helper for setting up a matmul config based on the specified target.
3940
/// TODO: Currently this only succeeds if the target supports an mma
4041
/// kind. Add support for a fallback direct lowering path.
41-
LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
42-
mlir::FunctionOpInterface entryPoint,
43-
Operation *op,
44-
bool useDirectLoad = false);
42+
LogicalResult
43+
setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
44+
mlir::FunctionOpInterface entryPoint, Operation *op,
45+
bool useDirectLoad,
46+
std::optional<uint64_t> prefetchNumStages);
4547

4648
/// Helper for setting up a default tile and fuse config for targeting
4749
/// simple thread distribution. Currently restricted to linalg ops.

0 commit comments

Comments (0)