Commit 19f6438

Added function to determine bounds of dynamic dims in GEMMs using IntegerRangeAnalysis in TAF

Signed-off-by: Yash Deshpande <[email protected]>
1 parent 2ffd825 commit 19f6438

3 files changed (+210, −7 lines)

compiler/src/iree/compiler/Codegen/Common/BlockDynamicDimensions.cpp

Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ static llvm::cl::opt<bool> clEnableBlockedMatmuls(
     "iree-codegen-block-dynamic-dimensions-of-contractions",
     llvm::cl::desc("developer flag to gaurd blocking dynamic dimensions of "
                    "contraction-like ops"),
-    llvm::cl::Hidden, llvm::cl::init(true));
+    llvm::cl::Hidden, llvm::cl::init(false));

 namespace mlir::iree_compiler {

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 142 additions & 6 deletions

@@ -31,6 +31,13 @@
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Support/LogicalResult.h"
+#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
+#include "mlir/Analysis/DataFlowFramework.h"
+#include "iree/compiler/Codegen/Common/TensorDynamicDimAnalysis.h"
+#include "llvm/ADT/DenseSet.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"

 #define DEBUG_TYPE "iree-gpu-config-utils"

@@ -653,7 +660,7 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
     ArrayRef<int64_t> bounds, ArrayRef<AffineMap> maps,
     ArrayRef<Value> operands, IREE::GPU::TargetAttr target, bool useDirectLoad,
     bool isGemm, bool scaled, int64_t splitReductionTripCnt,
-    bool cPromoteIfPadding, bool hasExistingAccumulator = false,
+    bool cPromoteIfPadding, bool boundsUsingAnalysis, bool hasExistingAccumulator = false,
     std::optional<ConvToIgemmInfo> convToIgemmInfo = std::nullopt) {
   if (target.getWgp().getMma().empty()) {
     return failure();

@@ -969,7 +976,7 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
                            : ArrayRef<Attribute>{};
   GPU::appendPromotedOperandsList(context, attrs, promotionList,
                                   promotionTypes);
-  if (!mustBeAligned || couldNeedPadding) {
+  if (!mustBeAligned || couldNeedPadding || boundsUsingAnalysis) {
     SmallVector<int64_t> paddingTileSizes = workgroupTileSizes;

     // Initialize inner and outer padding sizes from reductionTileSizes.

@@ -1085,7 +1092,7 @@ LogicalResult setIGEMMConvolutionLoweringConfig(
           igemmLoopBounds, igemmContractionMaps, igemmOperands, target,
           useDirectLoad, /*isGemm=*/false,
           /*scaled=*/false, splitReductionTripCnt,
-          /*cPromoteIfPadding=*/cPromoteIfPadding, hasExistingAccumulator,
+          /*cPromoteIfPadding=*/cPromoteIfPadding, /*boundsUsingAnalysis=*/false, hasExistingAccumulator,
           convToIgemmInfo);
   if (failed(configAndWgSize)) {
     return failure();

@@ -1112,6 +1119,122 @@ LogicalResult setIGEMMConvolutionLoweringConfig(
       workgroupSize, targetSubgroupSize, pipelineConfig);
 }

+
+static FailureOr<SmallVector<int64_t>>
+getLoopBoundsWithRangeAnalysis(linalg::LinalgOp linalgOp,
+                               mlir::FunctionOpInterface entryPoint) {
+  // Initialize a DataFlowSolver for integer range analysis.
+  DataFlowSolver solver;
+  solver.load<dataflow::DeadCodeAnalysis>();
+  solver.load<dataflow::SparseConstantPropagation>();
+  solver.load<dataflow::IntegerRangeAnalysis>();
+
+  if (failed(solver.initializeAndRun(entryPoint))) {
+    return linalgOp.getStaticLoopRanges();
+  }
+
+  SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
+  SmallVector<AffineMap> indexingMaps = linalgOp.getIndexingMapsArray();
+
+  // Sentinel value (2^53 - 1) used by IntegerRangeAnalysis when bounds are unknown.
+  constexpr int64_t unboundedSentinel = 9007199254740991;
+
+  // Helper to recursively collect index values feeding an operation.
+  // Uses a visited set instead of a hardcoded depth limit.
+  std::function<void(Value, SmallVectorImpl<Value> &, DenseSet<Value> &)>
+      collectIndexValues = [&](Value value, SmallVectorImpl<Value> &indexValues,
+                               DenseSet<Value> &visited) -> void {
+        // Use the visited set to prevent infinite recursion.
+        if (!visited.insert(value).second)
+          return;
+
+        if (value.getType().isIndex()) {
+          indexValues.push_back(value);
+        }
+
+        Operation *defOp = value.getDefiningOp();
+        if (!defOp)
+          return;
+
+        // Recursively traverse all operands.
+        for (Value operand : defOp->getOperands()) {
+          if (operand.getType().isIndex()) {
+            indexValues.push_back(operand);
+          }
+          // Keep traversing shaped types to find their dimension operands.
+          if (isa<ShapedType>(operand.getType())) {
+            Operation *operandDef = operand.getDefiningOp();
+            if (operandDef) {
+              for (Value v : operandDef->getOperands()) {
+                if (v.getType().isIndex()) {
+                  collectIndexValues(v, indexValues, visited);
+                }
+              }
+            }
+          }
+        }
+      };
+
+  for (auto [loopIdx, bound] : llvm::enumerate(bounds)) {
+    if (!ShapedType::isDynamic(bound)) {
+      continue;
+    }
+
+    bool boundRefined = false;
+
+    // Find the operand and dimension that correspond to this loop.
+    for (auto [operandIdx, operand] :
+         llvm::enumerate(linalgOp->getOperands())) {
+      auto shapedType = dyn_cast<ShapedType>(operand.getType());
+      if (!shapedType)
+        continue;
+
+      AffineMap map = indexingMaps[operandIdx];
+      for (auto [dimIdx, expr] : llvm::enumerate(map.getResults())) {
+        auto dimExpr = dyn_cast<AffineDimExpr>(expr);
+        if (!dimExpr || dimExpr.getPosition() != loopIdx)
+          continue;
+        if (!ShapedType::isDynamic(shapedType.getDimSize(dimIdx)))
+          continue;
+
+        // Collect all index values related to this operand by walking its use-def chain.
+        SmallVector<Value> indexValues;
+        DenseSet<Value> visited;
+        collectIndexValues(operand, indexValues, visited);
+
+        // Try each index value with getDynamicUpperBound.
+        for (Value indexValue : indexValues) {
+          FailureOr<int64_t> ub = getDynamicUpperBound(indexValue, solver);
+          if (succeeded(ub) && *ub > 0) {
+            // Filter out the unbounded sentinel.
+            if (*ub >= unboundedSentinel) {
+              continue;
+            }
+
+            bounds[loopIdx] = *ub;
+            boundRefined = true;
+            break;
+          }
+        }
+
+        if (boundRefined)
+          break;
+      }
+
+      if (boundRefined) {
+        break;
+      }
+    }
+
+    // TODO: If the bound couldn't be refined, find a better fallback than a large power of two.
+    if (!boundRefined && ShapedType::isDynamic(bounds[loopIdx])) {
+      bounds[loopIdx] = 1 << 20;
+    }
+  }
+
+  return bounds;
+}
+
 LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
                                       mlir::FunctionOpInterface entryPoint,
                                       Operation *op, bool useDirectLoad) {
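
Note on the helper above: getDynamicUpperBound is a pre-existing IREE utility whose implementation is not part of this diff. As a rough illustration of what such a query involves, here is a minimal sketch that reads an unsigned upper bound straight from MLIR's IntegerValueRangeLattice; queryUpperBound is a hypothetical name for illustration, not the actual IREE helper.

// Sketch only: read an unsigned upper bound for `value` from a solver on
// which IntegerRangeAnalysis has already run. Hypothetical helper; the real
// getDynamicUpperBound may differ.
static FailureOr<int64_t> queryUpperBound(Value value, DataFlowSolver &solver) {
  auto *lattice = solver.lookupState<dataflow::IntegerValueRangeLattice>(value);
  if (!lattice || lattice->getValue().isUninitialized())
    return failure();
  const ConstantIntRanges &range = lattice->getValue().getValue();
  // Reject ranges whose unsigned max does not fit in int64_t.
  if (range.umax().getActiveBits() >= 64)
    return failure();
  return static_cast<int64_t>(range.umax().getZExtValue());
}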

@@ -1122,7 +1245,19 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
     return failure();
   }

-  SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
+  // Use IntegerRangeAnalysis to get better bounds for dynamic shapes.
+  bool boundsUsingAnalysis = false;
+  FailureOr<SmallVector<int64_t>> maybeBounds =
+      getLoopBoundsWithRangeAnalysis(linalgOp, entryPoint);
+  SmallVector<int64_t> bounds;
+  if (succeeded(maybeBounds)) {
+    boundsUsingAnalysis = true;
+    bounds = std::move(*maybeBounds);
+  } else {
+    // Fall back to static loop ranges if the analysis fails completely.
+    bounds = linalgOp.getStaticLoopRanges();
+    LDBG() << "Range analysis failed; falling back to static loop ranges";
+  }
   SmallVector<AffineMap> maps = linalgOp.getIndexingMapsArray();
   SmallVector<Value> operands(linalgOp->getOperands());

@@ -1143,7 +1279,7 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
   FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
       getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
           bounds, maps, operands, target, useDirectLoad, /*isGemm=*/true,
-          /*scaled=*/false, splitReductionTripCnt, cPromoteIfPadding,
+          /*scaled=*/false, splitReductionTripCnt, cPromoteIfPadding, boundsUsingAnalysis,
           hasExistingAccumulator);

   // TODO (muzasyed) : add generalization for scaled and nonscaled versions of

@@ -1154,7 +1290,7 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
     useDirectLoad = true;
     configAndWgSize = getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
         bounds, maps, operands, target, useDirectLoad, /*isGemm=*/true,
-        /*scaled=*/true, splitReductionTripCnt, cPromoteIfPadding,
+        /*scaled=*/true, splitReductionTripCnt, cPromoteIfPadding, boundsUsingAnalysis,
        hasExistingAccumulator);
   }

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 67 additions & 0 deletions

@@ -1451,3 +1451,70 @@ hal.executable public @multi_result_index_generic_with_scatter_fusion {
 // CHECK: vector.transfer_write
 // CHECK: vector.transfer_write
 // CHECK: iree_linalg_ext.scatter
+
+// -----
+
+// Test a dynamic matmul with util.assume.int providing bounds for range analysis.
+// The getLoopBoundsWithRangeAnalysis function uses IntegerRangeAnalysis to infer
+// the upper bound from util.assume.int and select appropriate tile sizes.
+
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [128, 128, 0],
+  reduction = [0, 0, 4],
+  subgroup = [4, 4],
+  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
+  promote_operands = [0, 1],
+  padding = [128, 128, 16]
+}>
+hal.executable public @main {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @matmul_dynamic_m_with_assume ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) ->
+        (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @matmul_dynamic_m_with_assume()
+          attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1]
+          subgroup_size = 64>} {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %dim = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+        %m = util.assume.int %dim<umin = 0, umax = 1024, udiv = 16> : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x2048xf32>>{%m}
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2048x4096xf32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%m}
+        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%m, 2048], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x2048xf32>>{%m} -> tensor<?x2048xf32>
+        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 4096], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2048x4096xf32>> -> tensor<2048x4096xf32>
+        %5 = tensor.empty(%m) : tensor<?x4096xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
+        %7 = linalg.matmul {lowering_config = #config}
+            ins(%3, %4 : tensor<?x2048xf32>, tensor<2048x4096xf32>)
+            outs(%6 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
+        iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [%m, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%m}
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @matmul_dynamic_m_with_assume
+// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+// CHECK-DAG: memref.alloc() : memref<16x130xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG: memref.alloc() : memref<128x18xf32, #gpu.address_space<workgroup>>
+// CHECK: scf.forall ({{.*}}) in (%{{.+}}, 32) {
+// CHECK: scf.for {{.*}} = %c0 to %c512 step %c4 {{.*}} -> (vector<4x4x4x1xf32>)
+// CHECK: gpu.barrier
+// CHECK: vector.transfer_read
+// CHECK: vector.transfer_write
+// CHECK: gpu.barrier
+// CHECK-COUNT-64: amdgpu.mfma 16x16x4
+// CHECK: scf.yield
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
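
The test hinges on util.assume.int carrying a [umin, umax] range that IntegerRangeAnalysis can propagate to the dynamic M dimension. Ops opt into the analysis by implementing MLIR's InferIntRangeInterface; below is a minimal sketch of that hook for a hypothetical MyAssumeIntOp with getUmin()/getUmax() accessors (IREE's actual util.assume.int implementation differs in its details).

// Sketch only: how an assume-like op can report its [umin, umax] range to
// IntegerRangeAnalysis. MyAssumeIntOp and its accessors are hypothetical.
void MyAssumeIntOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
                                      SetIntRangeFn setResultRange) {
  // Treat the index result as 64 bits wide for range propagation.
  APInt umin(/*numBits=*/64, getUmin());
  APInt umax(/*numBits=*/64, getUmax());
  setResultRange(getResult(), ConstantIntRanges::fromUnsigned(umin, umax));
}

With umax = 1024 here, the analysis can bound the dynamic M dimension, allowing the padded 128x128 workgroup tile from #config to be selected instead of a conservative fallback.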
