@@ -25,7 +25,7 @@ static llvm::cl::opt<bool> clEnableBlockedMatmuls(
     "iree-codegen-block-dynamic-dimensions-of-contractions",
     llvm::cl::desc("developer flag to gaurd blocking dynamic dimensions of "
                    "contraction-like ops"),
-    llvm::cl::Hidden, llvm::cl::init(true));
+    llvm::cl::Hidden, llvm::cl::init(false));
 
 namespace mlir::iree_compiler {
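This flips the developer flag's default to off, making blocking of dynamic contraction dimensions opt-in. Assuming the option behaves like any other llvm::cl flag, it can presumably still be enabled explicitly with --iree-codegen-block-dynamic-dimensions-of-contractions=true while debugging.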
@@ -103,57 +103,6 @@ func.func @block_attention_dims() {
 
 // -----
 
-func.func @basic_blocking_test(%arg0 : index, %lhs : tensor<?x2048xf32>, %rhs : tensor<2048x4096xf32>) -> tensor<?x4096xf32> {
-  %0 = util.assume.int %arg0<umin = 0, umax = 1024, udiv = 16> : index
-  %init = tensor.empty(%0) : tensor<?x4096xf32>
-  %matmul = linalg.matmul ins(%lhs, %rhs : tensor<?x2048xf32>, tensor<2048x4096xf32>)
-      outs(%init : tensor<?x4096xf32>) -> tensor<?x4096xf32>
-  return %matmul : tensor<?x4096xf32>
-}
-// CHECK-LABEL: func @basic_blocking_test(
-// CHECK-DAG:     %[[LHS:.+]] = tensor.expand_shape %{{.+}} {{\[}}[0, 1], [2]]
-// CHECK-DAG:     %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor<?x16x4096xf32>
-// CHECK:         %[[MATMUL:.+]] = linalg.generic
-// CHECK-SAME:        ins(%[[LHS]],
-// CHECK-SAME:        outs(%[[INIT]] :
-// CHECK:         %[[COLLAPSE:.+]] = tensor.collapse_shape %[[MATMUL]]
-// CHECK:         return %[[COLLAPSE]]
-
-// -----
-
-func.func @no_blocking(%arg0 : index, %lhs : tensor<?x2048xf32>, %rhs : tensor<2048x4096xf32>) -> tensor<?x4096xf32> {
-  %init = tensor.empty(%arg0) : tensor<?x4096xf32>
-  %matmul = linalg.matmul ins(%lhs, %rhs : tensor<?x2048xf32>, tensor<2048x4096xf32>)
-      outs(%init : tensor<?x4096xf32>) -> tensor<?x4096xf32>
-  return %matmul : tensor<?x4096xf32>
-}
-// CHECK-LABEL: func @no_blocking(
-// CHECK-DAG:     %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor<?x4096xf32>
-// CHECK:         %[[MATMUL:.+]] = linalg.matmul
-// CHECK-SAME:        ins(%{{.+}},
-// CHECK-SAME:        outs(%[[INIT]] :
-// CHECK:         return %[[MATMUL]]
-
-// -----
-
-func.func @no_unit_blocking(%arg0 : index, %lhs : tensor<?x2048xf32>, %rhs : tensor<2048x4096xf32>) -> tensor<?x4096xf32> {
-  %0 = util.assume.int %arg0<umin = 0, umax = 1024, udiv = 1> : index
-  %init = tensor.empty(%0) : tensor<?x4096xf32>
-  %matmul = linalg.matmul ins(%lhs, %rhs : tensor<?x2048xf32>, tensor<2048x4096xf32>)
-      outs(%init : tensor<?x4096xf32>) -> tensor<?x4096xf32>
-  return %matmul : tensor<?x4096xf32>
-}
-// CHECK-LABEL: func @no_unit_blocking(
-// CHECK-SAME:      %[[LHS:[a-zA-Z0-9]+]]: tensor<?x2048xf32>
-// CHECK-SAME:      %[[RHS:[a-zA-Z0-9]+]]: tensor<2048x4096xf32>
-// CHECK-DAG:     %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor<?x4096xf32>
-// CHECK:         %[[MATMUL:.+]] = linalg.matmul
-// CHECK-SAME:        ins(%[[LHS]], %[[RHS]]
-// CHECK-SAME:        outs(%[[INIT]] :
-// CHECK:         return %[[MATMUL]]
-
-// -----
-
 func.func @contract_op_interface_op(%rhs : tensor<2048x4096xf16>, %m : index, %lhs : tensor<?x4096xf16>)
     -> tensor<?x2048xf32> {
   %0 = util.assume.int %m<udiv = 16> : index
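The three deleted tests exercised the expand/collapse blocking path driven by util.assume.int facts: umin/umax bound the runtime value of the index, and udiv = 16 asserts it is divisible by 16, which is what let the pass reshape the ? dimension into ?x16 (and why udiv = 1 in @no_unit_blocking produced no blocking). With the flag now defaulting to off, that divisibility and range information is presumably consumed instead by the range analysis added to ConfigUtils.cpp below.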
@@ -7,6 +7,7 @@
 #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
 
 #include "iree/compiler/Codegen/Common/GPU/GPUHeuristics.h"
+#include "iree/compiler/Codegen/Common/TensorDynamicDimAnalysis.h"
 #include "iree/compiler/Codegen/Common/TileInferenceUtils.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h"
@@ -20,10 +21,16 @@
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "iree/compiler/Dialect/LinalgExt/Utils/MatchUtils.h"
 #include "iree/compiler/Dialect/LinalgExt/Utils/Utils.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DebugLog.h"
 #include "llvm/Support/InterleavedRange.h"
+#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
+#include "mlir/Analysis/DataFlowFramework.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/Attributes.h"
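The new mlir/Analysis/DataFlow includes are presumably what TensorDynamicDimAnalysis needs to stand up its DataFlowSolver: in MLIR's dataflow framework, dead-code and constant-propagation analyses have to be loaded alongside IntegerRangeAnalysis for range queries to converge.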
@@ -45,6 +52,14 @@
 constexpr int64_t kCacheLineSizeBits = 128 * 8;
 constexpr int64_t kPreferredCopyNumBits = 128;
 
+// Sentinel value used by IntegerRangeAnalysis when bounds are unknown.
+static constexpr uint64_t MAX_DIM_VALUE = (static_cast<uint64_t>(1) << 53) - 1;
+
+// Fallback bound when IntegerRangeAnalysis cannot determine the actual value.
+// Kept small (2^14) to avoid int64_t overflow when dimensions are multiplied
+// together in heuristic calculations.
+static constexpr uint64_t MAX_BOUND_VALUE = static_cast<uint64_t>(1) << 14;
+
 //===----------------------------------------------------------------------===//
 // Lowering Config Selection
 //===----------------------------------------------------------------------===//
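A note on the two constants: 2^53 - 1 mirrors the sentinel the range analysis reports for index values it cannot bound, while the fallback is clamped to 2^14 so that products of several dimension bounds stay inside int64_t. A minimal standalone C++ sketch of that arithmetic (using the GCC/Clang __builtin_mul_overflow intrinsic; not part of the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Analysis sentinel (2^53 - 1) vs. clamped fallback (2^14).
      constexpr int64_t kSentinel = (int64_t{1} << 53) - 1;
      constexpr int64_t kFallback = int64_t{1} << 14;

      // Two sentinel-sized dimensions already overflow a signed 64-bit
      // product: 2^53 * 2^53 = 2^106 >> 2^63.
      int64_t product;
      std::printf("sentinel * sentinel overflows: %d\n",
                  __builtin_mul_overflow(kSentinel, kSentinel, &product));

      // Even four fallback-sized dimensions stay in range: 2^56 < 2^63.
      int64_t p = kFallback;
      bool ok = true;
      for (int i = 0; i < 3; ++i)
        ok = ok && !__builtin_mul_overflow(p, kFallback, &p);
      std::printf("fallback^4 fits: %d (2^56 = %lld)\n", ok, (long long)p);
      return 0;
    }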
@@ -653,7 +668,8 @@
     ArrayRef<int64_t> bounds, ArrayRef<AffineMap> maps,
     ArrayRef<Value> operands, IREE::GPU::TargetAttr target, bool useDirectLoad,
     bool isGemm, bool scaled, int64_t splitReductionTripCnt,
-    bool cPromoteIfPadding, bool hasExistingAccumulator = false,
+    bool cPromoteIfPadding, bool boundsUsingAnalysis,
+    bool hasExistingAccumulator = false,
     std::optional<ConvToIgemmInfo> convToIgemmInfo = std::nullopt) {
   if (target.getWgp().getMma().empty()) {
     return failure();
@@ -969,7 +985,7 @@
           : ArrayRef<Attribute>{};
   GPU::appendPromotedOperandsList(context, attrs, promotionList,
                                   promotionTypes);
-  if (!mustBeAligned || couldNeedPadding) {
+  if (!mustBeAligned || couldNeedPadding || boundsUsingAnalysis) {
     SmallVector<int64_t> paddingTileSizes = workgroupTileSizes;
 
     // Initialize inner and outer padding sizes from reductionTileSizes.
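Forcing the padding path whenever boundsUsingAnalysis is set follows from how those bounds are obtained: an analysis-derived bound is only an upper bound on the runtime extent, so the actual dimension may fall short of the tile grid and the tiles must be padded just as in the unaligned case.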
@@ -1085,7 +1101,8 @@
           igemmLoopBounds, igemmContractionMaps, igemmOperands, target,
           useDirectLoad, /*isGemm=*/false,
           /*scaled=*/false, splitReductionTripCnt,
-          /*cPromoteIfPadding=*/cPromoteIfPadding, hasExistingAccumulator,
+          /*cPromoteIfPadding=*/cPromoteIfPadding,
+          /*boundsUsingAnalysis=*/false, hasExistingAccumulator,
           convToIgemmInfo);
   if (failed(configAndWgSize)) {
     return failure();
@@ -1112,7 +1129,69 @@
       workgroupSize, targetSubgroupSize, pipelineConfig);
 }
 
+static FailureOr<SmallVector<int64_t>>
+getLoopBoundsWithRangeAnalysis(linalg::LinalgOp linalgOp,
+                               mlir::FunctionOpInterface entryPoint) {
+  // Use TensorDynamicDimAnalysis for cleaner range queries.
+  TensorDynamicDimAnalysis dynamicDimAnalysis(entryPoint);
+  if (failed(dynamicDimAnalysis.run())) {
+    return linalgOp.getStaticLoopRanges();
+  }
+
+  SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
+  SmallVector<AffineMap> indexingMaps = linalgOp.getIndexingMapsArray();
+
+  for (auto [loopIdx, bound] : llvm::enumerate(bounds)) {
+    if (!ShapedType::isDynamic(bound)) {
+      continue;
+    }
+
+    bool boundRefined = false;
+
+    // Find operand and dimension that corresponds to this loop.
+    for (auto [operandIdx, operand] :
+         llvm::enumerate(linalgOp->getOperands())) {
+      auto shapedType = dyn_cast<ShapedType>(operand.getType());
+      if (!shapedType) {
+        continue;
+      }
+
+      AffineMap map = indexingMaps[operandIdx];
+      for (auto [dimIdx, expr] : llvm::enumerate(map.getResults())) {
+        auto dimExpr = dyn_cast<AffineDimExpr>(expr);
+        if (!dimExpr || dimExpr.getPosition() != loopIdx) {
+          continue;
+        }
+        if (!ShapedType::isDynamic(shapedType.getDimSize(dimIdx))) {
+          continue;
+        }
+
+        // Use TensorDynamicDimAnalysis to get range info directly.
+        if (auto range = dynamicDimAnalysis.getRangeInfo(operand, dimIdx)) {
+          int64_t ub = range->smax().getSExtValue();
+          if (ub > 0 && ub < MAX_DIM_VALUE) {
+            bounds[loopIdx] = ub;
+            boundRefined = true;
+            break;
+          }
+        }
+      }
+
+      if (boundRefined) {
+        break;
+      }
+    }
+
+    // If we couldn't refine the bound, set it to a large value.
+    if (!boundRefined && ShapedType::isDynamic(bounds[loopIdx])) {
+      bounds[loopIdx] = MAX_BOUND_VALUE;
+    }
+  }
+
+  return bounds;
+}
+
 LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
                                       mlir::FunctionOpInterface entryPoint,
                                       Operation *op, bool useDirectLoad) {
   auto linalgOp = dyn_cast<linalg::LinalgOp>(op);

[GitHub Actions clang-tidy warning at compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp:1194:15 (misc-use-internal-linkage): function 'setMatmulLoweringConfig' can be made static or moved into an anonymous namespace to enforce internal linkage.]
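As a concrete trace of the refinement walk in getLoopBoundsWithRangeAnalysis: for a linalg.matmul with indexing maps (d0, d2), (d2, d1), (d0, d1) and an LHS of type tensor<?x2048xf32>, only loop d0 is dynamic; the walk finds LHS dim 0 as the dynamic operand dimension indexed by d0 and asks the analysis for its upper bound. Below is a self-contained C++ sketch of the same walk, with indexing maps flattened to arrays of loop positions and the analysis stubbed out as a lookup table (all names here are hypothetical simplifications, not the IREE API):

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <utility>
    #include <vector>

    constexpr int64_t kDynamic = -1;  // stand-in for ShapedType::kDynamic
    constexpr int64_t kFallback = int64_t{1} << 14;

    // Each operand is a shape plus a per-dimension loop position (a
    // flattened AffineMap made only of plain AffineDimExprs).
    struct Operand {
      std::vector<int64_t> shape;
      std::vector<unsigned> loopForDim;
    };

    // Stubbed analysis: (operand, dim) -> known upper bound, if any.
    using RangeTable = std::map<std::pair<unsigned, unsigned>, int64_t>;

    std::vector<int64_t> refineBounds(std::vector<int64_t> bounds,
                                      const std::vector<Operand> &operands,
                                      const RangeTable &ranges) {
      for (unsigned loopIdx = 0; loopIdx < bounds.size(); ++loopIdx) {
        if (bounds[loopIdx] != kDynamic)
          continue;
        bool refined = false;
        for (unsigned opIdx = 0; opIdx < operands.size() && !refined;
             ++opIdx) {
          const Operand &op = operands[opIdx];
          for (unsigned d = 0; d < op.shape.size(); ++d) {
            if (op.loopForDim[d] != loopIdx || op.shape[d] != kDynamic)
              continue;
            auto it = ranges.find({opIdx, d});
            if (it != ranges.end() && it->second > 0) {
              bounds[loopIdx] = it->second;  // take the analysis upper bound
              refined = true;
              break;
            }
          }
        }
        if (!refined)
          bounds[loopIdx] = kFallback;  // clamp unknowable dims, as above
      }
      return bounds;
    }

    int main() {
      // matmul: loops (M, N, K) = (?, 4096, 2048); LHS ?x2048, RHS 2048x4096.
      std::vector<Operand> ops = {
          {{kDynamic, 2048}, {0, 2}},   // LHS indexed by (d0, d2)
          {{2048, 4096}, {2, 1}},       // RHS indexed by (d2, d1)
          {{kDynamic, 4096}, {0, 1}}};  // OUT indexed by (d0, d1)
      RangeTable ranges = {{{0, 0}, 1024}};  // analysis: LHS dim 0 <= 1024
      auto b = refineBounds({kDynamic, 4096, 2048}, ops, ranges);
      std::printf("M=%lld N=%lld K=%lld\n", (long long)b[0], (long long)b[1],
                  (long long)b[2]);  // M=1024 N=4096 K=2048
    }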
@@ -1122,7 +1201,15 @@
     return failure();
   }
 
+  // Use IntegerRangeAnalysis to get better bounds for dynamic shapes.
+  bool boundsUsingAnalysis = false;
+  FailureOr<SmallVector<int64_t>> maybeBounds =
+      getLoopBoundsWithRangeAnalysis(linalgOp, entryPoint);
   SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
+  if (succeeded(maybeBounds) && (maybeBounds != bounds)) {
+    boundsUsingAnalysis = true;
+    bounds = std::move(*maybeBounds);
+  }
   SmallVector<AffineMap> maps = linalgOp.getIndexingMapsArray();
   SmallVector<Value> operands(linalgOp->getOperands());

Review comment from @nirvedhmeshram (Contributor), Feb 12, 2026, on the line setting boundsUsingAnalysis:

    I think this bool will be set true even when the bounds you find are
    equal to the bounds of getStaticLoopRanges, but what we want is to only
    set this to true if you are making bounds that are different from the
    bounds of getStaticLoopRanges.

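The check as written does compare against the static ranges before setting the bool; whether `maybeBounds != bounds` behaves as intended hinges on FailureOr<T> inheriting std::optional<T>'s mixed-type comparisons, as it does in upstream MLIR. A plain-C++ sketch of that semantics, with std::optional standing in for FailureOr:

    #include <cstdio>
    #include <optional>
    #include <vector>

    int main() {
      // Static loop ranges with a dynamic M (ShapedType::kDynamic ~ -1).
      std::vector<long> staticBounds = {-1, 4096, 2048};

      // Case 1: analysis refined M to 1024 -> differs from static ranges.
      std::optional<std::vector<long>> maybe =
          std::vector<long>{1024, 4096, 2048};
      bool boundsUsingAnalysis = maybe.has_value() && (maybe != staticBounds);
      std::printf("refined: %d\n", boundsUsingAnalysis);  // refined: 1

      // Case 2: analysis returned exactly the static ranges -> the flag
      // stays false, the behavior the reviewer asks to guarantee.
      maybe = staticBounds;
      boundsUsingAnalysis = maybe.has_value() && (maybe != staticBounds);
      std::printf("refined: %d\n", boundsUsingAnalysis);  // refined: 0
    }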
@@ -1144,7 +1231,7 @@
       getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
           bounds, maps, operands, target, useDirectLoad, /*isGemm=*/true,
           /*scaled=*/false, splitReductionTripCnt, cPromoteIfPadding,
-          hasExistingAccumulator);
+          boundsUsingAnalysis, hasExistingAccumulator);
 
   // TODO (muzasyed) : add generalization for scaled and nonscaled versions of
   // matmul lowering.
@@ -1155,7 +1242,7 @@
     configAndWgSize = getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
         bounds, maps, operands, target, useDirectLoad, /*isGemm=*/true,
         /*scaled=*/true, splitReductionTripCnt, cPromoteIfPadding,
-        hasExistingAccumulator);
+        boundsUsingAnalysis, hasExistingAccumulator);
   }
 
   if (failed(configAndWgSize)) {
@@ -1451,3 +1451,70 @@ hal.executable public @multi_result_index_generic_with_scatter_fusion {
 // CHECK:   vector.transfer_write
 // CHECK:   vector.transfer_write
 // CHECK:   iree_linalg_ext.scatter
+
+// -----
+
+// Test dynamic matmul with util.assume.int providing bounds for range analysis.
+// The getLoopBoundsWithRangeAnalysis function uses IntegerRangeAnalysis to infer
+// the upper bound from util.assume.int and select appropriate tile sizes.
+
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [128, 128, 0],
+  reduction = [0, 0, 4],
+  subgroup = [4, 4],
+  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
+  promote_operands = [0, 1],
+  padding = [128, 128, 16]
+}>
+hal.executable public @main {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @matmul_dynamic_m_with_assume ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) ->
+        (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @matmul_dynamic_m_with_assume()
+          attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1]
+              subgroup_size = 64>} {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %dim = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+        %m = util.assume.int %dim<umin = 0, umax = 1024, udiv = 16> : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x2048xf32>>{%m}
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2048x4096xf32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%m}
+        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%m, 2048], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x2048xf32>>{%m} -> tensor<?x2048xf32>
+        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 4096], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2048x4096xf32>> -> tensor<2048x4096xf32>
+        %5 = tensor.empty(%m) : tensor<?x4096xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
+        %7 = linalg.matmul {lowering_config = #config}
+            ins(%3, %4 : tensor<?x2048xf32>, tensor<2048x4096xf32>)
+            outs(%6 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
+        iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [%m, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%m}
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @matmul_dynamic_m_with_assume
+// CHECK-DAG:     %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-DAG:     %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-DAG:     %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+// CHECK-DAG:     memref.alloc() : memref<16x130xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG:     memref.alloc() : memref<128x18xf32, #gpu.address_space<workgroup>>
+// CHECK:         scf.forall ({{.*}}) in (%{{.+}}, 32) {
+// CHECK:           scf.for {{.*}} = %c0 to %c512 step %c4 {{.*}} -> (vector<4x4x4x1xf32>)
+// CHECK:             gpu.barrier
+// CHECK:             vector.transfer_read
+// CHECK:             vector.transfer_write
+// CHECK:             gpu.barrier
+// CHECK-COUNT-64:    amdgpu.mfma 16x16x4
+// CHECK:           scf.yield
+// CHECK:         } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
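What the new test pins down: the util.assume.int on the M dimension (umin = 0, umax = 1024, udiv = 16) is exactly what the range analysis consumes, so M is treated as bounded by 1024 rather than unknown, and the config keeps the 128x128 workgroup tile with [128, 128, 16] padding. The CHECK lines then verify the downstream consequences through the LLVMGPUTileAndFuse pipeline: two workgroup allocations for the promoted LHS/RHS operands, the MFMA_F32_16x16x4_F32 main loop (plausibly the K loop, given K = 2048 and a reduction tile of 4), and the workgroup mapping of the outer forall.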