@@ -25,7 +25,7 @@ static llvm::cl::opt<bool> clEnableBlockedMatmuls(
     "iree-codegen-block-dynamic-dimensions-of-contractions",
     llvm::cl::desc("developer flag to gaurd blocking dynamic dimensions of "
                    "contraction-like ops"),
-    llvm::cl::Hidden, llvm::cl::init(true));
+    llvm::cl::Hidden, llvm::cl::init(false));
 
 namespace mlir::iree_compiler {
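This flips the developer flag's default to off, making blocking of dynamic contraction dimensions opt-in. Assuming the option behaves like any other llvm::cl flag, it can presumably still be enabled explicitly with --iree-codegen-block-dynamic-dimensions-of-contractions=true while debugging.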
@@ -103,57 +103,6 @@ func.func @block_attention_dims() {
 
 // -----
 
-func.func @basic_blocking_test(%arg0 : index, %lhs : tensor<?x2048xf32>, %rhs : tensor<2048x4096xf32>) -> tensor<?x4096xf32> {
-  %0 = util.assume.int %arg0<umin = 0, umax = 1024, udiv = 16> : index
-  %init = tensor.empty(%0) : tensor<?x4096xf32>
-  %matmul = linalg.matmul ins(%lhs, %rhs : tensor<?x2048xf32>, tensor<2048x4096xf32>)
-      outs(%init : tensor<?x4096xf32>) -> tensor<?x4096xf32>
-  return %matmul : tensor<?x4096xf32>
-}
-// CHECK-LABEL: func @basic_blocking_test(
-// CHECK-DAG:     %[[LHS:.+]] = tensor.expand_shape %{{.+}} {{\[}}[0, 1], [2]]
-// CHECK-DAG:     %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor<?x16x4096xf32>
-// CHECK:         %[[MATMUL:.+]] = linalg.generic
-// CHECK-SAME:        ins(%[[LHS]],
-// CHECK-SAME:        outs(%[[INIT]] :
-// CHECK:         %[[COLLAPSE:.+]] = tensor.collapse_shape %[[MATMUL]]
-// CHECK:         return %[[COLLAPSE]]
-
-// -----
-
-func.func @no_blocking(%arg0 : index, %lhs : tensor<?x2048xf32>, %rhs : tensor<2048x4096xf32>) -> tensor<?x4096xf32> {
-  %init = tensor.empty(%arg0) : tensor<?x4096xf32>
-  %matmul = linalg.matmul ins(%lhs, %rhs : tensor<?x2048xf32>, tensor<2048x4096xf32>)
-      outs(%init : tensor<?x4096xf32>) -> tensor<?x4096xf32>
-  return %matmul : tensor<?x4096xf32>
-}
-// CHECK-LABEL: func @no_blocking(
-// CHECK-DAG:     %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor<?x4096xf32>
-// CHECK:         %[[MATMUL:.+]] = linalg.matmul
-// CHECK-SAME:        ins(%{{.+}},
-// CHECK-SAME:        outs(%[[INIT]] :
-// CHECK:         return %[[MATMUL]]
-
-// -----
-
-func.func @no_unit_blocking(%arg0 : index, %lhs : tensor<?x2048xf32>, %rhs : tensor<2048x4096xf32>) -> tensor<?x4096xf32> {
-  %0 = util.assume.int %arg0<umin = 0, umax = 1024, udiv = 1> : index
-  %init = tensor.empty(%0) : tensor<?x4096xf32>
-  %matmul = linalg.matmul ins(%lhs, %rhs : tensor<?x2048xf32>, tensor<2048x4096xf32>)
-      outs(%init : tensor<?x4096xf32>) -> tensor<?x4096xf32>
-  return %matmul : tensor<?x4096xf32>
-}
-// CHECK-LABEL: func @no_unit_blocking(
-// CHECK-SAME:      %[[LHS:[a-zA-Z0-9]+]]: tensor<?x2048xf32>
-// CHECK-SAME:      %[[RHS:[a-zA-Z0-9]+]]: tensor<2048x4096xf32>
-// CHECK-DAG:     %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor<?x4096xf32>
-// CHECK:         %[[MATMUL:.+]] = linalg.matmul
-// CHECK-SAME:        ins(%[[LHS]], %[[RHS]]
-// CHECK-SAME:        outs(%[[INIT]] :
-// CHECK:         return %[[MATMUL]]
-
-// -----
-
 func.func @contract_op_interface_op(%rhs : tensor<2048x4096xf16>, %m : index, %lhs : tensor<?x4096xf16>)
     -> tensor<?x2048xf32> {
   %0 = util.assume.int %m<udiv = 16> : index
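The three deleted tests exercised the expand/collapse blocking path driven by util.assume.int facts: umin/umax bound the runtime value of the index, and udiv = 16 asserts it is divisible by 16, which is what let the pass reshape the ? dimension into ?x16 (and why udiv = 1 in @no_unit_blocking produced no blocking). With the flag now defaulting to off, that divisibility and range information is presumably consumed instead by the range analysis added to ConfigUtils.cpp below.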
@@ -7,6 +7,7 @@
 #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
 
 #include "iree/compiler/Codegen/Common/GPU/GPUHeuristics.h"
+#include "iree/compiler/Codegen/Common/TensorDynamicDimAnalysis.h"
 #include "iree/compiler/Codegen/Common/TileInferenceUtils.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h"
@@ -20,10 +21,16 @@
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "iree/compiler/Dialect/LinalgExt/Utils/MatchUtils.h"
 #include "iree/compiler/Dialect/LinalgExt/Utils/Utils.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DebugLog.h"
 #include "llvm/Support/InterleavedRange.h"
+#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
+#include "mlir/Analysis/DataFlowFramework.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/Attributes.h"
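The new mlir/Analysis/DataFlow includes are presumably what TensorDynamicDimAnalysis needs to stand up its DataFlowSolver: in MLIR's dataflow framework, dead-code and constant-propagation analyses have to be loaded alongside IntegerRangeAnalysis for range queries to converge.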
@@ -45,6 +52,14 @@
 constexpr int64_t kCacheLineSizeBits = 128 * 8;
 constexpr int64_t kPreferredCopyNumBits = 128;
 
+// Sentinel value used by IntegerRangeAnalysis when bounds are unknown.
+static constexpr uint64_t MAX_DIM_VALUE = (static_cast<uint64_t>(1) << 53) - 1;
+
+// Fallback bound when IntegerRangeAnalysis cannot determine the actual value.
+// Kept small (2^14) to avoid int64_t overflow when dimensions are multiplied
+// together in heuristic calculations.
+static constexpr uint64_t MAX_BOUND_VALUE = static_cast<uint64_t>(1) << 14;
+
 //===----------------------------------------------------------------------===//
 // Lowering Config Selection
 //===----------------------------------------------------------------------===//
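A note on the two constants: 2^53 - 1 mirrors the sentinel the range analysis reports for index values it cannot bound, while the fallback is clamped to 2^14 so that products of several dimension bounds stay inside int64_t. A minimal standalone C++ sketch of that arithmetic (using the GCC/Clang __builtin_mul_overflow intrinsic; not part of the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Analysis sentinel (2^53 - 1) vs. clamped fallback (2^14).
      constexpr int64_t kSentinel = (int64_t{1} << 53) - 1;
      constexpr int64_t kFallback = int64_t{1} << 14;

      // Two sentinel-sized dimensions already overflow a signed 64-bit
      // product: 2^53 * 2^53 = 2^106 >> 2^63.
      int64_t product;
      std::printf("sentinel * sentinel overflows: %d\n",
                  __builtin_mul_overflow(kSentinel, kSentinel, &product));

      // Even four fallback-sized dimensions stay in range: 2^56 < 2^63.
      int64_t p = kFallback;
      bool ok = true;
      for (int i = 0; i < 3; ++i)
        ok = ok && !__builtin_mul_overflow(p, kFallback, &p);
      std::printf("fallback^4 fits: %d (2^56 = %lld)\n", ok, (long long)p);
      return 0;
    }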
@@ -653,7 +668,8 @@
     ArrayRef<int64_t> bounds, ArrayRef<AffineMap> maps,
     ArrayRef<Value> operands, IREE::GPU::TargetAttr target, bool useDirectLoad,
     bool isGemm, bool scaled, int64_t splitReductionTripCnt,
-    bool cPromoteIfPadding, bool hasExistingAccumulator = false,
+    bool cPromoteIfPadding, bool boundsUsingAnalysis,
+    bool hasExistingAccumulator = false,
     std::optional<ConvToIgemmInfo> convToIgemmInfo = std::nullopt) {
   if (target.getWgp().getMma().empty()) {
     return failure();
@@ -969,7 +985,7 @@
           : ArrayRef<Attribute>{};
   GPU::appendPromotedOperandsList(context, attrs, promotionList,
                                   promotionTypes);
-  if (!mustBeAligned || couldNeedPadding) {
+  if (!mustBeAligned || couldNeedPadding || boundsUsingAnalysis) {
     SmallVector<int64_t> paddingTileSizes = workgroupTileSizes;
 
     // Initialize inner and outer padding sizes from reductionTileSizes.
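Forcing the padding path whenever boundsUsingAnalysis is set follows from how those bounds are obtained: an analysis-derived bound is only an upper bound on the runtime extent, so the actual dimension may fall short of the tile grid and the tiles must be padded just as in the unaligned case.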
@@ -1085,7 +1101,8 @@
           igemmLoopBounds, igemmContractionMaps, igemmOperands, target,
           useDirectLoad, /*isGemm=*/false,
           /*scaled=*/false, splitReductionTripCnt,
-          /*cPromoteIfPadding=*/cPromoteIfPadding, hasExistingAccumulator,
+          /*cPromoteIfPadding=*/cPromoteIfPadding,
+          /*boundsUsingAnalysis=*/false, hasExistingAccumulator,
           convToIgemmInfo);
   if (failed(configAndWgSize)) {
     return failure();
@@ -1112,7 +1129,69 @@
       workgroupSize, targetSubgroupSize, pipelineConfig);
 }
 
+static FailureOr<SmallVector<int64_t>>
+getLoopBoundsWithRangeAnalysis(linalg::LinalgOp linalgOp,
+                               mlir::FunctionOpInterface entryPoint) {
+  // Use TensorDynamicDimAnalysis for cleaner range queries.
+  TensorDynamicDimAnalysis dynamicDimAnalysis(entryPoint);
+  if (failed(dynamicDimAnalysis.run())) {
+    return linalgOp.getStaticLoopRanges();
+  }
+
+  SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
+  SmallVector<AffineMap> indexingMaps = linalgOp.getIndexingMapsArray();
+
+  for (auto [loopIdx, bound] : llvm::enumerate(bounds)) {
+    if (!ShapedType::isDynamic(bound)) {
+      continue;
+    }
+
+    bool boundRefined = false;
+
+    // Find operand and dimension that corresponds to this loop.
+    for (auto [operandIdx, operand] :
+         llvm::enumerate(linalgOp->getOperands())) {
+      auto shapedType = dyn_cast<ShapedType>(operand.getType());
+      if (!shapedType) {
+        continue;
+      }
+
+      AffineMap map = indexingMaps[operandIdx];
+      for (auto [dimIdx, expr] : llvm::enumerate(map.getResults())) {
+        auto dimExpr = dyn_cast<AffineDimExpr>(expr);
+        if (!dimExpr || dimExpr.getPosition() != loopIdx) {
+          continue;
+        }
+        if (!ShapedType::isDynamic(shapedType.getDimSize(dimIdx))) {
+          continue;
+        }
+
+        // Use TensorDynamicDimAnalysis to get range info directly.
+        if (auto range = dynamicDimAnalysis.getRangeInfo(operand, dimIdx)) {
+          int64_t ub = range->smax().getSExtValue();
+          if (ub > 0 && ub < MAX_DIM_VALUE) {
+            bounds[loopIdx] = ub;
+            boundRefined = true;
+            break;
+          }
+        }
+      }
+
+      if (boundRefined) {
+        break;
+      }
+    }
+
+    // If we couldn't refine the bound, set it to a large value.
+    if (!boundRefined && ShapedType::isDynamic(bounds[loopIdx])) {
+      bounds[loopIdx] = MAX_BOUND_VALUE;
+    }
+  }
+
+  return bounds;
+}
+
 LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
                                       mlir::FunctionOpInterface entryPoint,
                                       Operation *op, bool useDirectLoad) {
   auto linalgOp = dyn_cast<linalg::LinalgOp>(op);

[GitHub Actions clang-tidy warning at compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp:1194:15 (misc-use-internal-linkage): function 'setMatmulLoweringConfig' can be made static or moved into an anonymous namespace to enforce internal linkage.]
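As a concrete trace of the refinement walk in getLoopBoundsWithRangeAnalysis: for a linalg.matmul with indexing maps (d0, d2), (d2, d1), (d0, d1) and an LHS of type tensor<?x2048xf32>, only loop d0 is dynamic; the walk finds LHS dim 0 as the dynamic operand dimension indexed by d0 and asks the analysis for its upper bound. Below is a self-contained C++ sketch of the same walk, with indexing maps flattened to arrays of loop positions and the analysis stubbed out as a lookup table (all names here are hypothetical simplifications, not the IREE API):

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <utility>
    #include <vector>

    constexpr int64_t kDynamic = -1;  // stand-in for ShapedType::kDynamic
    constexpr int64_t kFallback = int64_t{1} << 14;

    // Each operand is a shape plus a per-dimension loop position (a
    // flattened AffineMap made only of plain AffineDimExprs).
    struct Operand {
      std::vector<int64_t> shape;
      std::vector<unsigned> loopForDim;
    };

    // Stubbed analysis: (operand, dim) -> known upper bound, if any.
    using RangeTable = std::map<std::pair<unsigned, unsigned>, int64_t>;

    std::vector<int64_t> refineBounds(std::vector<int64_t> bounds,
                                      const std::vector<Operand> &operands,
                                      const RangeTable &ranges) {
      for (unsigned loopIdx = 0; loopIdx < bounds.size(); ++loopIdx) {
        if (bounds[loopIdx] != kDynamic)
          continue;
        bool refined = false;
        for (unsigned opIdx = 0; opIdx < operands.size() && !refined;
             ++opIdx) {
          const Operand &op = operands[opIdx];
          for (unsigned d = 0; d < op.shape.size(); ++d) {
            if (op.loopForDim[d] != loopIdx || op.shape[d] != kDynamic)
              continue;
            auto it = ranges.find({opIdx, d});
            if (it != ranges.end() && it->second > 0) {
              bounds[loopIdx] = it->second;  // take the analysis upper bound
              refined = true;
              break;
            }
          }
        }
        if (!refined)
          bounds[loopIdx] = kFallback;  // clamp unknowable dims, as above
      }
      return bounds;
    }

    int main() {
      // matmul: loops (M, N, K) = (?, 4096, 2048); LHS ?x2048, RHS 2048x4096.
      std::vector<Operand> ops = {
          {{kDynamic, 2048}, {0, 2}},   // LHS indexed by (d0, d2)
          {{2048, 4096}, {2, 1}},       // RHS indexed by (d2, d1)
          {{kDynamic, 4096}, {0, 1}}};  // OUT indexed by (d0, d1)
      RangeTable ranges = {{{0, 0}, 1024}};  // analysis: LHS dim 0 <= 1024
      auto b = refineBounds({kDynamic, 4096, 2048}, ops, ranges);
      std::printf("M=%lld N=%lld K=%lld\n", (long long)b[0], (long long)b[1],
                  (long long)b[2]);  // M=1024 N=4096 K=2048
    }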
@@ -1122,7 +1201,15 @@
     return failure();
   }
 
+  // Use IntegerRangeAnalysis to get better bounds for dynamic shapes.
+  bool boundsUsingAnalysis = false;
+  FailureOr<SmallVector<int64_t>> maybeBounds =
+      getLoopBoundsWithRangeAnalysis(linalgOp, entryPoint);
   SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
+  if (succeeded(maybeBounds) && (maybeBounds != bounds)) {
+    boundsUsingAnalysis = true;
+    bounds = std::move(*maybeBounds);
+  }
   SmallVector<AffineMap> maps = linalgOp.getIndexingMapsArray();
   SmallVector<Value> operands(linalgOp->getOperands());

Review comment from @nirvedhmeshram (Contributor), Feb 12, 2026, on the line setting boundsUsingAnalysis:

    I think this bool will be set true even when the bounds you find are
    equal to the bounds of getStaticLoopRanges, but what we want is to only
    set this to true if you are making bounds that are different from the
    bounds of getStaticLoopRanges.

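The check as written does compare against the static ranges before setting the bool; whether `maybeBounds != bounds` behaves as intended hinges on FailureOr<T> inheriting std::optional<T>'s mixed-type comparisons, as it does in upstream MLIR. A plain-C++ sketch of that semantics, with std::optional standing in for FailureOr:

    #include <cstdio>
    #include <optional>
    #include <vector>

    int main() {
      // Static loop ranges with a dynamic M (ShapedType::kDynamic ~ -1).
      std::vector<long> staticBounds = {-1, 4096, 2048};

      // Case 1: analysis refined M to 1024 -> differs from static ranges.
      std::optional<std::vector<long>> maybe =
          std::vector<long>{1024, 4096, 2048};
      bool boundsUsingAnalysis = maybe.has_value() && (maybe != staticBounds);
      std::printf("refined: %d\n", boundsUsingAnalysis);  // refined: 1

      // Case 2: analysis returned exactly the static ranges -> the flag
      // stays false, the behavior the reviewer asks to guarantee.
      maybe = staticBounds;
      boundsUsingAnalysis = maybe.has_value() && (maybe != staticBounds);
      std::printf("refined: %d\n", boundsUsingAnalysis);  // refined: 0
    }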
@@ -1144,7 +1231,7 @@
       getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
           bounds, maps, operands, target, useDirectLoad, /*isGemm=*/true,
           /*scaled=*/false, splitReductionTripCnt, cPromoteIfPadding,
-          hasExistingAccumulator);
+          boundsUsingAnalysis, hasExistingAccumulator);
 
   // TODO (muzasyed) : add generalization for scaled and nonscaled versions of
   // matmul lowering.
@@ -1155,7 +1242,7 @@
     configAndWgSize = getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
         bounds, maps, operands, target, useDirectLoad, /*isGemm=*/true,
         /*scaled=*/true, splitReductionTripCnt, cPromoteIfPadding,
-        hasExistingAccumulator);
+        boundsUsingAnalysis, hasExistingAccumulator);
   }
 
   if (failed(configAndWgSize)) {
@@ -1451,3 +1451,70 @@ hal.executable public @multi_result_index_generic_with_scatter_fusion {
 // CHECK:   vector.transfer_write
 // CHECK:   vector.transfer_write
 // CHECK:   iree_linalg_ext.scatter
+
+// -----
+
+// Test dynamic matmul with util.assume.int providing bounds for range analysis.
+// The getLoopBoundsWithRangeAnalysis function uses IntegerRangeAnalysis to infer
+// the upper bound from util.assume.int and select appropriate tile sizes.
+
+#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [128, 128, 0],
+  reduction = [0, 0, 4],
+  subgroup = [4, 4],
+  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
+  promote_operands = [0, 1],
+  padding = [128, 128, 16]
+}>
+hal.executable public @main {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @matmul_dynamic_m_with_assume ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) ->
+        (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @matmul_dynamic_m_with_assume()
+          attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1]
+              subgroup_size = 64>} {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %dim = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+        %m = util.assume.int %dim<umin = 0, umax = 1024, udiv = 16> : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x2048xf32>>{%m}
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2048x4096xf32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%m}
+        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%m, 2048], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x2048xf32>>{%m} -> tensor<?x2048xf32>
+        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 4096], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2048x4096xf32>> -> tensor<2048x4096xf32>
+        %5 = tensor.empty(%m) : tensor<?x4096xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
+        %7 = linalg.matmul {lowering_config = #config}
+            ins(%3, %4 : tensor<?x2048xf32>, tensor<2048x4096xf32>)
+            outs(%6 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
+        iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [%m, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%m}
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @matmul_dynamic_m_with_assume
+// CHECK-DAG:     %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-DAG:     %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-DAG:     %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+// CHECK-DAG:     memref.alloc() : memref<16x130xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG:     memref.alloc() : memref<128x18xf32, #gpu.address_space<workgroup>>
+// CHECK:         scf.forall ({{.*}}) in (%{{.+}}, 32) {
+// CHECK:           scf.for {{.*}} = %c0 to %c512 step %c4 {{.*}} -> (vector<4x4x4x1xf32>)
+// CHECK:             gpu.barrier
+// CHECK:             vector.transfer_read
+// CHECK:             vector.transfer_write
+// CHECK:             gpu.barrier
+// CHECK-COUNT-64:    amdgpu.mfma 16x16x4
+// CHECK:           scf.yield
+// CHECK:         } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
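What the new test pins down: the util.assume.int on the M dimension (umin = 0, umax = 1024, udiv = 16) is exactly what the range analysis consumes, so M is treated as bounded by 1024 rather than unknown, and the config keeps the 128x128 workgroup tile with [128, 128, 16] padding. The CHECK lines then verify the downstream consequences through the LLVMGPUTileAndFuse pipeline: two workgroup allocations for the promoted LHS/RHS operands, the MFMA_F32_16x16x4_F32 main loop (plausibly the K loop, given K = 2048 and a reduction tile of 4), and the workgroup mapping of the outer forall.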