Skip to content

Commit 08efffa

Browse files
authored
[LLVMCPU] Tracks the dimension mapping for multi lowering config (#21649)
This adds `IterationDimTracker` to determine dimension mappings both within individual operations and across multiple operations. The tracker assigns a global dimension index to all loop dimensions encountered (where “local” refers to an individual operation and “global” refers to all target operations). By analyzing producer-consumer relationships of SSA values, dimensions that are considered equivalent are assigned the same global dimension index. It is currently used to improve lowering configuration propagation by identifying loop dimensions that are common across all target operations. --------- Signed-off-by: Yu-Zhewen <[email protected]>
1 parent 8b272a3 commit 08efffa

File tree

4 files changed

+270
-1
lines changed

4 files changed

+270
-1
lines changed

compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ iree_compiler_cc_library(
145145
"@llvm-project//mlir:FunctionInterfaces",
146146
"@llvm-project//mlir:IR",
147147
"@llvm-project//mlir:IndexToLLVM",
148+
"@llvm-project//mlir:IndexingMapOpInterface",
148149
"@llvm-project//mlir:LLVMCommonConversion",
149150
"@llvm-project//mlir:LLVMDialect",
150151
"@llvm-project//mlir:LinalgDialect",

compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ iree_cc_library(
114114
MLIRFunctionInterfaces
115115
MLIRIR
116116
MLIRIndexToLLVM
117+
MLIRIndexingMapOpInterface
117118
MLIRLLVMCommonConversion
118119
MLIRLLVMDialect
119120
MLIRLinalgDialect

compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp

Lines changed: 219 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@
1919
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
2020
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
2121
#include "iree/compiler/Dialect/LinalgExt/Utils/IndexingUtils.h"
22+
#include "llvm/ADT/EquivalenceClasses.h"
2223
#include "llvm/ADT/STLExtras.h"
2324
#include "llvm/ADT/SmallVectorExtras.h"
2425
#include "llvm/ADT/TypeSwitch.h"
2526
#include "llvm/Support/CommandLine.h"
2627
#include "llvm/Support/DebugLog.h"
2728
#include "llvm/Support/InterleavedRange.h"
2829
#include "llvm/Support/MathExtras.h"
30+
#include "mlir/Analysis/TopologicalSortUtils.h"
2931
#include "mlir/Dialect/Linalg/IR/Linalg.h"
3032
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
3133
#include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -39,6 +41,7 @@
3941
#include "mlir/IR/OpDefinition.h"
4042
#include "mlir/IR/TypeUtilities.h"
4143
#include "mlir/Interfaces/FunctionInterfaces.h"
44+
#include "mlir/Interfaces/IndexingMapOpInterface.h"
4245
#include "mlir/Interfaces/TilingInterface.h"
4346
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
4447

@@ -1067,6 +1070,219 @@ class LoweringConfigGenerator {
10671070
SmallVector<bool> vectorScalableFlags;
10681071
};
10691072

1073+
/// A helper class that tracks dimension mappings both within individual
1074+
/// operations and across multiple operations by analyzing the producer-consumer
1075+
/// relationships of SSA values. This tracking is established by assigning a
1076+
/// global dimension index to all loop dimensions encountered. Dimensions
1077+
/// sharing the same global index are considered equivalent.
1078+
class IterationDimTracker {
1079+
public:
1080+
explicit IterationDimTracker(ArrayRef<Operation *> operations)
1081+
: operations(operations.begin(), operations.end()) {
1082+
// Ensure operations are processed in topological order.
1083+
mlir::computeTopologicalSorting(this->operations);
1084+
buildDimMapping();
1085+
}
1086+
1087+
/// Returns true if the given dimension of `op` is common across all
1088+
/// operations.
1089+
bool isCommonDim(Operation *op, unsigned pos) {
1090+
assert(operationToGlobalDimMaps.contains(op));
1091+
int64_t dim = operationToGlobalDimMaps[op][pos];
1092+
for ([[maybe_unused]] auto &[_, dims] : operationToGlobalDimMaps) {
1093+
if (!llvm::is_contained(dims, dim)) {
1094+
return false;
1095+
}
1096+
}
1097+
return true;
1098+
}
1099+
1100+
private:
1101+
/// Builds and unifies dimension index mappings for all operations,
1102+
/// using producer–consumer SSA value relationships.
1103+
void buildDimMapping() {
1104+
// Tracks equivalent global dimension indices.
1105+
llvm::EquivalenceClasses<int64_t> indicesEquivalence;
1106+
// For each SSA value, maps its local dimension index to a global index.
1107+
// Value -> (local dim index -> global dim index)
1108+
llvm::SmallDenseMap<Value, SmallVector<int64_t>> valueToGlobalDimMaps;
1109+
1110+
for (Operation *op : operations) {
1111+
auto tilingOp = cast<TilingInterface>(op);
1112+
int64_t numLoops = tilingOp.getLoopIteratorTypes().size();
1113+
// Unconditionally assign new global indices, to be unified later.
1114+
for (int64_t i = 0; i < numLoops; ++i) {
1115+
int64_t globalIndex = totalLoopNum++;
1116+
indicesEquivalence.insert(globalIndex);
1117+
operationToGlobalDimMaps[op].push_back(globalIndex);
1118+
}
1119+
// The assigned global dimension indices are now unified based on
1120+
// producer–consumer SSA value relationships:
1121+
// - For operations implementing `IndexingMapOpInterface`, unify
1122+
// dimensions by iterating over their indexing maps.
1123+
// - For pack/unpack operations, use an identity mapping, since tiling
1124+
// applies to the outer (unpacked) dimensions.
1125+
// - For all other (unknown) operations, assume an identity mapping for
1126+
// any value whose rank matches the operation’s loop count.
1127+
TypeSwitch<Operation *>(op)
1128+
.Case<IndexingMapOpInterface>([&](auto op) {
1129+
propagateOnIndexingMapOp(op, indicesEquivalence,
1130+
valueToGlobalDimMaps);
1131+
})
1132+
.Case<linalg::PackOp, linalg::UnPackOp>([&](auto op) {
1133+
propagateOnPackUnpackOp(op, indicesEquivalence,
1134+
valueToGlobalDimMaps, numLoops);
1135+
})
1136+
.Default([&](auto op) {
1137+
propagateOnUnknownOp(op, indicesEquivalence, valueToGlobalDimMaps,
1138+
numLoops);
1139+
});
1140+
}
1141+
1142+
// Remap the global dimension indices in two steps:
1143+
// 1. Assign the same temporary index to all equivalent dimensions.
1144+
// 2. Convert these temporary indices to a compact, zero-based range.
1145+
auto applyReplaceMap = [&](llvm::SmallDenseMap<int64_t, int64_t> &map) {
1146+
for (auto &opEntry : operationToGlobalDimMaps) {
1147+
for (auto &dim : opEntry.second) {
1148+
dim = map.lookup(dim);
1149+
}
1150+
}
1151+
};
1152+
llvm::SmallDenseMap<int64_t, int64_t> replaceMap0, replaceMap1;
1153+
int64_t tempDimIndex = totalLoopNum;
1154+
totalLoopNum = 0;
1155+
for (auto it = indicesEquivalence.begin(); it != indicesEquivalence.end();
1156+
++it) {
1157+
if (!(*it)->isLeader()) {
1158+
continue;
1159+
}
1160+
for (auto mit = indicesEquivalence.member_begin(**it);
1161+
mit != indicesEquivalence.member_end(); ++mit) {
1162+
replaceMap0[*mit] = tempDimIndex;
1163+
}
1164+
replaceMap1[tempDimIndex] = totalLoopNum;
1165+
tempDimIndex++;
1166+
totalLoopNum++;
1167+
}
1168+
applyReplaceMap(replaceMap0);
1169+
applyReplaceMap(replaceMap1);
1170+
}
1171+
1172+
/// Ties loop dimensions together based on the operation’s indexing maps,
1173+
/// considering only simple result dimension expressions (`AffineDimExpr`).
1174+
///
1175+
/// Complex expressions (e.g., `affine_map<(d0, d1, d2, d3) -> (d0 * 2 + d2,
1176+
/// d1 * 3 + d3)>`) are ignored because they fall outside the "loop dimension"
1177+
/// concept. Such expressions describe how indices are computed within the
1178+
/// innermost loop body, but they do not directly identify which loop
1179+
/// dimensions correspond or should be tied.
1180+
void propagateOnIndexingMapOp(
1181+
IndexingMapOpInterface indexingMapOp,
1182+
llvm::EquivalenceClasses<int64_t> &indicesEquivalence,
1183+
llvm::SmallDenseMap<Value, SmallVector<int64_t>> &valueToGlobalDimMaps) {
1184+
Operation *op = indexingMapOp.getOperation();
1185+
for (OpOperand &operand : op->getOpOperands()) {
1186+
Value value = operand.get();
1187+
// Skip operands that have no known mapping from their producers.
1188+
if (!valueToGlobalDimMaps.contains(value)) {
1189+
continue;
1190+
}
1191+
AffineMap map = indexingMapOp.getMatchingIndexingMap(&operand);
1192+
for (auto [dim, expr] : llvm::enumerate(map.getResults())) {
1193+
// Stop if the current dimension exceeds the number of mapped ones.
1194+
if (dim >= valueToGlobalDimMaps[value].size()) {
1195+
break;
1196+
}
1197+
// Skip on complex expressions.
1198+
auto dimExpr = dyn_cast<AffineDimExpr>(expr);
1199+
if (!dimExpr) {
1200+
continue;
1201+
}
1202+
int64_t pos = dimExpr.getPosition();
1203+
// Unify the dimension index between the producer and the current op.
1204+
indicesEquivalence.unionSets(valueToGlobalDimMaps[value][dim],
1205+
operationToGlobalDimMaps[op][pos]);
1206+
}
1207+
}
1208+
// Propogate to results.
1209+
auto dsOp = cast<DestinationStyleOpInterface>(op);
1210+
for (OpResult result : op->getResults()) {
1211+
OpOperand *operand = dsOp.getTiedOpOperand(result);
1212+
AffineMap map = indexingMapOp.getMatchingIndexingMap(operand);
1213+
for (auto [dim, expr] : llvm::enumerate(map.getResults())) {
1214+
// Skip on complex expressions.
1215+
auto dimExpr = dyn_cast<AffineDimExpr>(expr);
1216+
if (!dimExpr) {
1217+
continue;
1218+
}
1219+
int64_t pos = dimExpr.getPosition();
1220+
valueToGlobalDimMaps[result].push_back(
1221+
operationToGlobalDimMaps[op][pos]);
1222+
}
1223+
}
1224+
}
1225+
1226+
/// Ties the dimensions of pack and unpack operations with their operands in
1227+
/// the outer (unpacked) dimensions.
1228+
void propagateOnPackUnpackOp(
1229+
Operation *op, llvm::EquivalenceClasses<int64_t> &indicesEquivalence,
1230+
llvm::SmallDenseMap<Value, SmallVector<int64_t>> &valueToGlobalDimMaps,
1231+
int64_t numLoops) {
1232+
for (OpOperand &operand : op->getOpOperands()) {
1233+
Value value = operand.get();
1234+
if (!valueToGlobalDimMaps.contains(value)) {
1235+
continue;
1236+
}
1237+
int64_t rank = cast<ShapedType>(value.getType()).getRank();
1238+
int64_t outDimSize = std::min(rank, numLoops);
1239+
for (int64_t i = 0; i < outDimSize; ++i) {
1240+
indicesEquivalence.unionSets(valueToGlobalDimMaps[value][i],
1241+
operationToGlobalDimMaps[op][i]);
1242+
}
1243+
}
1244+
// Propagate to results.
1245+
for (Value result : op->getResults()) {
1246+
valueToGlobalDimMaps[result] = operationToGlobalDimMaps[op];
1247+
}
1248+
}
1249+
1250+
/// Ties the dimensions of operations with their operands, if the operand rank
1251+
/// matches the operation’s loop count.
1252+
void propagateOnUnknownOp(
1253+
Operation *op, llvm::EquivalenceClasses<int64_t> &indicesEquivalence,
1254+
llvm::SmallDenseMap<Value, SmallVector<int64_t>> &valueToGlobalDimMaps,
1255+
int64_t numLoops) {
1256+
for (OpOperand &operand : op->getOpOperands()) {
1257+
Value value = operand.get();
1258+
if (!valueToGlobalDimMaps.contains(value) ||
1259+
numLoops != cast<ShapedType>(value.getType()).getRank()) {
1260+
continue;
1261+
}
1262+
for (int64_t i = 0; i < numLoops; ++i) {
1263+
indicesEquivalence.unionSets(valueToGlobalDimMaps[value][i],
1264+
operationToGlobalDimMaps[op][i]);
1265+
}
1266+
}
1267+
// Propagate to results.
1268+
for (Value result : op->getResults()) {
1269+
if (numLoops == cast<ShapedType>(result.getType()).getRank()) {
1270+
valueToGlobalDimMaps[result] = operationToGlobalDimMaps[op];
1271+
}
1272+
}
1273+
}
1274+
1275+
SmallVector<Operation *> operations;
1276+
// Tracks the total number of unique loop dimensions among the given set of
1277+
// operations.
1278+
int64_t totalLoopNum = 0;
1279+
// For each compute operation, maps its local loop dimension index to the
1280+
// global index. Operation -> (local dim index -> global dim
1281+
// index)
1282+
llvm::SmallDenseMap<Operation *, SmallVector<int64_t>>
1283+
operationToGlobalDimMaps;
1284+
};
1285+
10701286
/// Returns the same lowering_config attribute with the updated tile sizes and
10711287
/// scalable tile flags. The distribution tiling sizes is not set if it is
10721288
/// false.
@@ -3054,10 +3270,12 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
30543270
SmallVector<bool> commonVecScalableTileFlags = parallelVecScalableTileSizes;
30553271
SmallVector<int64_t> innerVecTileSizes(maxLoopNums, 0);
30563272
SmallVector<bool> innerVecScalableTileFlags(maxLoopNums, false);
3273+
IterationDimTracker dimTracker(computeOps);
30573274
for (auto op : computeOps) {
30583275
auto iterTypes = cast<TilingInterface>(op).getLoopIteratorTypes();
30593276
for (auto [idx, iterType] : llvm::enumerate(iterTypes)) {
3060-
if (iterType == utils::IteratorType::reduction) {
3277+
if (iterType == utils::IteratorType::reduction ||
3278+
!dimTracker.isCommonDim(op, idx)) {
30613279
innerVecTileSizes[idx] = parallelVecTileSizes[idx];
30623280
innerVecScalableTileFlags[idx] = parallelVecScalableTileSizes[idx];
30633281
commonVecTileSizes[idx] = 0;

compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2068,3 +2068,52 @@ func.func @complex_view_as_real() attributes {hal.executable.target = #executabl
20682068
// CHECK: func.func @complex_view_as_real()
20692069
// CHECK: linalg.generic
20702070
// CHECK-SAME: lowering_config = #[[CONFIG]]
2071+
2072+
// -----
2073+
2074+
// Tests lowering-config propagation with IterationDimTracker: the two
// linalg.generic ops share only d0 (the size-32 dimension), so only that
// dimension may receive a common parallel tile size; the second generic's
// remaining parallel dims fall back to vector_inner_parallel.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
func.func @decode_reduction_f32(%arg0: tensor<32x262144xf16>, %arg1: tensor<32xf32>, %arg2: tensor<32x16x16384xf16>, %arg3: tensor<32x16xf16>, %arg4: tensor<32x16xf16>) -> tensor<16384x32x16xf16> attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant 2.621440e+05 : f32
  %cst_1 = arith.constant 9.99999997E-7 : f32
  %0 = tensor.empty() : tensor<16384x32x16xf16>
  %1 = tensor.empty() : tensor<32xf32>
  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<32xf32>) -> tensor<32xf32>
  // Reduction over d1 (262144); produces the per-row statistic consumed below.
  %3 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0, %arg1 : tensor<32x262144xf16>, tensor<32xf32>) outs(%2 : tensor<32xf32>) {
  ^bb0(%in: f16, %in_2: f32, %out: f32):
    %5 = arith.extf %in : f16 to f32
    %6 = arith.subf %5, %in_2 : f32
    %7 = arith.mulf %6, %6 : f32
    %8 = arith.addf %7, %out : f32
    linalg.yield %8 : f32
  } -> tensor<32xf32>
  // All-parallel consumer; shares only d0 with the reduction above via %3.
  %4 = linalg.generic {indexing_maps = [#map2, #map3, #map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg2, %arg1, %3, %arg3, %arg4 : tensor<32x16x16384xf16>, tensor<32xf32>, tensor<32xf32>, tensor<32x16xf16>, tensor<32x16xf16>) outs(%0 : tensor<16384x32x16xf16>) {
  ^bb0(%in: f16, %in_2: f32, %in_3: f32, %in_4: f16, %in_5: f16, %out: f16):
    %5 = arith.divf %in_3, %cst_0 : f32
    %6 = arith.addf %5, %cst_1 : f32
    %7 = math.rsqrt %6 : f32
    %8 = arith.extf %in : f16 to f32
    %9 = arith.subf %8, %in_2 : f32
    %10 = arith.mulf %9, %7 : f32
    %11 = arith.extf %in_4 : f16 to f32
    %12 = arith.mulf %10, %11 : f32
    %13 = arith.extf %in_5 : f16 to f32
    %14 = arith.addf %12, %13 : f32
    %15 = arith.truncf %14 : f32 to f16
    linalg.yield %15 : f16
  } -> tensor<16384x32x16xf16>
  return %4 : tensor<16384x32x16xf16>
}
// CHECK-DAG: #[[CONFIG0:.+]] = #iree_cpu.lowering_config<distribution = [4, 0], vector_common_parallel = [4, 0], vector_reduction = [0, 8]>
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_cpu.lowering_config<vector_common_parallel = [4, 0, 0], vector_inner_parallel = [0, 1, 4]>
// CHECK: func.func @decode_reduction_f32
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG0]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG1]]

0 commit comments

Comments
 (0)