@@ -2117,3 +2117,33 @@ func.func @decode_reduction_f32(%arg0: tensor<32x262144xf16>, %arg1: tensor<32xf
 // CHECK-SAME: lowering_config = #[[CONFIG0]]
 // CHECK: linalg.generic
 // CHECK-SAME: lowering_config = #[[CONFIG1]]
+
+// -----
+
+#executable_target_embedded_elf_x86_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
+func.func @attention_reshape_pack(%arg0: index, %arg1: tensor<4x2x?x32xf16>, %arg2: tensor<?x4x32xf16>, %arg3: tensor<?x4x32xf16>, %arg4: tensor<4x2x?x?xf16>) -> tensor<?x256x1x1xf16> attributes {hal.executable.target = #executable_target_embedded_elf_x86_64} {
+  %cst = arith.constant 0.000000e+00 : f16
+  %cst_0 = arith.constant 1.767580e-01 : f16
+  %0 = tensor.empty(%arg0) : tensor<?x4x2x32xf16>
+  %1 = tensor.empty(%arg0) : tensor<4x2x?x32xf16>
+  %2 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d5, d0, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d5, d0, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]} ins(%arg1, %arg2, %arg3, %cst_0, %arg4 : tensor<4x2x?x32xf16>, tensor<?x4x32xf16>, tensor<?x4x32xf16>, f16, tensor<4x2x?x?xf16>) outs(%1 : tensor<4x2x?x32xf16>) {
+  ^bb0(%arg5: f32):
+    iree_linalg_ext.yield %arg5 : f32
+  } -> tensor<4x2x?x32xf16>
+  %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<4x2x?x32xf16>) outs(%0 : tensor<?x4x2x32xf16>) {
+  ^bb0(%in: f16, %out: f16):
+    linalg.yield %in : f16
+  } -> tensor<?x4x2x32xf16>
+  %collapsed = tensor.collapse_shape %3 [[0], [1, 2, 3]] : tensor<?x4x2x32xf16> into tensor<?x256xf16>
+  %4 = tensor.empty(%arg0) : tensor<?x256x1x1xf16>
+  %pack = linalg.pack %collapsed padding_value(%cst : f16) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %4 : tensor<?x256xf16> -> tensor<?x256x1x1xf16>
+  return %pack : tensor<?x256x1x1xf16>
+}
+// CHECK-DAG: #[[CONFIG0:.+]] = #iree_cpu.lowering_config<distribution = [1, 1, 64, 16, 0, 0], vector_common_parallel = [1, 1, 4, 16, 0, 0], vector_reduction = [0, 0, 0, 0, 0, 32]>
+// CHECK-DAG: #[[CONFIG1:.+]] = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 4, 16]>
+// CHECK-NOT: #iree_cpu.lowering_config
+// CHECK: func.func @attention_reshape_pack
+// CHECK: iree_linalg_ext.attention
+// CHECK-SAME: lowering_config = #[[CONFIG0]]
+// CHECK: linalg.generic
+// CHECK-SAME: lowering_config = #[[CONFIG1]]