Skip to content

Commit 2c5c620

Browse files
authored
[CPU] Drop empty tile sizes from lowering config. (#21542)
If a tile size list is all zeros, it means no tiling at all, so it is redundant information in the IR. After switching to `IREE::CPU::LoweringConfigAttr`, we no longer need to fill in empty tile sizes to make `TilingConfig` happy. This revision deletes the `emitInnerParallelList` option from `LoweringConfigGenerator`, which now only emits a tiling level if at least one of its tile sizes is set (non-zero); the distribution level is the exception and is still always emitted, because a root op is expected to have it. The pipeline changes are needed because the `rootOp` no longer has the `vector_inner_parallel` level. That level is mainly for the consumer ops that have dimensions not captured by the `rootOp`. Given that it is the last level of tiling, we move it out of the for loop. --------- Signed-off-by: hanhanW <[email protected]>
1 parent 938b062 commit 2c5c620

15 files changed

+173
-155
lines changed

compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUAttrs.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,8 @@ bool LoweringConfigAttr::hasTilingLevel(unsigned level) const {
178178
}
179179

180180
bool LoweringConfigAttr::hasWorkgroupTilingLevel() const {
181-
return !getWorkgroupTileSizes().empty();
181+
return getConfig().contains(
182+
getTilingLevelName(TilingLevel::DistributionTiles));
182183
}
183184

184185
std::optional<unsigned> LoweringConfigAttr::getNumTilingLevels() const {

compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp

Lines changed: 47 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
2121
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
2222
#include "iree/compiler/Dialect/LinalgExt/Utils/IndexingUtils.h"
23+
#include "llvm/ADT/STLExtras.h"
2324
#include "llvm/ADT/SmallVectorExtras.h"
2425
#include "llvm/ADT/TypeSwitch.h"
2526
#include "llvm/Support/CommandLine.h"
@@ -110,6 +111,7 @@ static llvm::cl::opt<bool> clEnableRiscvAggressiveDist(
110111
llvm::cl::init(false));
111112

112113
using IREE::Codegen::DispatchLoweringPassPipeline;
114+
using IREE::CPU::TilingLevel;
113115

114116
// Encodes the pre-processing strategy to be applied on a Linalg operation
115117
// before vectorization.
@@ -960,14 +962,14 @@ static void setAlwaysVectorizeSizes(linalg::LinalgOp op,
960962
}
961963

962964
/// A helper class to record different level tiling sizes and generate
963-
/// corresponding IREE::CPU::LoweringConfigAttr. Only vector level supports
964-
/// scalable tile sizes for now.
965+
/// corresponding IREE::CPU::LoweringConfigAttr for the rootOp. It implies that
966+
/// the distribution tiling level is always set, even if tile sizes are all
967+
/// zeros, because a rootOp must have a distribution tiling level.
968+
/// Only vector level supports scalable tile sizes for now.
965969
class LoweringConfigGenerator {
966970
public:
967-
explicit LoweringConfigGenerator(Operation *op,
968-
bool emitInnerParallelList = false)
969-
: ctx(op->getContext()), rootOp(op),
970-
emitInnerParallelList(emitInnerParallelList) {}
971+
explicit LoweringConfigGenerator(Operation *op)
972+
: ctx(op->getContext()), rootOp(op) {}
971973

972974
void setDistributionTileSizes(ArrayRef<int64_t> tileSizes) {
973975
assert(distTileSizes.empty() && "expected to set only once");
@@ -990,7 +992,6 @@ class LoweringConfigGenerator {
990992
/// existing values. By default, it will always contain distribution tile
991993
/// sizes, unless the rootOp does not implement TilingInterface.
992994
IREE::CPU::LoweringConfigAttr generateCPULoweringConfig() {
993-
using TilingLevel = IREE::CPU::TilingLevel;
994995
SmallVector<NamedAttribute> items;
995996
if (!distTileSizes.empty()) {
996997
appendLoweringConfigLevelAttr(items, TilingLevel::DistributionTiles,
@@ -1024,21 +1025,23 @@ class LoweringConfigGenerator {
10241025
parallelTileSizes, parallelScalableFlags);
10251026
appendLoweringConfigLevelAttr(items, TilingLevel::VectorReductionTiles,
10261027
reductionTileSizes, reductionScalableFlags);
1027-
if (emitInnerParallelList) {
1028-
size_t size = parallelTileSizes.size();
1029-
appendLoweringConfigLevelAttr(
1030-
items, TilingLevel::VectorInnerParallelTiles,
1031-
SmallVector<int64_t>(size, 0), SmallVector<bool>(size, false));
1032-
}
10331028
}
10341029
return IREE::CPU::LoweringConfigAttr::get(ctx, items);
10351030
}
10361031

10371032
private:
1033+
/// Appends the `level` with (`tileSizes`, `scalableFlags`) tiling config to
1034+
/// `items`, if it is not a NOP config. E.g., if all the tile sizes are zeros,
1035+
/// it means no tiling at all. Only the distribution tiling level is
1036+
/// unconditionally added because a root op expects the level to be present.
10381037
void appendLoweringConfigLevelAttr(SmallVectorImpl<NamedAttribute> &items,
1039-
IREE::CPU::TilingLevel level,
1038+
TilingLevel level,
10401039
ArrayRef<int64_t> tileSizes,
10411040
ArrayRef<bool> scalableFlags = {}) {
1041+
if (level != TilingLevel::DistributionTiles &&
1042+
llvm::all_of(tileSizes, [](int64_t v) { return v == 0; })) {
1043+
return;
1044+
}
10421045
items.emplace_back(IREE::CPU::getTilingLevelName(level),
10431046
IREE::CPU::LoweringConfigAttr::getTilingLevelAttr(
10441047
ctx, tileSizes, scalableFlags));
@@ -1047,12 +1050,6 @@ class LoweringConfigGenerator {
10471050
MLIRContext *ctx;
10481051
Operation *rootOp;
10491052

1050-
// Generates the `IREE::CPU::TilingLevel::VectorInnerParallelTiles` tile sizes
1051-
// in the lowering config. Usually, they are zero values.
1052-
// TODO(hanchung): Remove the field once all the pipelines switch to CPU
1053-
// lowering_config. It is alive for legacy setup.
1054-
bool emitInnerParallelList = false;
1055-
10561053
// The tile sizes for distribution from the `rootOp`'s perspective.
10571054
SmallVector<int64_t> distTileSizes;
10581055

@@ -1092,8 +1089,14 @@ static IREE::Codegen::LoweringConfigAttrInterface getNewLoweringConfig(
10921089

10931090
SmallVector<NamedAttribute> newItems;
10941091
for (auto [level, tileSizes, scalableFlags] : tilingInfo) {
1095-
if (!setDistributionConfig &&
1096-
level == IREE::CPU::TilingLevel::DistributionTiles) {
1092+
if (!setDistributionConfig && level == TilingLevel::DistributionTiles) {
1093+
continue;
1094+
}
1095+
// Distribution tile sizes are a must for the rootOp, because that is the
1096+
// definition of a root op: an operation that has distribution tile sizes is
1097+
// the root op. Other levels can be dropped if all the tile sizes are zeros.
1098+
if (level != TilingLevel::DistributionTiles &&
1099+
llvm::all_of(tileSizes, [](int64_t val) { return val == 0; })) {
10971100
continue;
10981101
}
10991102
newItems.emplace_back(IREE::CPU::getTilingLevelName(level),
@@ -1155,7 +1158,7 @@ static LogicalResult setMatmulPeelingRootConfig(
11551158
inputVecScalableTileFlags.end());
11561159
vectorScalableFlags.back() = false;
11571160

1158-
LoweringConfigGenerator generator(op, /*emitInnerParallelList=*/true);
1161+
LoweringConfigGenerator generator(op);
11591162
generator.setDistributionTileSizes(distTileSizes);
11601163
generator.setCacheTileSizes(cacheTileSizes);
11611164
generator.setVectorTileSizes(vecTileSizes, vectorScalableFlags);
@@ -1206,7 +1209,7 @@ static LogicalResult setMatmulRootConfig(
12061209
}
12071210
limitVectorTileSizes(cast<linalg::LinalgOp>(op.getOperation()), vecTileSizes);
12081211

1209-
LoweringConfigGenerator generator(op, /*emitInnerParallelList=*/true);
1212+
LoweringConfigGenerator generator(op);
12101213
generator.setDistributionTileSizes(distTileSizes);
12111214
generator.setVectorTileSizes(vecTileSizes, vecScalableFlags);
12121215
IREE::CPU::LoweringConfigAttr loweringConfig =
@@ -2085,10 +2088,9 @@ setDefaultGenericOpRootConfig(mlir::FunctionOpInterface entryPointFn,
20852088
// If there are no loops, there is nothing to do.
20862089
unsigned numLoops = genericOp.getNumLoops();
20872090
if (numLoops == 0) {
2091+
LoweringConfigGenerator generator(genericOp);
20882092
return setOpConfigAndEntryPointFnTranslation(
2089-
entryPointFn, genericOp,
2090-
IREE::CPU::LoweringConfigAttr::get(genericOp.getContext(),
2091-
SmallVector<NamedAttribute>()),
2093+
entryPointFn, genericOp, generator.generateCPULoweringConfig(),
20922094
DispatchLoweringPassPipeline::CPUDefault);
20932095
}
20942096

@@ -2113,7 +2115,7 @@ setDefaultGenericOpRootConfig(mlir::FunctionOpInterface entryPointFn,
21132115
distConfig.maxTileSizes, vecPreProcStrategy, vecTileSizes);
21142116
limitVectorTileSizes(genericOp, vecTileSizes);
21152117

2116-
LoweringConfigGenerator generator(genericOp, /*emitInnerParallelList=*/true);
2118+
LoweringConfigGenerator generator(genericOp);
21172119
generator.setDistributionTileSizes(distTileSizes);
21182120
generator.setVectorTileSizes(vecTileSizes);
21192121
IREE::CPU::LoweringConfigAttr loweringConfig =
@@ -2267,7 +2269,7 @@ setTransposeLikeOpRootConfig(mlir::FunctionOpInterface entryPointFn,
22672269
SmallVector<int64_t> distTileSizes =
22682270
getDefaultDistributedLevelTileSizes(genericOp, distConfig);
22692271

2270-
LoweringConfigGenerator generator(genericOp, /*emitInnerParallelList=*/true);
2272+
LoweringConfigGenerator generator(genericOp);
22712273
generator.setDistributionTileSizes(distTileSizes);
22722274
generator.setVectorTileSizes(vecSizes, vecScalableDims);
22732275
IREE::CPU::LoweringConfigAttr loweringConfig =
@@ -2346,7 +2348,7 @@ static LogicalResult setElementwiseGenericOpRootConfig(
23462348
vecPreProcStrategy == VectorPreProcStrategy::Masking);
23472349
}
23482350

2349-
LoweringConfigGenerator generator(genericOp, /*emitInnerParallelList=*/true);
2351+
LoweringConfigGenerator generator(genericOp);
23502352
generator.setDistributionTileSizes(distTileSizes);
23512353
generator.setVectorTileSizes(vecTileSizes);
23522354
IREE::CPU::LoweringConfigAttr loweringConfig =
@@ -2603,7 +2605,7 @@ static LogicalResult setRootConfig(mlir::FunctionOpInterface entryPointFn,
26032605

26042606
SmallVector<int64_t> distTileSizes =
26052607
getDefaultDistributedLevelTileSizes(padOp, distConfig);
2606-
LoweringConfigGenerator generator(padOp, /*emitInnerParallelList=*/true);
2608+
LoweringConfigGenerator generator(padOp);
26072609
generator.setDistributionTileSizes(distTileSizes);
26082610
generator.setVectorTileSizes(distConfig.vectorSizeHints);
26092611
IREE::CPU::LoweringConfigAttr loweringConfig =
@@ -2879,7 +2881,7 @@ adjustTileSizesForGenericOp(mlir::FunctionOpInterface entryPointFn,
28792881
/// `level`, if it is present. Otherwise, adds a new item to the vector.
28802882
static void updateOrAddTilingLevelInfo(
28812883
SmallVectorImpl<IREE::CPU::LoweringConfigLevelInfo> &tilingInfo,
2882-
IREE::CPU::TilingLevel level, ArrayRef<int64_t> tileSizes,
2884+
TilingLevel level, ArrayRef<int64_t> tileSizes,
28832885
ArrayRef<bool> scalableFlags) {
28842886
for (IREE::CPU::LoweringConfigLevelInfo &info : tilingInfo) {
28852887
if (info.level == level) {
@@ -2946,16 +2948,18 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
29462948
}
29472949

29482950
auto rootLoweringConfig = getLoweringConfig(rootOperation);
2949-
std::unique_ptr<TilingConfig> tilingConfig =
2950-
TilingConfig::create(rootLoweringConfig);
29512951
SmallVector<int64_t> distTileSizes, parallelVecTileSizes;
29522952
SmallVector<bool> distScalableTileSizes, parallelVecScalableTileSizes;
2953-
if (tilingConfig->getNumTilingLevels() > 0) {
2954-
distTileSizes = tilingConfig->getDistributionTileSizes();
2955-
}
2956-
if (tilingConfig->getNumTilingLevels() > 1) {
2957-
std::tie(parallelVecTileSizes, parallelVecScalableTileSizes) =
2958-
tilingConfig->getVectorCommonParallelSizes();
2953+
assert(rootLoweringConfig.hasWorkgroupTilingLevel());
2954+
distTileSizes = rootLoweringConfig.getWorkgroupTileSizes();
2955+
if (rootLoweringConfig.hasTilingLevel(
2956+
TilingLevel::VectorCommonParallelTiles)) {
2957+
auto attr = cast<IREE::Codegen::LoweringConfigTilingLevelAttr>(
2958+
rootLoweringConfig.getTilingLevelAttr(
2959+
TilingLevel::VectorCommonParallelTiles));
2960+
parallelVecTileSizes.assign(attr.getSizes().begin(), attr.getSizes().end());
2961+
parallelVecScalableTileSizes.assign(attr.getScalableFlags().begin(),
2962+
attr.getScalableFlags().end());
29592963
}
29602964

29612965
size_t maxLoopNums = 0;
@@ -3080,6 +3084,9 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
30803084
}
30813085

30823086
// Set the lowering configs with new tile sizes.
3087+
// TODO(hanchung): Deprecate TilingConfig from the file.
3088+
std::unique_ptr<TilingConfig> tilingConfig =
3089+
TilingConfig::create(rootLoweringConfig);
30833090
for (auto op : computeOps) {
30843091
int numLoops = cast<TilingInterface>(op).getLoopIteratorTypes().size();
30853092
SmallVector<IREE::CPU::LoweringConfigLevelInfo> newTilingInfo;

compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,10 @@ void LLVMCPUTileAndFusePass::runOnOperation() {
269269
LDBG() << "can't find lowering_config, skip TileAndFuse";
270270
return;
271271
}
272+
if (!loweringConfig.hasTilingLevel(tilingLevel)) {
273+
LDBG() << "no tile sizes for the tiling level, skip TileAndFuse";
274+
return;
275+
}
272276

273277
SmallVector<int64_t> tileSizes;
274278
SmallVector<bool> tileScalableFlags;

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -434,18 +434,24 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
434434
static_cast<IREE::CPU::TilingLevel>(i), /*skipRootOp=*/true));
435435
break;
436436
case IREE::CPU::TilingLevel::VectorInnerParallelTiles:
437-
funcPassManager.addPass(createLLVMCPUTileAndFusePass(
438-
tilingConfig.getVectorInnerParallelLevel()));
439-
break;
440437
case IREE::CPU::TilingLevel::DistributionTiles:
441438
case IREE::CPU::TilingLevel::MaxNumTileLevels:
442439
case IREE::CPU::TilingLevel::InvalidLevel:
443-
break;
440+
continue;
444441
};
445442
funcPassManager.addPass(createFuseTensorPadWithConsumerPass());
446443
funcPassManager.addPass(createConcretizePadResultShapePass());
447444
}
448445

446+
// `VectorInnerParallelTiles` level models the tiling and fusion for the
447+
// dimensions that are not captured in root op. I.e., root op may not have the
448+
// config for the level. Thus, we run the LLVMCPUTileAndFuse pass for
449+
// consumers.
450+
funcPassManager.addPass(createLLVMCPUTileAndFusePass(
451+
IREE::CPU::TilingLevel::VectorInnerParallelTiles));
452+
funcPassManager.addPass(createFuseTensorPadWithConsumerPass());
453+
funcPassManager.addPass(createConcretizePadResultShapePass());
454+
449455
funcPassManager.addPass(createForallToForPass());
450456
if (pipelineOpt.enablePeeling) {
451457
funcPassManager.addPass(createLLVMCPUPeelPass());

compiler/src/iree/compiler/Codegen/LLVMCPU/test/2d_scalable_to_1d_scalable.mlir

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-2d-scalable-to-1d-scalable{assume-arm-sme=true},cse))" --split-input-file %s | FileCheck %s
22

3-
#compute_config = #iree_cpu.lowering_config<vector_common_parallel = [[4], [4]], vector_reduction = [0, 0], vector_inner_parallel = [0, 0]>
4-
#matmul_config = #iree_cpu.lowering_config<distribution = [0, 0, 0], vector_common_parallel = [[4], [4], 0], vector_reduction = [0, 0, 1], vector_inner_parallel = [0, 0, 0]>
3+
#compute_config = #iree_cpu.lowering_config<vector_common_parallel = [[4], [4]]>
4+
#matmul_config = #iree_cpu.lowering_config<distribution = [0, 0, 0], vector_common_parallel = [[4], [4], 0], vector_reduction = [0, 0, 1]>
55
#dim_0_map = affine_map<(d0)[s0] -> (-d0 + 32400, s0)>
66
#dim_1_map = affine_map<(d0)[s0] -> (-d0 + 16, s0)>
77

@@ -60,9 +60,9 @@ func.func @scalable_2d_matmul_and_generic(%arg0: tensor<32400x32xf32>, %arg1: te
6060
}
6161
return %2 : tensor<32400x16xf32>
6262
}
63-
// CHECK: #[[FILL_CONFIG:.*]] = #iree_cpu.lowering_config<vector_common_parallel = {{\[}}[4], [4]], vector_inner_parallel = [0, 0], vector_reduction = [0, 0]>
64-
// CHECK: #[[MATMUL_CONFIG:.*]] = #iree_cpu.lowering_config<distribution = [0, 0, 0], vector_common_parallel = {{\[}}[4], [4], 0], vector_inner_parallel = [0, 0, 0], vector_reduction = [0, 0, 1]>
65-
// CHECK: #[[GENERIC_CONFIG:.*]] = #iree_cpu.lowering_config<vector_common_parallel = [4, [4]], vector_inner_parallel = [0, 0], vector_reduction = [0, 0]>
63+
// CHECK: #[[FILL_CONFIG:.*]] = #iree_cpu.lowering_config<vector_common_parallel = {{\[}}[4], [4]]>
64+
// CHECK: #[[MATMUL_CONFIG:.*]] = #iree_cpu.lowering_config<distribution = [0, 0, 0], vector_common_parallel = {{\[}}[4], [4], 0], vector_reduction = [0, 0, 1]>
65+
// CHECK: #[[GENERIC_CONFIG:.*]] = #iree_cpu.lowering_config<vector_common_parallel = [4, [4]]>
6666
//
6767
// CHECK: func.func @scalable_2d_matmul_and_generi
6868
// CHECK: %[[C4:.*]] = arith.constant 4 : index

compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_arm_sme_streaming_mode_tests.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#hal.pipeline.binding<storage_buffer>,
77
#hal.pipeline.binding<storage_buffer>
88
]>
9-
#config = #iree_cpu.lowering_config<distribution = [0], vector_common_parallel = [1], vector_reduction = [0], vector_inner_parallel = [0]>
9+
#config = #iree_cpu.lowering_config<distribution = [0], vector_common_parallel = [1]>
1010
module {
1111
module {
1212
func.func @fixed_size_dispatch() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>,
@@ -43,7 +43,7 @@ module {
4343
#hal.pipeline.binding<storage_buffer>,
4444
#hal.pipeline.binding<storage_buffer>
4545
]>
46-
#config = #iree_cpu.lowering_config<distribution = [0], vector_common_parallel = [[1]], vector_reduction = [0], vector_inner_parallel = [0]>
46+
#config = #iree_cpu.lowering_config<distribution = [0], vector_common_parallel = [[1]]>
4747
module {
4848
module {
4949
func.func @scalable_dispatch() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>,
@@ -81,7 +81,7 @@ module {
8181
#hal.pipeline.binding<storage_buffer>,
8282
#hal.pipeline.binding<storage_buffer>
8383
]>
84-
#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [[4], [4]], vector_reduction = [0, 0], vector_inner_parallel = [0, 0]>
84+
#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [[4], [4]]>
8585
module {
8686
module {
8787
func.func @scalable_dispatch_using_za() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>,

0 commit comments

Comments
 (0)