Skip to content

Commit b082090

Browse files
authored
[CPU] Expose more options to CPUCodegenOptions. (#23586)
The revision moves `useFastMinMaxOps`, `skipIntermediateRoundings`, `useSoftmaxInterFusion`, and `instrumentMemoryAccesses` to `CPUCodegenOptions`, and hides the remaining developer CLI flags. There are three categories of use cases:

- **Users**: They do not need to understand individual flags. The expectation is that they use optimization levels and optional flags recommended by core developers.
- **Core developers**: Internal knobs for debugging or testing specific codegen behaviors. These are hidden from `--help` and may change or disappear at any time.
- **External developers**: Contributors pushing the boundary on specific targets (e.g., ARM SVE/SME). Their flags need discoverability but are not yet stable.

Moving flags related to scalable vectors was considered but ultimately not done because the plumbing would violate layering. For example, `clEnableScalableVectorization` is accessed globally through a free function. It cannot be passed to the encoding materialization pass because that pass should not carry target-specific codegen options. The root cause is that this information is not queryable from the IR. Promoting it to `CPUCodegenOptions` with an `experimental-` prefix was considered, but the layering violation makes it the wrong approach. Thus, it is left where it is.

Signed-off-by: hanhanW <hanhan0912@gmail.com>
1 parent a314ffe commit b082090

File tree

3 files changed

+56
-39
lines changed

3 files changed

+56
-39
lines changed

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp

Lines changed: 13 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -46,51 +46,24 @@ namespace mlir::iree_compiler {
4646
static llvm::cl::opt<bool> clFailOnLargeVector(
4747
"iree-llvmcpu-fail-on-large-vector",
4848
llvm::cl::desc("fail if there are operations with large vectors"),
49-
llvm::cl::init(true));
49+
llvm::cl::init(true), llvm::cl::Hidden);
5050

5151
static llvm::cl::opt<bool> clCheckLinalgVectorization(
5252
"iree-llvmcpu-check-linalg-vectorization",
5353
llvm::cl::desc(
5454
"Runs the pass to check if all the Linalg ops are vectorized"),
55-
llvm::cl::init(false));
56-
57-
static llvm::cl::opt<bool> clUseFastMinMaxOps(
58-
"iree-llvmcpu-use-fast-min-max-ops",
59-
llvm::cl::desc(
60-
"Use `arith.minf/maxf` instead of `arith.minimumf/maximumf` ops"),
61-
llvm::cl::init(false));
62-
63-
static llvm::cl::opt<bool> clSkipIntermediateRoundings(
64-
"iree-llvmcpu-skip-intermediate-roundings",
65-
llvm::cl::desc(
66-
"Allow skipping intermediate roundings. For example, in f16 matmul "
67-
"kernels on targets with only f32 arithmetic, we have to perform each "
68-
"multiply-accumulate in f32, and if this flag is false, then we have "
69-
"to round those f32 accumulators to the nearest f16 every time, which "
70-
"is slow."),
71-
llvm::cl::init(true));
72-
73-
static llvm::cl::opt<bool> clInstrumentMemoryAccesses{
74-
"iree-llvmcpu-instrument-memory-accesses",
75-
llvm::cl::desc("Instruments memory accesses in dispatches when dispatch "
76-
"instrumentation is enabled."),
77-
llvm::cl::init(false)};
78-
79-
static llvm::cl::opt<bool> clUseSoftmaxInterFusion(
80-
"iree-llvmcpu-use-decompose-softmax-fuse",
81-
llvm::cl::desc("Enables inter-pass fusion for the DecomposeSoftmax pass."),
82-
llvm::cl::init(true));
55+
llvm::cl::init(false), llvm::cl::Hidden);
8356

8457
static llvm::cl::opt<bool> clEnableVectorContractCustomKernels(
8558
"iree-llvmcpu-enable-vector-contract-custom-kernels",
8659
llvm::cl::desc("Enables vector contract custom kernels for "
8760
"LLVMCPUMmt4dVectorLowering pass."),
88-
llvm::cl::init(false));
61+
llvm::cl::init(false), llvm::cl::Hidden);
8962

9063
static llvm::cl::opt<bool> clTileDispatchUsingForall(
9164
"iree-llvmcpu-tile-dispatch-using-forall",
9265
llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"),
93-
llvm::cl::init(true));
66+
llvm::cl::init(true), llvm::cl::Hidden);
9467

9568
// By default, IREE does not enable the Armv9-A streaming SVE mode in the
9669
// presence of scalable vectors (even when using `+sme`), as currently there's
@@ -103,7 +76,7 @@ static llvm::cl::opt<bool> clForceArmStreaming(
10376
"Enables Armv9-A streaming SVE mode for any dispatch region that "
10477
"contains supported scalable vector operations (i.e., use SSVE rather "
10578
"than SVE). Requires the +sme feature flag."),
106-
llvm::cl::init(false));
79+
llvm::cl::init(false), llvm::cl::Hidden);
10780

10881
static llvm::cl::opt<bool> clPatchFuncOps(
10982
"iree-llvmcpu-debug-patch-func-ops",
@@ -375,8 +348,8 @@ void addMmt4dTilingExpertPassPipeline(
375348
// The below two passes are nop if the "mmt4d" is explicitly excluded in the
376349
// ukernels attribute.
377350
funcPassManager.addPass(createCPUPrepareUkernelsPass());
378-
funcPassManager.addPass(
379-
createCPULowerToUKernelsPass(clSkipIntermediateRoundings));
351+
funcPassManager.addPass(createCPULowerToUKernelsPass(
352+
pipelineOpt.cpuOpts.skipIntermediateRoundings));
380353
funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperandsPass(
381354
IREE::CPU::TilingLevel::VectorReductionTiles));
382355
// `VectorInnerParallelTiles` level models the tiling and fusion for the
@@ -427,8 +400,8 @@ void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
427400
// The below two passes are nop if pack/unpack is not specified in ukernels
428401
// attribute. By default, they are disabled.
429402
funcPassManager.addPass(createCPUPrepareUkernelsPass());
430-
funcPassManager.addPass(
431-
createCPULowerToUKernelsPass(clSkipIntermediateRoundings));
403+
funcPassManager.addPass(createCPULowerToUKernelsPass(
404+
pipelineOpt.cpuOpts.skipIntermediateRoundings));
432405

433406
funcPassManager.addPass(createLLVMCPUTilePass(
434407
IREE::CPU::TilingLevel::VectorCommonParallelTiles, /*skipRootOp=*/false));
@@ -557,7 +530,8 @@ static void addLowerToLLVMPasses(OpPassManager &modulePassManager,
557530
.addPass(createMathTransformPass)
558531
.addPass(createHoistStaticallyBoundAllocationsPass)
559532
// Use `arith.minf/maxf` instead of `arith.minimumf/maximumf`.
560-
.addPredicatedPass(clUseFastMinMaxOps, createReplaceSlowMinMaxOpsPass);
533+
.addPredicatedPass(cpuOpts.useFastMinMaxOps,
534+
createReplaceSlowMinMaxOpsPass);
561535

562536
if (enableAArch64SME) {
563537
modulePassManager.addPass(mlir::arm_sme::createVectorLegalizationPass());
@@ -632,7 +606,7 @@ static void addLowerToLLVMPasses(OpPassManager &modulePassManager,
632606
.addPass(createEmulateNarrowTypePass)
633607
.addPass(createCanonicalizerPass)
634608
.addPass(createCSEPass)
635-
.addPredicatedPass(clInstrumentMemoryAccesses,
609+
.addPredicatedPass(cpuOpts.instrumentMemoryAccesses,
636610
createInstrumentMemoryAccessesPass);
637611

638612
if (enableAArch64SME) {
@@ -659,7 +633,7 @@ void buildLLVMCPUCodegenConfigurationPassPipelineImpl(
659633
{
660634
FunctionLikeNest funcPassManager(modulePassManager);
661635
addCommonTargetExecutablePreprocessingPasses(funcPassManager,
662-
clUseSoftmaxInterFusion);
636+
cpuOpts.useSoftmaxInterFusion);
663637
}
664638
modulePassManager.addPass(createMaterializeTuningSpecsPass(
665639
MaterializeTuningSpecsPassOptions{cpuOpts.tuningSpecPath}));

compiler/src/iree/compiler/Codegen/Utils/CodegenOptions.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,36 @@ void CPUCodegenOptions::bindOptions(OptionsBinder &binder) {
6161
initAtOpt(llvm::OptimizationLevel::O2, true)},
6262
llvm::cl::desc("Enables reassociation for FP reductions."),
6363
llvm::cl::cat(category));
64+
65+
binder.opt<bool>(
66+
"iree-llvmcpu-use-fast-min-max-ops", useFastMinMaxOps,
67+
llvm::cl::desc(
68+
"Use `arith.minf/maxf` instead of `arith.minimumf/maximumf` ops."),
69+
llvm::cl::cat(category));
70+
71+
binder.opt<bool>(
72+
"iree-llvmcpu-skip-intermediate-roundings", skipIntermediateRoundings,
73+
llvm::cl::desc(
74+
"Allow skipping intermediate roundings. For example, in f16 matmul "
75+
"kernels on targets with only f32 arithmetic, we have to perform "
76+
"each multiply-accumulate in f32, and if this flag is false, then "
77+
"we have to round those f32 accumulators to the nearest f16 every "
78+
"time, which is slow."),
79+
llvm::cl::cat(category));
80+
81+
binder.opt<bool>(
82+
"iree-llvmcpu-use-decompose-softmax-fuse", useSoftmaxInterFusion,
83+
llvm::cl::desc(
84+
"Enables inter-pass fusion for the DecomposeSoftmax pass."),
85+
llvm::cl::cat(category));
86+
87+
binder.opt<bool>(
88+
"iree-llvmcpu-instrument-memory-accesses", instrumentMemoryAccesses,
89+
llvm::cl::desc(
90+
"Instruments memory reads and writes in dispatches for address "
91+
"tracking. Use with --iree-hal-instrument-dispatches=<buffer-size> "
92+
"and analyze results with iree-dump-instruments."),
93+
llvm::cl::cat(category));
6494
}
6595

6696
void GPUCodegenOptions::bindOptions(OptionsBinder &binder) {

compiler/src/iree/compiler/Codegen/Utils/CodegenOptions.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,19 @@ struct CPUCodegenOptions : CodegenOptions {
4040
// Enables reassociation for FP reductions.
4141
bool reassociateFpReductions = false;
4242

43+
// Use arith.minf/maxf instead of arith.minimumf/maximumf.
44+
bool useFastMinMaxOps = false;
45+
46+
// Allow skipping intermediate roundings (e.g., in f16 matmul on f32
47+
// hardware).
48+
bool skipIntermediateRoundings = true;
49+
50+
// Enables inter-pass fusion for the DecomposeSoftmax pass.
51+
bool useSoftmaxInterFusion = true;
52+
53+
// Instruments memory reads and writes in dispatches for address tracking.
54+
bool instrumentMemoryAccesses = false;
55+
4356
void bindOptions(OptionsBinder &binder);
4457
using FromFlags = OptionsFromFlags<CPUCodegenOptions>;
4558
};

0 commit comments

Comments (0)