Skip to content

Commit b082090

Browse files
authored
[CPU] Expose more options to CPUCodegenOptions. (#23586)
The revision moves `useFastMinMaxOps`, `skipIntermediateRoundings`, `useSoftmaxInterFusion`, and `instrumentMemoryAccesses` to `CPUCodegenOptions`, and hides the remaining developer CLI flags. There are three categories of use cases:

- **Users**: They do not need to understand individual flags. The expectation is that they use optimization levels and optional flags recommended by core developers.
- **Core developers**: Internal knobs for debugging or testing specific codegen behaviors. These are hidden from `--help` and may change or disappear at any time.
- **External developers**: Contributors pushing the boundary on specific targets (e.g., ARM SVE/SME). Their flags need discoverability but are not yet stable.

Moving flags related to scalable vectors was considered but ultimately not done because the plumbing would violate layering. For example, `clEnableScalableVectorization` is accessed globally through a free function. It cannot be passed to the encoding materialization pass because that pass should not carry target-specific codegen options. The root cause is that this information is not queryable from the IR. Promoting it to `CPUCodegenOptions` with an `experimental-` prefix was considered, but the layering violation makes it the wrong approach. Thus, it is left where it is.

Signed-off-by: hanhanW <hanhan0912@gmail.com>
1 parent a314ffe commit b082090

File tree

3 files changed

+56
-39
lines changed

3 files changed

+56
-39
lines changed

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp

Lines changed: 13 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -46,51 +46,24 @@ namespace mlir::iree_compiler {
4646
static llvm::cl::opt<bool> clFailOnLargeVector(
4747
"iree-llvmcpu-fail-on-large-vector",
4848
llvm::cl::desc("fail if there are operations with large vectors"),
49-
llvm::cl::init(true));
49+
llvm::cl::init(true), llvm::cl::Hidden);
5050

5151
static llvm::cl::opt<bool> clCheckLinalgVectorization(
5252
"iree-llvmcpu-check-linalg-vectorization",
5353
llvm::cl::desc(
5454
"Runs the pass to check if all the Linalg ops are vectorized"),
55-
llvm::cl::init(false));
56-
57-
static llvm::cl::opt<bool> clUseFastMinMaxOps(
58-
"iree-llvmcpu-use-fast-min-max-ops",
59-
llvm::cl::desc(
60-
"Use `arith.minf/maxf` instead of `arith.minimumf/maximumf` ops"),
61-
llvm::cl::init(false));
62-
63-
static llvm::cl::opt<bool> clSkipIntermediateRoundings(
64-
"iree-llvmcpu-skip-intermediate-roundings",
65-
llvm::cl::desc(
66-
"Allow skipping intermediate roundings. For example, in f16 matmul "
67-
"kernels on targets with only f32 arithmetic, we have to perform each "
68-
"multiply-accumulate in f32, and if this flag is false, then we have "
69-
"to round those f32 accumulators to the nearest f16 every time, which "
70-
"is slow."),
71-
llvm::cl::init(true));
72-
73-
static llvm::cl::opt<bool> clInstrumentMemoryAccesses{
74-
"iree-llvmcpu-instrument-memory-accesses",
75-
llvm::cl::desc("Instruments memory accesses in dispatches when dispatch "
76-
"instrumentation is enabled."),
77-
llvm::cl::init(false)};
78-
79-
static llvm::cl::opt<bool> clUseSoftmaxInterFusion(
80-
"iree-llvmcpu-use-decompose-softmax-fuse",
81-
llvm::cl::desc("Enables inter-pass fusion for the DecomposeSoftmax pass."),
82-
llvm::cl::init(true));
55+
llvm::cl::init(false), llvm::cl::Hidden);
8356

8457
static llvm::cl::opt<bool> clEnableVectorContractCustomKernels(
8558
"iree-llvmcpu-enable-vector-contract-custom-kernels",
8659
llvm::cl::desc("Enables vector contract custom kernels for "
8760
"LLVMCPUMmt4dVectorLowering pass."),
88-
llvm::cl::init(false));
61+
llvm::cl::init(false), llvm::cl::Hidden);
8962

9063
static llvm::cl::opt<bool> clTileDispatchUsingForall(
9164
"iree-llvmcpu-tile-dispatch-using-forall",
9265
llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"),
93-
llvm::cl::init(true));
66+
llvm::cl::init(true), llvm::cl::Hidden);
9467

9568
// By default, IREE does not enable the Armv9-A streaming SVE mode in the
9669
// presence of scalable vectors (even when using `+sme`), as currently there's
@@ -103,7 +76,7 @@ static llvm::cl::opt<bool> clForceArmStreaming(
10376
"Enables Armv9-A streaming SVE mode for any dispatch region that "
10477
"contains supported scalable vector operations (i.e., use SSVE rather "
10578
"than SVE). Requires the +sme feature flag."),
106-
llvm::cl::init(false));
79+
llvm::cl::init(false), llvm::cl::Hidden);
10780

10881
static llvm::cl::opt<bool> clPatchFuncOps(
10982
"iree-llvmcpu-debug-patch-func-ops",
@@ -375,8 +348,8 @@ void addMmt4dTilingExpertPassPipeline(
375348
// The below two passes are nop if the "mmt4d" is explicitly excluded in the
376349
// ukernels attribute.
377350
funcPassManager.addPass(createCPUPrepareUkernelsPass());
378-
funcPassManager.addPass(
379-
createCPULowerToUKernelsPass(clSkipIntermediateRoundings));
351+
funcPassManager.addPass(createCPULowerToUKernelsPass(
352+
pipelineOpt.cpuOpts.skipIntermediateRoundings));
380353
funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperandsPass(
381354
IREE::CPU::TilingLevel::VectorReductionTiles));
382355
// `VectorInnerParallelTiles` level models the tiling and fusion for the
@@ -427,8 +400,8 @@ void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
427400
// The below two passes are nop if pack/unpack is not specified in ukernels
428401
// attribute. By default, they are disabled.
429402
funcPassManager.addPass(createCPUPrepareUkernelsPass());
430-
funcPassManager.addPass(
431-
createCPULowerToUKernelsPass(clSkipIntermediateRoundings));
403+
funcPassManager.addPass(createCPULowerToUKernelsPass(
404+
pipelineOpt.cpuOpts.skipIntermediateRoundings));
432405

433406
funcPassManager.addPass(createLLVMCPUTilePass(
434407
IREE::CPU::TilingLevel::VectorCommonParallelTiles, /*skipRootOp=*/false));
@@ -557,7 +530,8 @@ static void addLowerToLLVMPasses(OpPassManager &modulePassManager,
557530
.addPass(createMathTransformPass)
558531
.addPass(createHoistStaticallyBoundAllocationsPass)
559532
// Use `arith.minf/maxf` instead of `arith.minimumf/maximumf`.
560-
.addPredicatedPass(clUseFastMinMaxOps, createReplaceSlowMinMaxOpsPass);
533+
.addPredicatedPass(cpuOpts.useFastMinMaxOps,
534+
createReplaceSlowMinMaxOpsPass);
561535

562536
if (enableAArch64SME) {
563537
modulePassManager.addPass(mlir::arm_sme::createVectorLegalizationPass());
@@ -632,7 +606,7 @@ static void addLowerToLLVMPasses(OpPassManager &modulePassManager,
632606
.addPass(createEmulateNarrowTypePass)
633607
.addPass(createCanonicalizerPass)
634608
.addPass(createCSEPass)
635-
.addPredicatedPass(clInstrumentMemoryAccesses,
609+
.addPredicatedPass(cpuOpts.instrumentMemoryAccesses,
636610
createInstrumentMemoryAccessesPass);
637611

638612
if (enableAArch64SME) {
@@ -659,7 +633,7 @@ void buildLLVMCPUCodegenConfigurationPassPipelineImpl(
659633
{
660634
FunctionLikeNest funcPassManager(modulePassManager);
661635
addCommonTargetExecutablePreprocessingPasses(funcPassManager,
662-
clUseSoftmaxInterFusion);
636+
cpuOpts.useSoftmaxInterFusion);
663637
}
664638
modulePassManager.addPass(createMaterializeTuningSpecsPass(
665639
MaterializeTuningSpecsPassOptions{cpuOpts.tuningSpecPath}));

compiler/src/iree/compiler/Codegen/Utils/CodegenOptions.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,36 @@ void CPUCodegenOptions::bindOptions(OptionsBinder &binder) {
6161
initAtOpt(llvm::OptimizationLevel::O2, true)},
6262
llvm::cl::desc("Enables reassociation for FP reductions."),
6363
llvm::cl::cat(category));
64+
65+
binder.opt<bool>(
66+
"iree-llvmcpu-use-fast-min-max-ops", useFastMinMaxOps,
67+
llvm::cl::desc(
68+
"Use `arith.minf/maxf` instead of `arith.minimumf/maximumf` ops."),
69+
llvm::cl::cat(category));
70+
71+
binder.opt<bool>(
72+
"iree-llvmcpu-skip-intermediate-roundings", skipIntermediateRoundings,
73+
llvm::cl::desc(
74+
"Allow skipping intermediate roundings. For example, in f16 matmul "
75+
"kernels on targets with only f32 arithmetic, we have to perform "
76+
"each multiply-accumulate in f32, and if this flag is false, then "
77+
"we have to round those f32 accumulators to the nearest f16 every "
78+
"time, which is slow."),
79+
llvm::cl::cat(category));
80+
81+
binder.opt<bool>(
82+
"iree-llvmcpu-use-decompose-softmax-fuse", useSoftmaxInterFusion,
83+
llvm::cl::desc(
84+
"Enables inter-pass fusion for the DecomposeSoftmax pass."),
85+
llvm::cl::cat(category));
86+
87+
binder.opt<bool>(
88+
"iree-llvmcpu-instrument-memory-accesses", instrumentMemoryAccesses,
89+
llvm::cl::desc(
90+
"Instruments memory reads and writes in dispatches for address "
91+
"tracking. Use with --iree-hal-instrument-dispatches=<buffer-size> "
92+
"and analyze results with iree-dump-instruments."),
93+
llvm::cl::cat(category));
6494
}
6595

6696
void GPUCodegenOptions::bindOptions(OptionsBinder &binder) {

compiler/src/iree/compiler/Codegen/Utils/CodegenOptions.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,19 @@ struct CPUCodegenOptions : CodegenOptions {
4040
// Enables reassociation for FP reductions.
4141
bool reassociateFpReductions = false;
4242

43+
// Use arith.minf/maxf instead of arith.minimumf/maximumf.
44+
bool useFastMinMaxOps = false;
45+
46+
// Allow skipping intermediate roundings (e.g., in f16 matmul on f32
47+
// hardware).
48+
bool skipIntermediateRoundings = true;
49+
50+
// Enables inter-pass fusion for the DecomposeSoftmax pass.
51+
bool useSoftmaxInterFusion = true;
52+
53+
// Instruments memory reads and writes in dispatches for address tracking.
54+
bool instrumentMemoryAccesses = false;
55+
4356
void bindOptions(OptionsBinder &binder);
4457
using FromFlags = OptionsFromFlags<CPUCodegenOptions>;
4558
};

0 commit comments

Comments (0)