@@ -95,7 +95,7 @@ static llvm::cl::opt<bool> clEnableVectorContractCustomKernels(
9595static llvm::cl::opt<bool > clTileDispatchUsingForall (
9696 " iree-llvmcpu-tile-dispatch-using-forall" ,
9797 llvm::cl::desc (" Enable tile and distribute to workgroups using scf.forall" ),
98- llvm::cl::init(false ));
98+ llvm::cl::init(true ));
9999
100100// By default, IREE does not enable the Armv9-A streaming SVE mode in the
101101// presence of scalable vectors (even when using `+sme`), as currently there's
@@ -111,9 +111,8 @@ static llvm::cl::opt<bool> clForceArmStreaming(
111111 llvm::cl::init(false ));
112112
113- // TODO: Enable `TileDispatchUsingForall` for every pipeline.
113+ // Distributes with scf.forall unless `clTileDispatchUsingForall` is false.
114- static void addTileAndDistributePasses (OpPassManager &funcPassManager,
115- bool enableTileDispatchUsingForall) {
116- if (enableTileDispatchUsingForall || clTileDispatchUsingForall) {
114+ static void addTileAndDistributePasses (OpPassManager &funcPassManager) {
115+ if (clTileDispatchUsingForall) {
117116 funcPassManager.addPass (
118117 createTileAndDistributeToWorkgroupsUsingForallOpPass ());
119118 } else {
@@ -346,8 +345,7 @@ void buildLLVMCPUVectorLoweringPipeline(
346345void addCPUBufferOpsTileAndVectorizePipeline (
347346 OpPassManager &funcPassManager, TilingConfig &tilingConfig,
348347 LLVMCPUPipelineOptions &pipelineOpt) {
349- addTileAndDistributePasses (funcPassManager,
350- /* enableTileDispatchUsingForall=*/ true );
348+ addTileAndDistributePasses (funcPassManager);
351349
352350 // Skip tiling reduction loops because this is expected to apply on copy ops
353351 // only.
@@ -384,8 +382,7 @@ void addCPUBufferOpsTileAndVectorizePipeline(
384382void addMultiTilingExpertPassPipeline (OpPassManager &funcPassManager,
385383 TilingConfig &tilingConfig,
386384 LLVMCPUPipelineOptions &pipelineOpt) {
387- addTileAndDistributePasses (funcPassManager,
388- /* enableTileDispatchUsingForall=*/ true );
385+ addTileAndDistributePasses (funcPassManager);
389386
390387 SmallVector<int64_t > allFusableLevels (tilingConfig.getFusableLevels ());
391388 // Apply tile and fuse to all the non-distribution fusable levels. Skip
@@ -464,8 +461,7 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
464461void addConvTileAndDecomposeExpertPassPipeline (
465462 OpPassManager &funcPassManager, TilingConfig &tilingConfig,
466463 LLVMCPUPipelineOptions &pipelineOpt) {
467- addTileAndDistributePasses (funcPassManager,
468- /* enableTileDispatchUsingForall=*/ true );
464+ addTileAndDistributePasses (funcPassManager);
469465
470466 // Run LLVMTileAndFuse firstly in case that we have fill + conv + generic
471467 // ops. At this stage, we do not apply vectorization. The reduction dim won't
@@ -528,8 +524,7 @@ void addConvTileAndDecomposeExpertPassPipeline(
528524void addMmt4dTilingExpertPassPipeline (OpPassManager &funcPassManager,
529525 TilingConfig &tilingConfig,
530526 LLVMCPUPipelineOptions &pipelineOpt) {
531- addTileAndDistributePasses (funcPassManager,
532- /* enableTileDispatchUsingForall=*/ true );
527+ addTileAndDistributePasses (funcPassManager);
533528
534529 funcPassManager.addPass (createLLVMCPUTileAndFusePass (
535530 static_cast <int64_t >(tilingConfig.getVectorCommonParallelLevel ())));
@@ -577,8 +572,7 @@ void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
577572void addCPUDataTilingPipeline (OpPassManager &funcPassManager,
578573 TilingConfig &tilingConfig,
579574 LLVMCPUPipelineOptions &pipelineOpt) {
580- addTileAndDistributePasses (funcPassManager,
581- /* enableTileDispatchUsingForall=*/ true );
575+ addTileAndDistributePasses (funcPassManager);
582576
583577 // The below two passes are nop if pack/unpack is not specified in ukernels
584578 // attribute. By default, they are disabled.
@@ -621,8 +615,7 @@ void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
621615void addCPULinalgExtTileAndVectorizePipeline (
622616 OpPassManager &funcPassManager, TilingConfig &tilingConfig,
623617 LLVMCPUPipelineOptions &pipelineOpt) {
624- addTileAndDistributePasses (funcPassManager,
625- /* enableTileDispatchUsingForall=*/ false );
618+ addTileAndDistributePasses (funcPassManager);
626619 funcPassManager.addPass (
627620 createLLVMCPUTilePass (tilingConfig.getVectorCommonParallelLevel ()));
628621 // TODO: Remove the pass once we have PartialReductionOpInterface implemented
@@ -661,8 +654,7 @@ void addCPULinalgExtTileAndVectorizePipeline(
661654}
662655
663656void addCPUDefaultPassPipeline (OpPassManager &funcPassManager) {
664- addTileAndDistributePasses (funcPassManager,
665- /* enableTileDispatchUsingForall=*/ false );
657+ addTileAndDistributePasses (funcPassManager);
666658 addCPUBufferizePasses (funcPassManager);
667659}
668660
0 commit comments