Skip to content

Commit 3b751a4

Browse files
authored
[LLVMCPU] Enable tileDispatchUsingForall as default (#18777)
1 parent e96e3c0 commit 3b751a4

File tree

7 files changed

+38
-40
lines changed

7 files changed

+38
-40
lines changed

compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -202,13 +202,16 @@ static LogicalResult dropUnitDistributedDims(RewriterBase &rewriter,
202202
llvm::SmallDenseSet<int> droppedLoops;
203203
for (auto [index, lb, ub, step] :
204204
llvm::enumerate(mixedLbs, mixedUbs, mixedSteps)) {
205-
if (!isa<Attribute>(lb) || !isa<Attribute>(ub) || !isa<Attribute>(step)) {
205+
206+
std::optional<int64_t> lbVal = getConstantIntValue(lb);
207+
std::optional<int64_t> ubVal = getConstantIntValue(ub);
208+
std::optional<int64_t> stepVal = getConstantIntValue(step);
209+
210+
if (!(lbVal && ubVal && stepVal)) {
206211
continue;
207212
}
208-
int64_t lbVal = getConstantIntValue(lb).value();
209-
int64_t ubVal = getConstantIntValue(ub).value();
210-
int64_t stepVal = getConstantIntValue(step).value();
211-
if (CEILDIV(ubVal - lbVal, stepVal) == 1) {
213+
214+
if (CEILDIV(ubVal.value() - lbVal.value(), stepVal.value()) == 1) {
212215
droppedLoops.insert(index);
213216
}
214217
}

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp

Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ static llvm::cl::opt<bool> clEnableVectorContractCustomKernels(
9595
static llvm::cl::opt<bool> clTileDispatchUsingForall(
9696
"iree-llvmcpu-tile-dispatch-using-forall",
9797
llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"),
98-
llvm::cl::init(false));
98+
llvm::cl::init(true));
9999

100100
// By default, IREE does not enable the Armv9-A streaming SVE mode in the
101101
// presence of scalable vectors (even when using `+sme`), as currently there's
@@ -111,9 +111,8 @@ static llvm::cl::opt<bool> clForceArmStreaming(
111111
llvm::cl::init(false));
112112

113113
// TODO: Enable `TileDispatchUsingForall` for every pipeline.
114-
static void addTileAndDistributePasses(OpPassManager &funcPassManager,
115-
bool enableTileDispatchUsingForall) {
116-
if (enableTileDispatchUsingForall || clTileDispatchUsingForall) {
114+
static void addTileAndDistributePasses(OpPassManager &funcPassManager) {
115+
if (clTileDispatchUsingForall) {
117116
funcPassManager.addPass(
118117
createTileAndDistributeToWorkgroupsUsingForallOpPass());
119118
} else {
@@ -346,8 +345,7 @@ void buildLLVMCPUVectorLoweringPipeline(
346345
void addCPUBufferOpsTileAndVectorizePipeline(
347346
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
348347
LLVMCPUPipelineOptions &pipelineOpt) {
349-
addTileAndDistributePasses(funcPassManager,
350-
/*enableTileDispatchUsingForall=*/true);
348+
addTileAndDistributePasses(funcPassManager);
351349

352350
// Skip tiling reduction loops because this is expected to apply on copy ops
353351
// only.
@@ -384,8 +382,7 @@ void addCPUBufferOpsTileAndVectorizePipeline(
384382
void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
385383
TilingConfig &tilingConfig,
386384
LLVMCPUPipelineOptions &pipelineOpt) {
387-
addTileAndDistributePasses(funcPassManager,
388-
/*enableTileDispatchUsingForall=*/true);
385+
addTileAndDistributePasses(funcPassManager);
389386

390387
SmallVector<int64_t> allFusableLevels(tilingConfig.getFusableLevels());
391388
// Apply tile and fuse to all the non-distribution fusable levels. Skip
@@ -464,8 +461,7 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
464461
void addConvTileAndDecomposeExpertPassPipeline(
465462
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
466463
LLVMCPUPipelineOptions &pipelineOpt) {
467-
addTileAndDistributePasses(funcPassManager,
468-
/*enableTileDispatchUsingForall=*/true);
464+
addTileAndDistributePasses(funcPassManager);
469465

470466
// Run LLVMTileAndFuse first in case we have fill + conv + generic
471467
// ops. At this stage, we do not apply vectorization. The reduction dim won't
@@ -528,8 +524,7 @@ void addConvTileAndDecomposeExpertPassPipeline(
528524
void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
529525
TilingConfig &tilingConfig,
530526
LLVMCPUPipelineOptions &pipelineOpt) {
531-
addTileAndDistributePasses(funcPassManager,
532-
/*enableTileDispatchUsingForall=*/true);
527+
addTileAndDistributePasses(funcPassManager);
533528

534529
funcPassManager.addPass(createLLVMCPUTileAndFusePass(
535530
static_cast<int64_t>(tilingConfig.getVectorCommonParallelLevel())));
@@ -577,8 +572,7 @@ void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
577572
void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
578573
TilingConfig &tilingConfig,
579574
LLVMCPUPipelineOptions &pipelineOpt) {
580-
addTileAndDistributePasses(funcPassManager,
581-
/*enableTileDispatchUsingForall=*/true);
575+
addTileAndDistributePasses(funcPassManager);
582576

583577
// The below two passes are nop if pack/unpack is not specified in ukernels
584578
// attribute. By default, they are disabled.
@@ -621,8 +615,7 @@ void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
621615
void addCPULinalgExtTileAndVectorizePipeline(
622616
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
623617
LLVMCPUPipelineOptions &pipelineOpt) {
624-
addTileAndDistributePasses(funcPassManager,
625-
/*enableTileDispatchUsingForall=*/false);
618+
addTileAndDistributePasses(funcPassManager);
626619
funcPassManager.addPass(
627620
createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
628621
// TODO: Remove the pass once we have PartialReductionOpInterface implemented
@@ -661,8 +654,7 @@ void addCPULinalgExtTileAndVectorizePipeline(
661654
}
662655

663656
void addCPUDefaultPassPipeline(OpPassManager &funcPassManager) {
664-
addTileAndDistributePasses(funcPassManager,
665-
/*enableTileDispatchUsingForall=*/false);
657+
addTileAndDistributePasses(funcPassManager);
666658
addCPUBufferizePasses(funcPassManager);
667659
}
668660

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ hal.executable private @main {
290290
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
291291
// CHECK-DAG: %[[C720:.+]] = arith.constant 720 : index
292292
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
293-
// CHECK: scf.forall ({{.*}}) in (2, 4, 1, 5) {
293+
// CHECK: scf.forall ({{.*}}) in (2, 4, 5) {
294294
// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x4x1x4x4x1xf32>)
295295
// CHECK: gpu.barrier
296296
// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
@@ -307,7 +307,7 @@ hal.executable private @main {
307307
// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x4x4x1xf32> to vector<1x4x1x4x4x1xf32>
308308
// CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<4x1x4x4x1xf32> from vector<1x4x1x4x4x1xf32>
309309
// CHECK: vector.transfer_write %[[EXTRACT]], %[[B2]]
310-
// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
310+
// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
311311

312312
// -----
313313

compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,17 @@
99

1010
stream.executable private @__builtin_fill_i64 {
1111
stream.executable.export public @__builtin_fill_i64 workgroups(%arg0: index) -> (index, index, index) {
12-
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
12+
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
1313
stream.return %x, %y, %z : index, index, index
1414
}
1515
builtin.module {
1616
func.func @__builtin_fill_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
1717
%c0 = arith.constant 0 : index
18-
%out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
19-
%0 = tensor.empty(%count) : tensor<?xi64>
18+
%count0 = flow.dispatch.workload.ordinal %count, 0 : index
19+
%out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
20+
%0 = tensor.empty(%count0) : tensor<?xi64>
2021
%1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
21-
flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
22+
flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
2223
return
2324
}
2425
}

compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,17 @@
99

1010
stream.executable private @__builtin_splat_i64 {
1111
stream.executable.export public @__builtin_splat_i64 workgroups(%arg0: index) -> (index, index, index) {
12-
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
12+
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
1313
stream.return %x, %y, %z : index, index, index
1414
}
1515
builtin.module {
1616
func.func @__builtin_splat_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
1717
%c0 = arith.constant 0 : index
18-
%out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
19-
%0 = tensor.empty(%count) : tensor<?xi64>
18+
%count0 = flow.dispatch.workload.ordinal %count, 0 : index
19+
%out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
20+
%0 = tensor.empty(%count0) : tensor<?xi64>
2021
%1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
21-
flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
22+
flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
2223
return
2324
}
2425
}

compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,14 @@ isFusableWithConsumer(OpOperand &fusedOperand,
547547
return false;
548548
}
549549

550+
// TODO: Enable grouped convolution and depthwise pooling fusion.
551+
// Right now, this is going through the default CPU pipeline and not through
552+
// CONVTilingExpert.
553+
if (isa<linalg::Conv2DNgchwFgchwOp, linalg::Conv2DNgchwGfchwOp,
554+
linalg::PoolingNdhwcSumOp>(producer)) {
555+
return false;
556+
}
557+
550558
auto producerFusionOp =
551559
dyn_cast<IREE::LinalgExt::LinalgFusionOpInterface>(producer);
552560
auto consumerFusionOp =

tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -392,13 +392,6 @@
392392
"onnx/node/generated/test_softsign_example",
393393
"onnx/node/generated/test_stft",
394394
"onnx/node/generated/test_stft_with_window",
395-
"onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip0",
396-
"onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip5",
397-
"onnx/node/generated/test_tfidfvectorizer_tf_batch_uniandbigrams_skip5",
398-
"onnx/node/generated/test_tfidfvectorizer_tf_only_bigrams_skip0",
399-
"onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_levelempty",
400-
"onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_skip5",
401-
"onnx/node/generated/test_tfidfvectorizer_tf_uniandbigrams_skip5",
402395
"onnx/node/generated/test_training_dropout",
403396
"onnx/node/generated/test_training_dropout_default",
404397
"onnx/node/generated/test_training_dropout_default_mask",

0 commit comments

Comments
 (0)