Commit 5b0680d

[Codegen] Add option to disable copy vectorization (#18673)
Vectorization of linalg.copy introduces two vector.transfer ops that immediately fold away, which can cause unexpected results from LICM and end up unlinking copy destinations from their surrounding loops. Since vectorization of a tensor copy does not work anyway, this adds an option to disable vectorization of copies on tensors, deferring it until after bufferization.
1 parent cc3b28f · commit 5b0680d
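For readers wiring this into a custom pipeline, a minimal sketch of opting out of copy vectorization with the new option follows. It mirrors the LLVMGPU change below; the header path, the mlir::iree_compiler namespace, and the generated createGenericVectorizationPass(options) factory are assumptions inferred from the files touched here, not shown verbatim in this diff.

#include "mlir/Pass/PassManager.h"
#include "iree/compiler/Codegen/Common/Passes.h"  // assumed header for the Common codegen passes

using namespace mlir;
using namespace mlir::iree_compiler;  // assumed namespace for IREE codegen passes

// Sketch: configure GenericVectorization so that linalg.copy ops on tensors
// are skipped; copies are then left to be vectorized after bufferization.
static void addVectorizationWithoutCopies(OpPassManager &funcPassManager) {
  GenericVectorizationPassOptions options;
  options.vectorizeCopies = false;  // option added by this commit
  // createGenericVectorizationPass(options) is assumed to be the generated
  // options-taking factory that the existing pipelines use.
  funcPassManager.addPass(createGenericVectorizationPass(options));
}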

File tree

4 files changed: +73, -2 lines

compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp

Lines changed: 3 additions & 0 deletions
@@ -333,6 +333,9 @@ void GenericVectorizationPass::runOnOperation() {
   SmallVector<Operation *> candidates;
   funcOp.walk([&](Operation *op) {
     if (isa<linalg::LinalgOp>(op)) {
+      if (isa<linalg::CopyOp>(op) && !vectorizeCopies) {
+        return;
+      }
       candidates.push_back(op);
     } else if (vectorizePadding && enableVectorMasking &&
                isa<tensor::PadOp>(op)) {

compiler/src/iree/compiler/Codegen/Common/Passes.td

Lines changed: 2 additions & 0 deletions
@@ -286,6 +286,8 @@ def GenericVectorizationPass :
        "Enable vector masking during vectorization.">,
     Option<"useConfiguredVectorSizes", "use-configured-vector-sizes", "bool",/*default=*/"true",
        "Control whether the op lowering config represents a set of masked vector sizes">,
+    Option<"vectorizeCopies", "vectorize-copies", "bool", /*default=*/"true",
+       "Enable vectorization of linalg.copy operations.">,
     Option<"vectorizePadding", "vectorize-padding", "bool", /*default=*/"false",
        "Rewrite all tensor.pad ops in the function to vector form.">,
     Option<"vectorizeGatherAccesses", "vectorize-gather-accesses", "bool", /*default=*/"false",

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

Lines changed: 4 additions & 2 deletions
@@ -216,14 +216,16 @@ static void tileAndBufferize(OpPassManager &funcPassManager) {
   addBufferizePasses(funcPassManager);
 }
 
-static void addGPUVectorizationPasses(OpPassManager &funcPassManager) {
+static void addGPUVectorizationPasses(OpPassManager &funcPassManager,
+                                      bool vectorizeCopies = true) {
   funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass());
   funcPassManager.addPass(IREE::LinalgExt::createDecomposeIm2colPass());
   funcPassManager.addPass(
       IREE::VectorExt::createVectorizeIREEVectorExtOpsPass());
   // Vectorize.
   GenericVectorizationPassOptions options;
   options.vectorizePadding = true;
+  options.vectorizeCopies = vectorizeCopies;
   options.vectorizeGatherAccesses = true;
   options.enableCleanup = false;
   options.foldCastIntoContract = true;
@@ -410,7 +412,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
 
   // Step 6. Lower special ops and vectorize.
   funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass());
-  addGPUVectorizationPasses(funcPassManager);
+  addGPUVectorizationPasses(funcPassManager, /*vectorizeCopies=*/false);
   funcPassManager.addPass(createCleanupBufferAllocViewPass());
   funcPassManager.addPass(createGPUCombineValueBarriersPass());
 
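A short usage note on the helper's new parameter, grounded in the hunks above: call sites that do not pass the second argument keep the previous behavior, because vectorizeCopies defaults to true. The wrapper function name below is illustrative only.

// Illustrative only: a pipeline that calls the helper without the new argument
// still vectorizes copies, since the parameter defaults to true.
static void addSomeOtherGPUPipeline(OpPassManager &funcPassManager) {
  addGPUVectorizationPasses(funcPassManager);  // copies still vectorized
}
// The tile-and-fuse pipeline opts out explicitly:
//   addGPUVectorizationPasses(funcPassManager, /*vectorizeCopies=*/false);
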
compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 64 additions & 0 deletions
@@ -877,3 +877,67 @@ hal.executable public @main {
 // CHECK: vector.insert_strided_slice %[[C_70_4]], {{.*}}offsets = [7, 0, 0, 0, 0, 0]{{.*}} : vector<4xf32> into vector<8x1x2x1x1x4xf32>
 // CHECK: vector.insert_strided_slice %[[C_71_4]], {{.*}}offsets = [7, 0, 1, 0, 0, 0]{{.*}} : vector<4xf32> into vector<8x1x2x1x1x4xf32>
 // CHECK: vector.transfer_write
+
+// -----
+
+#layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
+  #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
+  #hal.pipeline.binding<storage_buffer, Indirect>
+], flags = Indirect>
+
+#lowering_config = #iree_gpu.lowering_config<{
+  promote_operands = [0, 1],
+  reduction = [0, 0, 4],
+  thread = [1, 4, 0],
+  workgroup = [1, 128, 0]
+}>
+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 32>
+
+hal.executable public @main {
+  hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) {
+    hal.executable.export public @small_m_matmul ordinal(0) layout(#layout) {
+    ^bb0(%arg0: !hal.device):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @small_m_matmul() attributes {translation_info = #translation_info} {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan layout(#layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x1000xf32>>
+        %1 = hal.interface.binding.subspan layout(#layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1000x512xf32>>
+        %2 = hal.interface.binding.subspan layout(#layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<4x512xf32>>
+        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 1000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x1000xf32>> -> tensor<4x1000xf32>
+        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1000, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1000x512xf32>> -> tensor<1000x512xf32>
+        %5 = tensor.empty() : tensor<4x512xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x512xf32>) -> tensor<4x512xf32>
+        %7 = linalg.matmul {lowering_config = #lowering_config}
+            ins(%3, %4 : tensor<4x1000xf32>, tensor<1000x512xf32>)
+            outs(%6 : tensor<4x512xf32>) -> tensor<4x512xf32>
+        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4, 512], strides = [1, 1] : tensor<4x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x512xf32>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @small_m_matmul
+// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+// CHECK-DAG: %[[LHS_ALLOC:.+]] = memref.alloc() : memref<1x6xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG: %[[RHS_ALLOC:.+]] = memref.alloc() : memref<4x130xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c1000 step %c4 {{.*}} -> (vector<1x4xf32>)
+// CHECK: gpu.barrier
+
+// TODO: The fact that this read gets hoisted out of the subsequent for loop
+// is a bug in LICM that does no verification that the loop has at least one
+// trip.
+// CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<4xf32>
+// CHECK: scf.for %{{.*}} = %{{.*}} to %c1 step %c32
+// CHECK-NEXT: vector.transfer_write %[[LHS_RD]], %[[LHS_ALLOC]]
+// CHECK: gpu.barrier
+// CHECK-DAG: %[[LHS_MM:.+]] = vector.transfer_read %[[LHS_ALLOC]]{{.*}} vector<4xf32>
+// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read %[[RHS_ALLOC]]{{.*}} vector<4x4xf32>
+// CHECK: vector.contract {{.*}} %[[LHS_MM]], %[[RHS_MM]]
