Skip to content

Commit 1cbcb4e

Browse files
authored
[LLVMGPU] Delete LLVMGPUPadAndVectorDistribute (iree-org#21095)
LLVMGPUPadAndVectorDistribute was added to handle unaligned batch matmuls before TileAndFuse existed. TileAndFuse now handles unaligned matmuls much better. We don't want to do padding in VectorDistribute this way.
1 parent d347008 commit 1cbcb4e

15 files changed: +9 −535 lines

compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,10 @@ def LLVMGPU_MatmulTensorCoreMmaSync
5050
: I32EnumAttrCase<"LLVMGPUMatmulTensorCoreMmaSync", 107>;
5151
def LLVMGPU_VectorDistribute
5252
: I32EnumAttrCase<"LLVMGPUVectorDistribute", 108>;
53-
def LLVMGPU_PadAndVectorDistribute
54-
: I32EnumAttrCase<"LLVMGPUPadAndVectorDistribute", 109>;
5553
def LLVMGPU_WinogradVectorize
56-
: I32EnumAttrCase<"LLVMGPUWinogradVectorize", 110>;
54+
: I32EnumAttrCase<"LLVMGPUWinogradVectorize", 109>;
5755
def LLVMGPU_TileAndFuse
58-
: I32EnumAttrCase<"LLVMGPUTileAndFuse", 111>;
56+
: I32EnumAttrCase<"LLVMGPUTileAndFuse", 110>;
5957

6058
def SPIRV_BaseLowering
6159
: I32EnumAttrCase<"SPIRVBaseLowering", 200>;
@@ -97,8 +95,7 @@ def DispatchLoweringPassPipelineEnum : I32EnumAttr<
9795
LLVMGPU_Vectorize, LLVMGPU_MatmulTensorCore,
9896
LLVMGPU_TransposeSharedMem, LLVMGPU_WarpReduction,
9997
LLVMGPU_MatmulTensorCoreMmaSync, LLVMGPU_VectorDistribute,
100-
LLVMGPU_PadAndVectorDistribute, LLVMGPU_WinogradVectorize,
101-
LLVMGPU_TileAndFuse,
98+
LLVMGPU_WinogradVectorize, LLVMGPU_TileAndFuse,
10299

103100
// SPIR-V CodeGen pipelines
104101
SPIRV_BaseLowering, SPIRV_BaseDistribute, SPIRV_BaseVectorize,

compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@ iree_compiler_cc_library(
103103
"LLVMGPULowerExecutableTarget.cpp",
104104
"LLVMGPUPackSharedMemoryAlloc.cpp",
105105
"LLVMGPUPrefetching.cpp",
106-
"LLVMGPUPromoteMatmulToFitMMA.cpp",
107106
"LLVMGPUSelectLoweringStrategy.cpp",
108107
"LLVMGPUTensorCoreVectorization.cpp",
109108
"LLVMGPUTensorPad.cpp",

compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,6 @@ iree_cc_library(
8383
"LLVMGPULowerExecutableTarget.cpp"
8484
"LLVMGPUPackSharedMemoryAlloc.cpp"
8585
"LLVMGPUPrefetching.cpp"
86-
"LLVMGPUPromoteMatmulToFitMMA.cpp"
8786
"LLVMGPUSelectLoweringStrategy.cpp"
8887
"LLVMGPUTensorCoreVectorization.cpp"
8988
"LLVMGPUTensorPad.cpp"

compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,7 @@ static bool needsLoweringConfigPropagation(
150150
using Pipeline = IREE::Codegen::DispatchLoweringPassPipeline;
151151
// Pipelines that do not need propagation of lowering config.
152152
Pipeline supportedPipelines[] = {Pipeline::LLVMGPUTileAndFuse,
153-
Pipeline::LLVMGPUVectorDistribute,
154-
Pipeline::LLVMGPUPadAndVectorDistribute};
153+
Pipeline::LLVMGPUVectorDistribute};
155154
return !llvm::is_contained(supportedPipelines, pipeline);
156155
}
157156

@@ -1197,26 +1196,6 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
11971196
/*canUpcastAcc=*/true);
11981197
}
11991198

1200-
// Only batch_matmul is supported in the LLVMGPUPadAndVectorDistribute
1201-
// pipeline.
1202-
// TODO(hanchung): Support cases that there are fused producers.
1203-
if (!schedule && !contractionDims->batch.empty() && !hasFusedLeadingOp(op) &&
1204-
clGPUUnalignedGEMMVectorDistribution) {
1205-
LDBG("Matmul Pad and Vector Distribute");
1206-
pipeline = CodeGenPipeline::LLVMGPUPadAndVectorDistribute;
1207-
bool mustBeAligned = false;
1208-
schedule =
1209-
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes,
1210-
targetSubgroupSize, transposedLhs, transposedRhs,
1211-
/*canUpcastAcc=*/false, mustBeAligned);
1212-
if (!schedule) {
1213-
// Then try again by allowing upcasting accumulator.
1214-
schedule =
1215-
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes,
1216-
targetSubgroupSize, transposedLhs, transposedRhs,
1217-
/*canUpcastAcc=*/true, mustBeAligned);
1218-
}
1219-
}
12201199
if (!schedule) {
12211200
LDBG("Failed to deduce MMA schedule");
12221201
return failure();

compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,7 @@ void LLVMGPULowerExecutableTargetPass::runOnOperation() {
142142
addGPUTransposePassPipeline(pipeline, pipelineOptions);
143143
break;
144144
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUVectorDistribute:
145-
addGPUVectorDistributePassPipeline(pipeline, pipelineOptions,
146-
/*usePadToModelSharedMemcpy=*/false);
147-
break;
148-
case IREE::Codegen::DispatchLoweringPassPipeline::
149-
LLVMGPUPadAndVectorDistribute:
150-
addGPUVectorDistributePassPipeline(pipeline, pipelineOptions,
151-
/*usePadToModelSharedMemcpy=*/true);
145+
addGPUVectorDistributePassPipeline(pipeline, pipelineOptions);
152146
break;
153147
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUWarpReduction:
154148
addGPUWarpReductionPassPipeline(pipeline, forROCDL);

compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPromoteMatmulToFitMMA.cpp

Lines changed: 0 additions & 115 deletions
This file was deleted.

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -855,8 +855,7 @@ static void addVectorBufferizePasses(OpPassManager &funcPassManager) {
855855
}
856856

857857
void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
858-
const GPUPipelineOptions &options,
859-
bool usePadToModelSharedMemcpy) {
858+
const GPUPipelineOptions &options) {
860859

861860
ReorderWorkgroupsStrategy reorderStrategy =
862861
getReorderWorkgroupsStrategy(options.reorderStrategy);
@@ -868,10 +867,6 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
868867
// Some of the elementwise fusion can benefit from this pass.
869868
funcPassManager.addPass(createRematerializeParallelOpsPass());
870869

871-
if (usePadToModelSharedMemcpy) {
872-
funcPassManager.addPass(createLLVMGPUPromoteMatmulToFitMMAPass());
873-
}
874-
875870
funcPassManager.addPass(
876871
IREE::LinalgExt::createConvertAttentionToOnlineAttentionPass());
877872

@@ -916,8 +911,8 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
916911
// be safely dropped. This additionally allows vectorization of convolution to
917912
// `vector.contract` as filter dimensions are expected to be tiled to 1 by
918913
// this point.
919-
funcPassManager.addPass(createLinalgGeneralizeNamedOpsPass());
920-
if (!usePadToModelSharedMemcpy) {
914+
{
915+
funcPassManager.addPass(createLinalgGeneralizeNamedOpsPass());
921916
LinalgFoldUnitExtentDimsPassOptions options;
922917
options.useRankReducingSlices = true;
923918
funcPassManager.addPass(IREE::LinalgExt::createFoldUnitExtentDimsPass());

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,7 @@ void addGPUWinogradVectorizePassPipeline(OpPassManager &funcPassManager);
6666

6767
/// Lowering based on vector distribution patterns.
6868
void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
69-
const GPUPipelineOptions &options,
70-
bool usePadToModelSharedMemcpy);
69+
const GPUPipelineOptions &options);
7170

7271
/// Lowering reductions to warp reductions.
7372
void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager,

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,6 @@ def LLVMGPUPrefetchSharedMemoryPass :
9191
let summary = "Rotate scf.for loops to prefetch shared memory with distance 1";
9292
}
9393

94-
def LLVMGPUPromoteMatmulToFitMMAPass :
95-
InterfacePass<"iree-llvmgpu-promote-matmul-to-fit-mma", "mlir::FunctionOpInterface"> {
96-
let summary = "Pass to promote contraction ops to fit mma shapes";
97-
}
98-
9994
def LLVMGPUSelectLoweringStrategyPass :
10095
Pass<"iree-llvmgpu-select-lowering-strategy", "ModuleOp"> {
10196
let summary = "Select a IREE::HAL::DispatchLoweringPassPipeline for lowering the target variant";

compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ iree_lit_test_suite(
5454
"llvmgpu_bufferize.mlir",
5555
"pack_shared_memory_alloc.mlir",
5656
"prefetch_shared_memory.mlir",
57-
"promote_matmul_to_fit_mma.mlir",
5857
"sort_pipeline_test.mlir",
5958
"tensor_pad.mlir",
6059
"tensorcore_vectorization.mlir",

0 commit comments

Comments (0)