
Commit 51bf8e0

newling authored and hhkit committed
[Codegen][LLVMGPU] Remove LLVMGPUWarpReduction pipeline (iree-org#21821)
The necessary changes to the LLVMGPUVectorDistribute pipeline and all tests are hopefully already done and landed now, and this can go in cleanly as is.

Signed-off-by: James Newling <[email protected]>
Signed-off-by: Ivan Ho <[email protected]>
1 parent 469fa4e · commit 51bf8e0

File tree: 6 files changed (+4, -294 lines)


compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td

Lines changed: 4 additions & 6 deletions

@@ -42,14 +42,12 @@ def LLVMGPU_Vectorize
     : I32EnumAttrCase<"LLVMGPUVectorize", 103>;
 def LLVMGPU_TransposeSharedMem
     : I32EnumAttrCase<"LLVMGPUTransposeSharedMem", 104>;
-def LLVMGPU_WarpReduction
-    : I32EnumAttrCase<"LLVMGPUWarpReduction", 105>;
 def LLVMGPU_VectorDistribute
-    : I32EnumAttrCase<"LLVMGPUVectorDistribute", 106>;
+    : I32EnumAttrCase<"LLVMGPUVectorDistribute", 105>;
 def LLVMGPU_WinogradVectorize
-    : I32EnumAttrCase<"LLVMGPUWinogradVectorize", 107>;
+    : I32EnumAttrCase<"LLVMGPUWinogradVectorize", 106>;
 def LLVMGPU_TileAndFuse
-    : I32EnumAttrCase<"LLVMGPUTileAndFuse", 108>;
+    : I32EnumAttrCase<"LLVMGPUTileAndFuse", 107>;

 def SPIRV_BaseLowering
     : I32EnumAttrCase<"SPIRVBaseLowering", 200>;
@@ -88,7 +86,7 @@ def DispatchLoweringPassPipelineEnum : I32EnumAttr<

   // LLVMGPU CodeGen pipelines
   LLVMGPU_Default, LLVMGPU_BaseLowering, LLVMGPU_SimpleDistribute,
-  LLVMGPU_Vectorize, LLVMGPU_TransposeSharedMem, LLVMGPU_WarpReduction,
+  LLVMGPU_Vectorize, LLVMGPU_TransposeSharedMem,
   LLVMGPU_VectorDistribute, LLVMGPU_WinogradVectorize, LLVMGPU_TileAndFuse,

   // SPIR-V CodeGen pipelines
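Note: consumers of this enum generally go through the case names (via the generated symbolize/stringify helpers), so compacting the integer values after deleting case 105 is safe; only something that persisted the raw discriminant would observe the shift. A minimal C++ sketch of the effect, using a hypothetical mirror of the generated enum rather than the real IREE header:

#include <cassert>

// Hypothetical mirror of the generated enum; after removing
// LLVMGPUWarpReduction (was 105), the later cases slide down by one.
enum class DispatchLoweringPassPipeline : int {
  LLVMGPUVectorize = 103,
  LLVMGPUTransposeSharedMem = 104,
  LLVMGPUVectorDistribute = 105,  // was 106
  LLVMGPUWinogradVectorize = 106, // was 107
  LLVMGPUTileAndFuse = 107,       // was 108
};

int main() {
  // IR that round-trips the symbol is unaffected; anything that stored the
  // old raw value 106 would now decode as WinogradVectorize instead of
  // VectorDistribute.
  assert(static_cast<int>(
             DispatchLoweringPassPipeline::LLVMGPUVectorDistribute) == 105);
  return 0;
}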

compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp

Lines changed: 0 additions & 228 deletions

@@ -2494,230 +2494,6 @@ static bool isMatvecLike(linalg::LinalgOp linalgOp) {
   return true;
 }

-//====---------------------------------------------------------------------===//
-// Warp Reduction Pipeline Configuration
-//====---------------------------------------------------------------------===//
-
-/// Set the configuration for reductions that can be mapped to warp reductions.
-static LogicalResult
-setWarpReductionConfig(IREE::GPU::TargetAttr target,
-                       mlir::FunctionOpInterface entryPoint,
-                       linalg::LinalgOp op) {
-  if (!target.supportsSubgroupShuffle())
-    return failure();
-
-  SmallVector<unsigned> parallelDims;
-  SmallVector<unsigned> reductionDims;
-  op.getParallelDims(parallelDims);
-  op.getReductionDims(reductionDims);
-
-  SmallVector<int64_t> bounds = op.getStaticLoopRanges();
-  int64_t numParallelDims = op.getNumParallelLoops();
-
-  if (reductionDims.empty())
-    return failure();
-
-  // Make sure reduction dimensions are static and innermost ones.
-  int64_t numDynamicReductionDims = 0;
-  for (unsigned dim : reductionDims) {
-    if (ShapedType::isDynamic(bounds[dim])) {
-      numDynamicReductionDims++;
-    }
-    if (dim < numParallelDims) {
-      return failure();
-    }
-  }
-  int numDynamicDims = llvm::count_if(bounds, ShapedType::isDynamic);
-
-  // Distribution of multi-dim masked writes currently aren't fully supported.
-  if (numDynamicReductionDims > 1) {
-    return failure();
-  }
-
-  if (op.getRegionOutputArgs().size() != 1)
-    return failure();
-
-  // Only support projected permutation, this could be extended to projected
-  // permutated with broadcast.
-  if (llvm::any_of(op.getDpsInputOperands(), [&](OpOperand *input) {
-        return !op.getMatchingIndexingMap(input).isProjectedPermutation();
-      }))
-    return failure();
-
-  bool foundSingleReductionOutput = false;
-  for (auto [index, initOpOperand] : llvm::enumerate(op.getDpsInitsMutable())) {
-    // Only single combiner operations are supported for now.
-    SmallVector<Operation *> combinerOps;
-    if (matchReduction(op.getRegionOutputArgs(), index, combinerOps) &&
-        combinerOps.size() == 1) {
-      if (foundSingleReductionOutput)
-        return failure();
-      foundSingleReductionOutput = true;
-      continue;
-    }
-    if (!op.getMatchingIndexingMap(&initOpOperand).isIdentity())
-      return failure();
-  }
-  if (!foundSingleReductionOutput)
-    return failure();
-
-  SmallVector<int64_t> workgroupTileSizes(op.getNumParallelLoops(), 1);
-
-  int64_t reductionSize = 1;
-  for (int64_t dim : reductionDims)
-    reductionSize *= bounds[dim];
-
-  int64_t subgroupSize = 0;
-  for (int s : target.getWgp().getSubgroupSizeChoices().asArrayRef()) {
-    if (reductionSize % s == 0) {
-      subgroupSize = s;
-      break;
-    }
-  }
-  if (subgroupSize == 0)
-    return failure();
-
-  // Without any bounds on dynamic dims, we need specialization to
-  // get peak performance. For now, just use the warp size.
-  if (numDynamicDims > 0) {
-    SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0);
-    int64_t preferredSubgroupSize = target.getPreferredSubgroupSize();
-    // We should set the subgroup size on:
-    // Priority 1: The innermost reduction dimension with static shapes.
-    // Priority 2: If there's no reduction dimension with static shapes
-    //             then the innermost reduction dim.
-    unsigned lastNonDynamicReductionDim = reductionDims.back();
-    if (reductionDims.size() > 1) {
-      for (unsigned dim : reductionDims) {
-        if (ShapedType::isDynamic(bounds[dim])) {
-          reductionTileSizes[dim] = 1;
-        } else {
-          lastNonDynamicReductionDim = dim;
-        }
-      }
-    }
-    reductionTileSizes[lastNonDynamicReductionDim] = preferredSubgroupSize;
-    TileSizesListType tileSizes;
-    tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level
-    tileSizes.emplace_back(std::move(reductionTileSizes)); // Reduction level
-    std::array<int64_t, 3> workgroupSize = {preferredSubgroupSize, 1, 1};
-    if (failed(setOpConfigAndEntryPointFnTranslation(
-            entryPoint, op, tileSizes, CodeGenPipeline::LLVMGPUWarpReduction,
-            workgroupSize, preferredSubgroupSize))) {
-      return failure();
-    }
-    return success();
-  }
-
-  const Type elementType =
-      llvm::cast<ShapedType>(op.getDpsInitOperand(0)->get().getType())
-          .getElementType();
-  if (!elementType.isIntOrFloat())
-    return failure();
-  unsigned bitWidth = elementType.getIntOrFloatBitWidth();
-  // Reduction distribution only supports 8/16/32 bit types now.
-  if (bitWidth != 32 && bitWidth != 16 && bitWidth != 8)
-    return failure();
-
-  const unsigned largestLoadSizeInBits = 128;
-  unsigned vectorSize = largestLoadSizeInBits / bitWidth;
-  while ((reductionSize / vectorSize) % subgroupSize != 0)
-    vectorSize /= 2;
-
-  // Deduce the workgroup size we should use for reduction. Currently a
-  // workgroup processes all elements in reduction dimensions. Need to make sure
-  // the workgroup size we use can divide the total reduction size, and it's
-  // also within hardware limitations.
-  const int64_t maxWorkgroupSize = 1024;
-  int64_t groupSize = reductionSize / vectorSize;
-  if (groupSize > maxWorkgroupSize) {
-    groupSize = llvm::APIntOps::GreatestCommonDivisor(
-                    {64, uint64_t(groupSize)}, {64, uint64_t(maxWorkgroupSize)})
-                    .getZExtValue();
-  }
-
-  // Then we need to strike a balance--
-  // 1) parallel dimensions are distributed to workgroups. If there are many
-  //    workgroups dispatched, we'd want to have each GPU core hosting multiple
-  //    of them for occupancy.
-  // 2) we want each thread to read quite a few 128-bit vectors for better
-  //    memory cache behavior.
-  // Both means we cannot use a too large workgroup size.
-
-  std::optional<int64_t> parallelSize = 1;
-  for (int64_t dim : parallelDims) {
-    if (ShapedType::isDynamic(bounds[dim])) {
-      parallelSize = std::nullopt;
-      break;
-    }
-    *parallelSize *= bounds[dim];
-  }
-  // Total parallel size that can fill the GPU with enough workgroups.
-  // TODO: query from the target device; roughly 2x hardware compute unit.
-  const int parallelThreshold = 256;
-  // How many 128-bit vectors each thread should at least read.
-  const int targetVectorCount = 8;
-  while (parallelSize && *parallelSize > parallelThreshold &&
-         (groupSize / 2) % subgroupSize == 0 &&
-         reductionSize / (groupSize * vectorSize) < targetVectorCount) {
-    // Use less subgroups per workgroup..
-    groupSize /= 2;
-    // in order to host more workgroups per hardware compute unit.
-    *parallelSize /= 2;
-  }
-
-  // Current warp reduction pattern is a two step butterfly warp reduce.
-  // First, do warp reductions along multiple subgroups.
-  // Second, reduce results from multiple subgroups using single warp reduce.
-  // The final warp reduce requires subgroup count <= subgroup size to work.
-  if ((groupSize / subgroupSize) > subgroupSize)
-    return failure();
-
-  // With just one subgroup per workgroup, make each subgroup do more work and
-  // process a few reductions (rows) along the last parallel dimension.
-  //
-  // TODO: This is enabled for matvec on ROCm for now. We should
-  // validate this strategy and extend to more linalg generics and to CUDA.
-  if (isROCmBackend(target) && ShapedType::isStaticShape(bounds) &&
-      isMatvecLike(op)) {
-    int64_t parallelIdx = *llvm::find_if(
-        parallelDims, [&](int64_t currIdx) { return bounds[currIdx] != 1; });
-    int64_t parallelBound = bounds[parallelIdx];
-    int64_t numParallelReductions = 1;
-    const int64_t maxParallelFactor = groupSize / 4;
-    for (int64_t parallelFactor = 2; (parallelFactor < maxParallelFactor) &&
-                                     (parallelBound % parallelFactor == 0) &&
-                                     (parallelBound > parallelFactor);
-         parallelFactor *= 2) {
-      numParallelReductions = parallelFactor;
-    }
-    workgroupTileSizes[parallelIdx] = numParallelReductions;
-  }
-
-  std::array<int64_t, 3> workgroupSize = {groupSize, 1, 1};
-  SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0);
-  int64_t remainingGroupSize = groupSize;
-  for (int i = reductionDims.size() - 1; i >= 0; --i) {
-    int64_t dim = reductionDims[i];
-    int64_t bound = bounds[dim];
-    if (i == reductionDims.size() - 1)
-      bound /= vectorSize;
-    APInt size = llvm::APIntOps::GreatestCommonDivisor(
-        {64, uint64_t(remainingGroupSize)}, {64, uint64_t(bound)});
-    reductionTileSizes[dim] = size.getSExtValue();
-    if (i == reductionDims.size() - 1)
-      reductionTileSizes[dim] *= vectorSize;
-    remainingGroupSize /= size.getSExtValue();
-  }
-  TileSizesListType tileSizes;
-  tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level
-  tileSizes.emplace_back(std::move(reductionTileSizes)); // Reduction level
-  return setOpConfigAndEntryPointFnTranslation(
-      entryPoint, op, tileSizes, CodeGenPipeline::LLVMGPUWarpReduction,
-      workgroupSize, subgroupSize);
-  return success();
-}
-
 static bool hasTwoOrThreeLoopsInfo(linalg::LinalgOp linalgOp) {
   return linalgOp.getNumParallelLoops() >= 2 &&
          linalgOp.getNumParallelLoops() <= 3;
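For the record, the tile-size arithmetic at the heart of the deleted heuristic is easy to replay in isolation. Below is a standalone sketch in plain C++17 (no IREE APIs); the inputs — a static 4096-element f16 reduction and a subgroup size of 64 — are assumed purely for illustration:

#include <cstdint>
#include <iostream>
#include <numeric>

int main() {
  // Assumed shape: one reduction dim of 4096 elements, f16, subgroup 64.
  const int64_t reductionSize = 4096;
  const unsigned bitWidth = 16;
  const int64_t subgroupSize = 64;

  // Start from the widest 128-bit load and shrink until the per-thread
  // element count divides evenly across subgroups.
  unsigned vectorSize = 128 / bitWidth; // 8 elements per load
  while ((reductionSize / vectorSize) % subgroupSize != 0)
    vectorSize /= 2;

  // One workgroup covers the whole reduction; clamp to the hardware limit
  // via GCD, mirroring the APInt GCD in the removed code.
  const int64_t maxWorkgroupSize = 1024;
  int64_t groupSize = reductionSize / vectorSize; // 512
  if (groupSize > maxWorkgroupSize)
    groupSize = std::gcd(groupSize, maxWorkgroupSize);

  // The two-step butterfly reduce needs subgroupCount <= subgroupSize.
  std::cout << "vectorSize=" << vectorSize << " groupSize=" << groupSize
            << " subgroups=" << groupSize / subgroupSize << "\n";
  // Prints: vectorSize=8 groupSize=512 subgroups=8
  return 0;
}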
@@ -3083,10 +2859,6 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
     LDBG() << "Vector Distribution Subgroup Reduction Config";
     return success();
   }
-  if (succeeded(setWarpReductionConfig(target, entryPointFn, linalgOp))) {
-    LDBG() << "Warp Reduction Config";
-    return success();
-  }
   if (succeeded(setConvolutionConfig(target, entryPointFn, linalgOp, 16))) {
     LDBG() << "Convolution Config";
     return success();
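The "two step butterfly warp reduce" constraint checked in the deleted code (subgroup count <= subgroup size) comes from the reduction schedule it emitted: each subgroup collapses its lanes with xor-shuffles, then a single subgroup reduces the per-subgroup partials the same way. A CPU-side model of that schedule, assuming 2 subgroups of 64 lanes (illustrative only, not the emitted kernel):

#include <cstddef>
#include <iostream>
#include <vector>

// Model one butterfly reduce: every "lane" reads its partner at distance
// `offset` before anyone writes, exactly like a shuffle-xor on a GPU.
// Requires lanes.size() to be a power of two.
static void butterflyReduce(std::vector<float> &lanes) {
  for (size_t offset = lanes.size() / 2; offset > 0; offset /= 2) {
    std::vector<float> prev = lanes; // all lanes read pre-step values
    for (size_t lane = 0; lane < lanes.size(); ++lane)
      lanes[lane] = prev[lane] + prev[lane ^ offset];
  }
  // After the loop, every lane holds the full sum.
}

int main() {
  const size_t subgroupSize = 64, subgroupCount = 2;
  std::vector<float> data(subgroupSize * subgroupCount, 1.0f);

  // Step 1: each subgroup reduces its own 64 lanes.
  std::vector<float> partials;
  for (size_t sg = 0; sg < subgroupCount; ++sg) {
    std::vector<float> lanes(data.begin() + sg * subgroupSize,
                             data.begin() + (sg + 1) * subgroupSize);
    butterflyReduce(lanes);
    partials.push_back(lanes[0]);
  }
  // Step 2: one subgroup reduces the partials; this only fits because
  // subgroupCount (2) <= subgroupSize (64), the constraint checked above.
  butterflyReduce(partials);
  std::cout << partials[0] << "\n"; // prints 128
  return 0;
}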

compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp

Lines changed: 0 additions & 3 deletions

@@ -120,9 +120,6 @@ void LLVMGPULowerExecutableTargetPass::runOnOperation() {
   case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUVectorDistribute:
     addGPUVectorDistributePassPipeline(pipeline, pipelineOptions, forROCDL);
     break;
-  case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUWarpReduction:
-    addGPUWarpReductionPassPipeline(pipeline, forROCDL);
-    break;
   case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUTileAndFuse:
     addGPUTileAndFusePassPipeline(pipeline, pipelineOptions, forROCDL);
     break;
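With the case gone, any configuration still tagged with the old pipeline falls through to whatever default handling this switch provides rather than silently picking a strategy. A schematic of the dispatch shape (simplified and hypothetical; the real switch is over IREE::Codegen::DispatchLoweringPassPipeline inside runOnOperation):

// Simplified stand-in: one case per pipeline, each naming the builder
// that would populate the pass manager; removed cases simply vanish.
enum class Pipeline { VectorDistribute, TileAndFuse, WinogradVectorize };

const char *selectPipelineBuilder(Pipeline p) {
  switch (p) {
  case Pipeline::VectorDistribute:
    return "addGPUVectorDistributePassPipeline";
  case Pipeline::TileAndFuse:
    return "addGPUTileAndFusePassPipeline";
  case Pipeline::WinogradVectorize:
    return "addGPUWinogradVectorizePassPipeline";
  }
  return "unhandled"; // unreachable for a valid enum value
}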

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

Lines changed: 0 additions & 50 deletions

@@ -875,56 +875,6 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(createCSEPass());
 }

-void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager,
-                                     bool forROCDL) {
-  tileAndDistributeToWorkgroup(
-      funcPassManager, /*useForall=*/clDistributeToWorkgroupsUsingForall);
-  funcPassManager.addPass(createRematerializeParallelOpsPass());
-  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
-  funcPassManager.addPass(createGPUTileReductionPass());
-  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
-  funcPassManager.addPass(createCSEPass());
-  funcPassManager.addPass(createPropagateDispatchSizeBoundsPass());
-
-  // Linalg -> vector
-  {
-    GenericVectorizationPassOptions options;
-    options.enableVectorMasking = true;
-    options.useConfiguredVectorSizes = false;
-    options.vectorizePadding = true;
-    options.vectorizeGatherAccesses = true;
-    options.enableCleanup = false;
-    options.generateContract = false;
-    funcPassManager.addPass(createGenericVectorizationPass(options));
-    funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
-    funcPassManager.addPass(createCanonicalizerPass());
-    funcPassManager.addPass(createCSEPass());
-  }
-  funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
-  funcPassManager.addPass(createCanonicalizerPass());
-  funcPassManager.addPass(createCSEPass());
-
-  addBufferizePasses(funcPassManager);
-
-  funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
-  funcPassManager.addPass(createOptimizeVectorTransferPass());
-  funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
-  funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
-  funcPassManager.addPass(createCanonicalizerPass());
-  funcPassManager.addPass(createCSEPass());
-  funcPassManager.addPass(createForOpCanonicalizationPass());
-  funcPassManager.addPass(createCanonicalizerPass());
-
-  // vector -> simt gpu + vector
-  VectorReductionToGPUPassOptions options;
-  options.expandSubgroupReduction = !forROCDL;
-  funcPassManager.addPass(createVectorReductionToGPUPass(options));
-  funcPassManager.addPass(createCanonicalizerPass());
-  funcPassManager.addPass(createCSEPass());
-  funcPassManager.addPass(affine::createLoopCoalescingPass());
-  funcPassManager.addPass(createCanonicalizerPass());
-}
-
 void addGPUSimpleDistributePassPipeline(OpPassManager &funcPassManager) {
   tileAndBufferize(funcPassManager);

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h

Lines changed: 0 additions & 4 deletions

@@ -60,10 +60,6 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
                                         const GPUPipelineOptions &options,
                                         bool forROCDL);

-/// Lowering reductions to warp reductions.
-void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager,
-                                     bool forROCDL = true);
-
 /// Default pass pipeline on GPU, currently used only for the ukernel path.
 void addGPUDefaultPassPipeline(OpPassManager &funcPassManager,
                                const GPUPipelineOptions &options);

compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp

Lines changed: 0 additions & 3 deletions

@@ -69,9 +69,6 @@ class ROCDLLowerExecutableTargetPass final
   case CodeGenPipeline::LLVMGPUBaseLowering:
     addGPUBaseLoweringPassPipeline(pipeline);
     break;
-  case CodeGenPipeline::LLVMGPUWarpReduction:
-    addGPUWarpReductionPassPipeline(pipeline, /*forROCDL=*/true);
-    break;
   case CodeGenPipeline::LLVMGPUTileAndFuse:
     addGPUTileAndFusePassPipeline(pipeline, pipelineOptions,
                                   /*forROCDL=*/true);
