Skip to content
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
029b2cc
Creates AMDToGPUPass to house a subgroup reduce lowering pattern to DPP
Muzammiluddin-Syed-ECE Mar 25, 2025
427c817
Fix for numerical issues in MatVec tests
Muzammiluddin-Syed-ECE Apr 2, 2025
655251b
Rewrites pattern to be closer to device lib impl.
Muzammiluddin-Syed-ECE Apr 3, 2025
081d6f7
Removes AMDToGPUPass, moving pattern into existing pass
Muzammiluddin-Syed-ECE Apr 3, 2025
0d560c2
Adding permlanex16 and other dpp related ops to mlir dialect
Muzammiluddin-Syed-ECE Apr 10, 2025
015e9b9
Fixing permlanex16 intrinsic failure
Muzammiluddin-Syed-ECE Apr 11, 2025
945f0e8
simplify verbose typing
Muzammiluddin-Syed-ECE Apr 11, 2025
1b356ed
testing numerics
Muzammiluddin-Syed-ECE Apr 12, 2025
7fd30c0
fixing
Muzammiluddin-Syed-ECE Apr 12, 2025
0c28b4d
fixing
Muzammiluddin-Syed-ECE Apr 12, 2025
bfda712
fixing
Muzammiluddin-Syed-ECE Apr 12, 2025
54c08ef
trying again
Muzammiluddin-Syed-ECE Apr 14, 2025
6535bda
Fixing implementation
Muzammiluddin-Syed-ECE Apr 14, 2025
85e3b62
Adding DPP test
Muzammiluddin-Syed-ECE Apr 14, 2025
3392f08
Addressing PR comments
Muzammiluddin-Syed-ECE Apr 14, 2025
b59922a
removing unnecessary header
Muzammiluddin-Syed-ECE Apr 14, 2025
6431293
Addressing PR comments
Muzammiluddin-Syed-ECE Apr 16, 2025
ae25fa0
moving permlanex16 changes to another commit
Muzammiluddin-Syed-ECE Apr 16, 2025
9745098
fixing test
Muzammiluddin-Syed-ECE Apr 16, 2025
a6c35b3
fixing code formatting
Muzammiluddin-Syed-ECE Apr 16, 2025
8a9cefb
Updating implementation to support gfx 10+
Muzammiluddin-Syed-ECE Apr 16, 2025
c395203
Small formatting change
Muzammiluddin-Syed-ECE Apr 16, 2025
ab15c44
Removing ReadlaneOps from test
Muzammiluddin-Syed-ECE Apr 16, 2025
55f442e
Improve dpp implementation
Muzammiluddin-Syed-ECE Apr 16, 2025
6442288
fixing formatting
Muzammiluddin-Syed-ECE Apr 17, 2025
848c6ba
Fixing implementation of DPP subgroup reduce
Muzammiluddin-Syed-ECE Apr 22, 2025
6da1653
Addressing PR comments
Muzammiluddin-Syed-ECE Apr 23, 2025
e19a615
Fixing Typo in RUN command
Muzammiluddin-Syed-ECE Apr 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_

#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/IR/PatternMatch.h"
Expand Down Expand Up @@ -68,6 +69,20 @@ void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);

/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
/// ops over scalar types. Assumes that the subgroup has
/// `subgroupSize` lanes. Applicable only to AMD GPUs.
///
/// The registered pattern only matches `gpu.subgroup_reduce` ops without a
/// `cluster_size`; use `populateGpuLowerClusteredSubgroupReduceToDPPPatterns`
/// for the clustered form. `chipset` selects the cross-row strategy used for
/// reductions spanning 32 or more lanes, and `benefit` is forwarded to the
/// registered pattern.
void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
unsigned subgroupSize,
amdgpu::Chipset chipset,
PatternBenefit benefit = 1);

/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns`
/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
/// Takes the same `subgroupSize`, `chipset` and `benefit` arguments. The
/// pattern rejects clusters whose lane stride is not 1.
void populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
PatternBenefit benefit = 1);

/// Collect all patterns to rewrite ops within the GPU dialect.
inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
populateGpuAllReducePatterns(patterns);
Expand Down
177 changes: 177 additions & 0 deletions mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,19 @@
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
Expand Down Expand Up @@ -362,6 +366,163 @@ struct VectorSubgroupReduceToShuffles final
unsigned shuffleBitwidth = 0;
bool matchClustered = false;
};

/// Builds a reduction of `input` across clusters of `ci.clusterSize` lanes,
/// combining partial results with the arithmetic op selected by `mode`.
///
/// Cluster sizes 2 through 16 are handled with `amdgpu.dpp` permutations
/// (quad_perm, row_half_mirror, row_mirror). Cluster sizes 32 and 64 need to
/// cross DPP rows and the strategy depends on `chipset.majorVersion`:
///   * <= 9:  `amdgpu.dpp` with row_bcast_15 / row_bcast_31 and a row mask;
///   * <= 12: `ROCDL::PermlaneX16Op` followed by `ROCDL::ReadlaneOp`.
///
/// \param rewriter  Rewriter used to create every new op.
/// \param op        Reduction being lowered; supplies the location and is the
///                  op match failures are reported against.
/// \param input     Per-lane value to reduce.
/// \param mode      Reduction kind, converted via `gpu::convertReductionKind`.
/// \param ci        Cluster description; `clusterSize` and `subgroupSize` are
///                  read here.
/// \param chipset   Target chipset; only `majorVersion` is consulted.
/// \returns The reduced value (same type as `input`), or a match failure when
///          a cluster size >= 32 is requested for a chipset whose major
///          version is greater than 12.
FailureOr<Value>
createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op,
                           Value input, gpu::AllReduceOperation mode,
                           const ClusterInfo &ci, amdgpu::Chipset chipset) {
  Location loc = op.getLoc();
  Value dpp;
  Value res = input;
  constexpr int allRows = 0xf;
  constexpr int allBanks = 0xf;
  const bool boundCtrl = true;
  const auto kind = gpu::convertReductionKind(mode);

  // Combines two per-lane values with the requested reduction op.
  auto combine = [&](Value lhs, Value rhs) -> Value {
    return vector::makeArithReduction(rewriter, loc, kind, lhs, rhs);
  };
  // Materializes an i32 constant (lane indices and permlane selectors).
  auto makeI32Constant = [&](int32_t value) -> Value {
    return rewriter.create<arith::ConstantOp>(
        loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(value));
  };

  if (ci.clusterSize >= 2) {
    // Perform reduction between all lanes N <-> N+1.
    dpp = rewriter.create<amdgpu::DPPOp>(
        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
        rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
    res = combine(res, dpp);
  }

  if (ci.clusterSize >= 4) {
    // Perform reduction between all lanes N <-> N+2.
    dpp = rewriter.create<amdgpu::DPPOp>(
        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
        rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
    res = combine(res, dpp);
  }

  if (ci.clusterSize >= 8) {
    // Perform reduction between all lanes N <-> 7-N,
    // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
    dpp = rewriter.create<amdgpu::DPPOp>(
        loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
    res = combine(res, dpp);
  }

  if (ci.clusterSize >= 16) {
    // Perform reduction between all lanes N <-> 15-N,
    // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
    dpp = rewriter.create<amdgpu::DPPOp>(
        loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
    res = combine(res, dpp);
  }

  if (ci.clusterSize >= 32) {
    if (chipset.majorVersion <= 9) {
      // Broadcast last value from each row to next row, restricted by the
      // row mask.
      // NOTE(review): row mask 0xa has bits 1 and 3 set, which presumably
      // enables rows 1 and 3 and leaves rows 0 and 2 untouched; the previous
      // comment claimed the opposite - confirm against the DPP row_mask
      // semantics. If so, the 32-lane result is only present in rows 1 and 3.
      dpp = rewriter.create<amdgpu::DPPOp>(
          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15,
          rewriter.getUnitAttr(), 0xa, allBanks,
          /*bound_ctrl*/ false);
      res = combine(res, dpp);
    } else if (chipset.majorVersion <= 12) {
      // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
      Value uint32Max = makeI32Constant(-1);
      dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
                                                  uint32Max, uint32Max,
                                                  /*fi=*/true,
                                                  /*bound_ctrl=*/false);
      res = combine(res, dpp);
      if (ci.subgroupSize == 32) {
        // The whole subgroup has been reduced; read the value back from
        // lane 0. The result must feed `res`: storing it in `dpp` would leave
        // the readlane unused, since `dpp` is not read again on this path.
        Value lane0 = makeI32Constant(0);
        res =
            rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
      }
    } else {
      return rewriter.notifyMatchFailure(
          op, "Subgroup reduce lowering to DPP not currently supported for "
              "this device.");
    }
  }

  if (ci.clusterSize >= 64) {
    if (chipset.majorVersion <= 9) {
      // Broadcast 31st lane value to rows 2 and 3.
      // Use row mask to avoid polluting rows 0 and 1.
      // NOTE(review): after this step the full 64-lane result presumably only
      // lives in the rows selected by the mask - confirm that consumers do
      // not need it in every lane.
      dpp = rewriter.create<amdgpu::DPPOp>(
          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31,
          rewriter.getUnitAttr(), 0xc, allBanks,
          /*bound_ctrl*/ false);
    } else if (chipset.majorVersion <= 12) {
      // Assume reduction across 32 lanes has been done.
      // Perform final reduction manually by combining the values in lane 0
      // and lane 32.
      Value lane0 = makeI32Constant(0);
      Value lane32 = makeI32Constant(32);
      dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
      res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
    } else {
      return rewriter.notifyMatchFailure(
          op, "Subgroup reduce lowering to DPP not currently supported for "
              "this device.");
    }
    // Both supported branches leave the two operands in `res` and `dpp`.
    res = combine(res, dpp);
  }

  assert(res.getType() == input.getType());
  return res;
}

/// Rewrite pattern that lowers a scalar `gpu.subgroup_reduce` to the op
/// sequence built by `createSubgroupDPPReduction`. Assumes that the subgroup
/// has `subgroupSize` lanes. Applicable only to AMD GPUs.
///
/// An instance matches either clustered or non-clustered reductions, never
/// both, as selected by `matchClustered`. The match is rejected when:
///   * the op's clustered-ness differs from `matchClustered`;
///   * `getAndValidateClusterInfo` fails for the op;
///   * the cluster stride is not 1 (lanes must be contiguous);
///   * the result type is not an integer or float scalar;
///   * `createSubgroupDPPReduction` fails for the given `chipset`.
struct ScalarSubgroupReduceToDPP final
    : OpRewritePattern<gpu::SubgroupReduceOp> {
  ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
                            bool matchClustered, amdgpu::Chipset chipset,
                            PatternBenefit benefit)
      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
        matchClustered(matchClustered), chipset(chipset) {}

  LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
                                PatternRewriter &rewriter) const override {
    // Clustered and non-clustered ops are handled by separate pattern
    // instances so that they can be populated independently.
    if (op.getClusterSize().has_value() != matchClustered) {
      return rewriter.notifyMatchFailure(
          op, llvm::formatv("op is {0}clustered but pattern is configured to "
                            "only match {1}clustered ops",
                            matchClustered ? "non-" : "",
                            matchClustered ? "" : "non-"));
    }

    auto ci = getAndValidateClusterInfo(op, subgroupSize);
    if (failed(ci))
      return failure();

    if (ci->clusterStride != 1)
      return rewriter.notifyMatchFailure(
          op, "Subgroup reductions using DPP are currently only available for "
              "clusters of contiguous lanes.");

    Type valueTy = op.getType();
    if (!valueTy.isIntOrFloat())
      return rewriter.notifyMatchFailure(
          op, "value type is not a compatible scalar");

    FailureOr<Value> dpp = createSubgroupDPPReduction(
        rewriter, op, op.getValue(), op.getOp(), *ci, chipset);
    if (failed(dpp))
      return failure();

    rewriter.replaceOp(op, dpp.value());
    return success();
  }

private:
  /// Number of lanes in a subgroup, used to validate the cluster info.
  unsigned subgroupSize = 0;
  /// Whether this instance matches ops with (true) or without (false) a
  /// `cluster_size`.
  bool matchClustered = false;
  /// Target chipset forwarded to `createSubgroupDPPReduction`.
  amdgpu::Chipset chipset;
};
} // namespace

void mlir::populateGpuBreakDownSubgroupReducePatterns(
Expand All @@ -372,6 +533,22 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
patterns.add<ScalarizeSingleElementReduce>(patterns.getContext(), benefit);
}

void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
    RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
    PatternBenefit benefit) {
  // Register the DPP lowering for reductions that carry no `cluster_size`.
  MLIRContext *context = patterns.getContext();
  const bool matchClustered = false;
  patterns.add<ScalarSubgroupReduceToDPP>(context, subgroupSize, matchClustered,
                                          chipset, benefit);
}

void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
    RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
    PatternBenefit benefit) {
  // Register the DPP lowering for reductions that specify a `cluster_size`.
  MLIRContext *context = patterns.getContext();
  const bool matchClustered = true;
  patterns.add<ScalarSubgroupReduceToDPP>(context, subgroupSize, matchClustered,
                                          chipset, benefit);
}

void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth, PatternBenefit benefit) {
Expand Down
Loading
Loading