Skip to content

Commit 4a66ccb

Browse files
Creates AMDToGPUPass to house a subgroup reduce lowering pattern to DPP
ops. Signed-off-by: Muzammiluddin Syed <[email protected]>
1 parent ffaaace commit 4a66ccb

File tree

7 files changed

+249
-0
lines changed

7 files changed

+249
-0
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
//===- GPUToAMDGPU.h - Convert GPU to AMDGPU dialect ------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
9+
#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_
10+
11+
12+
#include "mlir/IR/PatternMatch.h"
13+
#include <memory>
14+
#include <string>
15+
16+
namespace mlir {
17+
18+
class LLVMTypeConverter;
19+
class RewritePatternSet;
20+
class TypeConverter;
21+
class Pass;
22+
23+
#define GEN_PASS_DECL_CONVERTGPUTOAMDGPUPASS
24+
#include "mlir/Conversion/Passes.h.inc"
25+
26+
/// Adds to `patterns` the rewrite pattern that lowers `gpu.subgroup_reduce`
/// ops carrying an explicit cluster size to AMDGPU DPP operations.
///
/// `subgroupSize` is the subgroup width the cluster size and cluster stride of
/// each op are validated against; it must be a power of two. `benefit` is the
/// benefit assigned to the added pattern.
void populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
27+
unsigned subgroupSize,
28+
PatternBenefit benefit);
29+
30+
} // namespace mlir
31+
32+
#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_

mlir/include/mlir/Conversion/Passes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
3535
#include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h"
3636
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
37+
#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
3738
#include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
3839
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
3940
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"

mlir/include/mlir/Conversion/Passes.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,22 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
643643
];
644644
}
645645

646+
//===----------------------------------------------------------------------===//
647+
// GPUToAMDGPU
648+
//===----------------------------------------------------------------------===//
649+
650+
// Pass `convert-gpu-to-amdgpu`: rewrites gpu operations into AMDGPU dialect
// operations. Not anchored to a specific operation type. The `subgroup-size`
// option (default 64) is the subgroup width used by the lowering.
def ConvertGPUToAMDGPUPass : Pass<"convert-gpu-to-amdgpu"> {
651+
let summary = "Generate AMDGPU operations for gpu operations";
652+
let dependentDialects = [
653+
"amdgpu::AMDGPUDialect",
654+
"LLVM::LLVMDialect",
655+
"ROCDL::ROCDLDialect",
656+
];
657+
let options = [Option<"subgroupSize", "subgroup-size", "unsigned",
658+
/*default=*/"64",
659+
"Size of subgroup">];
660+
}
661+
646662
//===----------------------------------------------------------------------===//
647663
// ConvertIndexToLLVMPass
648664
//===----------------------------------------------------------------------===//

mlir/lib/Conversion/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ add_subdirectory(FuncToEmitC)
2424
add_subdirectory(FuncToLLVM)
2525
add_subdirectory(FuncToSPIRV)
2626
add_subdirectory(GPUCommon)
27+
add_subdirectory(GPUToAMDGPU)
2728
add_subdirectory(GPUToLLVMSPV)
2829
add_subdirectory(GPUToNVVM)
2930
add_subdirectory(GPUToROCDL)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Conversion library implementing the GPU -> AMDGPU lowering pass and patterns.
add_mlir_conversion_library(MLIRGPUToAMDGPU
  GPUToAMDGPU.cpp

  ADDITIONAL_HEADER_DIRS
  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToAMDGPU

  DEPENDS
  MLIRConversionPassIncGen

  LINK_COMPONENTS
  Core

  LINK_LIBS PUBLIC
  MLIRLLVMCommonConversion
  MLIRLLVMDialect
  MLIRGPUDialect
  MLIRAMDGPUDialect
  MLIRAMDGPUUtils
  MLIRROCDLDialect
  # GPUToAMDGPU.cpp calls vector::makeArithReduction.
  MLIRVectorDialect
  MLIRPass
  MLIRTransforms
  )
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h"
10+
11+
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
12+
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
13+
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
14+
#include "mlir/IR/BuiltinTypes.h"
15+
#include "mlir/IR/TypeUtilities.h"
16+
#include "mlir/Pass/Pass.h"
17+
18+
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
19+
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
20+
#include "mlir/Dialect/Vector/IR/VectorOps.h"
21+
22+
#include "mlir/Transforms/WalkPatternRewriteDriver.h"
23+
#include "llvm/Support/FormatVariadic.h"
24+
25+
namespace mlir {
26+
#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS
27+
#include "mlir/Conversion/Passes.h.inc"
28+
} // namespace mlir
29+
30+
using namespace mlir;
31+
32+
namespace {
33+
/// Cluster configuration of a subgroup reduction, as resolved by
/// getAndValidateClusterInfo. All fields default to zero so that a
/// default-constructed instance never holds indeterminate values.
struct ClusterInfo {
  /// Cluster stride taken from the reduce op.
  unsigned clusterStride = 0;
  /// Effective cluster size: the op's cluster size when it has one, otherwise
  /// the subgroup size.
  unsigned clusterSize = 0;
  /// Subgroup size the lowering was configured with.
  unsigned subgroupSize = 0;
};
38+
39+
/// Resolves the cluster configuration of `op` and checks it against
/// `subgroupSize`, which must be a power of two.
///
/// On success returns the op's cluster stride, the effective cluster size (the
/// op's cluster size, or `subgroupSize` when the op does not specify one) and
/// `subgroupSize`. Emits an op error and returns failure when the cluster size
/// is larger than the subgroup size, or when the cluster stride is not
/// strictly smaller than the subgroup size.
static FailureOr<ClusterInfo>
getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
  assert(llvm::isPowerOf2_32(subgroupSize));

  // Cluster size: optional on the op; falls back to the whole subgroup.
  const std::optional<uint32_t> requestedSize = op.getClusterSize();
  if (requestedSize.has_value()) {
    // Verifier should've caught this.
    assert(llvm::isPowerOf2_32(*requestedSize));
    if (subgroupSize < *requestedSize) {
      return op.emitOpError()
             << "cluster size " << *requestedSize
             << " is greater than subgroup size " << subgroupSize;
    }
  }
  const unsigned resolvedSize = requestedSize.value_or(subgroupSize);

  // Cluster stride: must stay within the subgroup.
  const auto stride = op.getClusterStride();
  // Verifier should've caught this.
  assert(llvm::isPowerOf2_32(stride));
  if (!(stride < subgroupSize)) {
    return op.emitOpError()
           << "cluster stride " << stride
           << " is not less than subgroup size " << subgroupSize;
  }

  return ClusterInfo{stride, resolvedSize, subgroupSize};
}
61+
62+
/// Emits a sequence of `amdgpu.dpp` shuffles, each followed by an arithmetic
/// combine of kind `mode`, that reduces `input` across lanes, and returns the
/// value read back from lane 63 with `rocdl.readlane`.
///
/// One shuffle/combine step is emitted for each threshold the cluster size in
/// `ci` reaches:
///   >= 2  : `row_shr` with argument 1
///   >= 4  : `row_shr` with argument 2
///   >= 8  : `row_half_mirror`
///   >= 16 : `row_mirror`
///   >= 32 : `row_bcast_15` (row mask 10, bank mask 15, bound_ctrl = false)
///   == 64 : `row_bcast_31` (row mask 12, bank mask 15, bound_ctrl = false)
///
/// The returned value has the same type as `input`.
///
/// NOTE(review): the final readlane always targets lane 63, and neither
/// `ci.clusterStride` nor `ci.subgroupSize` is consulted. That looks correct
/// only for a full 64-lane reduction with stride 1 - confirm before enabling
/// this for smaller clusters or subgroups.
Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
                                 gpu::AllReduceOperation mode,
                                 const ClusterInfo &ci) {
  Value result = input;

  // Combines the running partial result with a shuffled copy of it.
  auto accumulate = [&](Value shuffled) {
    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                        result, shuffled);
  };

  // Emits a `row_shr` DPP of the running result by `lanes` lanes.
  auto rowShiftRight = [&](int64_t lanes) -> Value {
    auto permArg = b.getIntegerAttr(b.getIntegerType(32), lanes);
    return b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
                                   amdgpu::DPPPerm::row_shr, permArg);
  };

  // Emits a DPP of the running result whose permutation takes no argument.
  auto unitPerm = [&](amdgpu::DPPPerm perm) -> Value {
    return b.create<amdgpu::DPPOp>(loc, result.getType(), result, result, perm,
                                   b.getUnitAttr());
  };

  // Emits a row-broadcast DPP restricted by `rowMask`, with all banks enabled
  // (bank mask 15) and bound_ctrl disabled.
  auto rowBroadcast = [&](amdgpu::DPPPerm perm, int rowMask) -> Value {
    return b.create<amdgpu::DPPOp>(loc, result.getType(), result, result, perm,
                                   b.getUnitAttr(), rowMask, 15, false);
  };

  if (ci.clusterSize >= 2)
    accumulate(rowShiftRight(1));

  if (ci.clusterSize >= 4)
    accumulate(rowShiftRight(2));

  if (ci.clusterSize >= 8)
    accumulate(unitPerm(amdgpu::DPPPerm::row_half_mirror));

  if (ci.clusterSize >= 16)
    accumulate(unitPerm(amdgpu::DPPPerm::row_mirror));

  if (ci.clusterSize >= 32)
    accumulate(rowBroadcast(amdgpu::DPPPerm::row_bcast_15, 10));

  if (ci.clusterSize == 64)
    accumulate(rowBroadcast(amdgpu::DPPPerm::row_bcast_31, 12));

  // Read the accumulated value back from lane 63 (see NOTE(review) above).
  auto int32Type = IntegerType::get(b.getContext(), 32);
  Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
  result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
  assert(result.getType() == input.getType());
  return result;
}
124+
125+
struct ScalarSubgroupReduceToShuffles final
126+
: OpRewritePattern<gpu::SubgroupReduceOp> {
127+
ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize,
128+
bool matchClustered, PatternBenefit benefit)
129+
: OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
130+
matchClustered(matchClustered) {}
131+
132+
LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
133+
PatternRewriter &rewriter) const override {
134+
if (op.getClusterSize().has_value() != matchClustered) {
135+
return rewriter.notifyMatchFailure(
136+
op, llvm::formatv("op is {0}clustered but pattern is configured to "
137+
"only match {1}clustered ops",
138+
matchClustered ? "non-" : "",
139+
matchClustered ? "" : "non-"));
140+
}
141+
142+
auto ci = getAndValidateClusterInfo(op, subgroupSize);
143+
if (failed(ci))
144+
return failure();
145+
146+
Location loc = op.getLoc();
147+
rewriter.replaceOp(op, createSubgroupDPPReduction(
148+
rewriter, loc, op.getValue(), op.getOp(), *ci));
149+
return success();
150+
}
151+
152+
private:
153+
unsigned subgroupSize = 0;
154+
bool matchClustered = false;
155+
};
156+
157+
struct ConvertGPUToAMDGPUPass
158+
: public impl::ConvertGPUToAMDGPUPassBase<ConvertGPUToAMDGPUPass> {
159+
using Base::Base;
160+
161+
void runOnOperation() override {
162+
RewritePatternSet patterns(&getContext());
163+
int subgroupSizeInt = static_cast<int>(subgroupSize);
164+
populateAMDGPUOptimizedSubgroupReducePatterns(patterns, subgroupSizeInt,
165+
PatternBenefit(1));
166+
walkAndApplyPatterns(getOperation(), std::move(patterns));
167+
}
168+
};
169+
} // namespace
170+
171+
/// Registers the DPP-based subgroup-reduce lowering in `patterns`.
///
/// Only ops that carry an explicit cluster size are matched (the pattern is
/// added with `matchClustered = true`). `subgroupSize` and `benefit` are
/// forwarded to the pattern unchanged.
void mlir::populateAMDGPUOptimizedSubgroupReducePatterns(
    RewritePatternSet &patterns, unsigned subgroupSize,
    PatternBenefit benefit) {
  MLIRContext *ctx = patterns.getContext();
  patterns.add<ScalarSubgroupReduceToShuffles>(ctx, subgroupSize,
                                               /*matchClustered=*/true,
                                               benefit);
}

mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms
1515
MLIRMathToLLVM
1616
MLIRMathToROCDL
1717
MLIRAMDGPUToROCDL
18+
MLIRGPUToAMDGPU
1819
MLIRFuncToLLVM
1920
MLIRGPUDialect
2021
MLIRGPUToGPURuntimeTransforms

0 commit comments

Comments
 (0)