Skip to content

Commit 07ec6ca

Browse files
authored
Implement thread coarsening for GPU kernels (#352)
* Add parallel loop unroll pass * Canonicalizer for flattening alternatives op * Add env variables for user-specified coarsening factors * A zero-block gpu kernel launch is an error so avoid that * Max shared memory depends on the target gpu * Canonicalize AlternativesOp with a single region * Output kernel information
1 parent e288f7b commit 07ec6ca

File tree

14 files changed

+1916
-250
lines changed

14 files changed

+1916
-250
lines changed

include/polygeist/Passes/Passes.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,15 @@ std::unique_ptr<Pass> createParallelLowerPass(
3737
std::unique_ptr<Pass> createConvertCudaRTtoCPUPass();
3838
std::unique_ptr<Pass> createConvertCudaRTtoGPUPass();
3939
std::unique_ptr<Pass> createConvertCudaRTtoHipRTPass();
40+
std::unique_ptr<Pass> createSCFParallelLoopUnrollPass(int unrollFactor = 2);
4041
std::unique_ptr<Pass>
4142
createConvertPolygeistToLLVMPass(const LowerToLLVMOptions &options,
4243
bool useCStyleMemRef, bool onlyGpuModules,
4344
std::string gpuTarget);
4445
std::unique_ptr<Pass> createConvertPolygeistToLLVMPass();
4546
std::unique_ptr<Pass> createForBreakToWhilePass();
4647
std::unique_ptr<Pass>
47-
createConvertParallelToGPUPass1(bool useOriginalThreadNums = false);
48+
createConvertParallelToGPUPass1(std::string arch = "sm_60");
4849
std::unique_ptr<Pass>
4950
createConvertParallelToGPUPass2(bool emitGPUKernelLaunchBounds = true);
5051
std::unique_ptr<Pass> createMergeGPUModulesPass();

include/polygeist/Passes/Passes.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,16 @@ def Mem2Reg : Pass<"mem2reg"> {
1313
let constructor = "mlir::polygeist::createMem2RegPass()";
1414
}
1515

16+
// Pass registered as `scf-parallel-loop-unroll`; it exposes a single integer
// option, `unrollFactor`, whose default is 2.
def SCFParallelLoopUnroll : Pass<"scf-parallel-loop-unroll"> {
  let summary = "Unroll and interleave scf parallel loops";
  let constructor = "mlir::polygeist::createSCFParallelLoopUnrollPass()";
  let dependentDialects = ["::mlir::scf::SCFDialect"];
  let options = [
    Option<"unrollFactor", "unrollFactor", "int",
           /*default=*/"2", "Unroll factor">
  ];
}
25+
1626
def ConvertCudaRTtoCPU : Pass<"convert-cudart-to-cpu", "mlir::ModuleOp"> {
1727
let summary = "Lower cudart functions to cpu versions";
1828
let dependentDialects =
@@ -60,6 +70,9 @@ def ConvertParallelToGPU1 : Pass<"convert-parallel-to-gpu1"> {
6070
let summary = "Convert parallel loops to gpu";
6171
let constructor = "mlir::polygeist::createConvertParallelToGPUPass1()";
6272
let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "memref::MemRefDialect"];
73+
let options = [
74+
Option<"arch", "arch", "std::string", /*default=*/"\"sm_60\"", "Target GPU architecture">
75+
];
6376
}
6477

6578
def ConvertParallelToGPU2 : Pass<"convert-parallel-to-gpu2"> {

include/polygeist/PolygeistOps.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ def AlternativesOp : Polygeist_Op<"alternatives", [
106106
let regions = (region VariadicRegion<SizedRegion<1>>:$regions);
107107
let skipDefaultBuilders = 1;
108108
let builders = [OpBuilder<(ins "int":$regionNum)>];
109+
let hasCanonicalizer = 1;
109110
}
110111

111112
def GPUWrapperOp : Polygeist_Op<"gpu_wrapper", [

lib/polygeist/Ops.cpp

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,96 @@ void AlternativesOp::build(OpBuilder &builder, OperationState &result,
9393
}
9494
}
9595

96+
/// Canonicalization pattern for an `AlternativesOp` that has exactly one
/// region: the contents of that region (minus its terminator) are moved in
/// front of the op and the op itself is erased.
class HoistSingleAlternative final : public OpRewritePattern<AlternativesOp> {
public:
  using OpRewritePattern<AlternativesOp>::OpRewritePattern;

  /// Fails when the op has more than one region; otherwise inlines the single
  /// region in place of the op and succeeds.
  LogicalResult matchAndRewrite(AlternativesOp aop,
                                PatternRewriter &rewriter) const override {
    const unsigned numAlternatives = aop->getNumRegions();
    assert(numAlternatives > 0);
    if (numAlternatives > 1)
      return failure();

    // The terminator must go first so that only the payload ops are moved
    // into the enclosing block.
    auto *onlyBlock = &aop->getRegion(0).front();
    rewriter.eraseOp(onlyBlock->getTerminator());
    rewriter.mergeBlockBefore(onlyBlock, aop);
    rewriter.eraseOp(aop);
    return success();
  }
};
113+
114+
/// Canonicalization pattern that flattens an `AlternativesOp` sitting directly
/// inside a region of another `AlternativesOp`.
///
/// Given an outer op with N regions, one of which (call it R) directly contains
/// an inner op with M regions, the outer op is replaced by a new op with
/// M + N - 1 regions:
///   - regions [0, M): a copy of R in which the inner op is substituted by the
///     body of the corresponding inner alternative;
///   - regions [M, M + N - 1): copies of the outer regions other than R, in
///     their original order.
/// Only the first inner op found is handled per application of the pattern.
class FlattenAlternatives final : public OpRewritePattern<AlternativesOp> {
public:
  using OpRewritePattern<AlternativesOp>::OpRewritePattern;

  /// Fails if `aop` is itself nested (at any depth) in another
  /// `AlternativesOp`, or if none of its regions directly contains one.
  /// Otherwise builds the flattened op, erases `aop`, and succeeds.
  LogicalResult matchAndRewrite(AlternativesOp aop,
                                PatternRewriter &rewriter) const override {
    // Ignore nested alternatives ops: the rewrite is driven from the
    // outermost one only.
    if (aop->getParentOfType<AlternativesOp>())
      return failure();

    AlternativesOp innerAop = findDirectlyNestedAlternatives(aop);
    if (!innerAop)
      return failure();

    // TODO use block insertion etc for better performance
    auto newAop = rewriter.create<polygeist::AlternativesOp>(
        aop->getLoc(), innerAop->getNumRegions() + aop->getNumRegions() - 1);
    newAop->setAttrs(aop->getAttrs());

    // One new region per inner alternative: clone the block that holds the
    // inner op, expanding the inner op into the chosen alternative's body.
    // Yield ops are never cloned.
    // NOTE(review): presumably the builder already gives every new region a
    // block with its own terminator - confirm against AlternativesOp::build.
    unsigned curRegion = 0;
    for (; curRegion < innerAop->getNumRegions(); curRegion++) {
      BlockAndValueMapping mapping;
      auto block = &*newAop->getRegion(curRegion).begin();
      rewriter.setInsertionPointToStart(block);
      for (auto &op : *innerAop->getBlock()) {
        if (&op == innerAop.getOperation()) {
          for (auto &innerOp : innerAop->getRegion(curRegion).getOps())
            if (!isa<PolygeistYieldOp>(&innerOp))
              rewriter.clone(innerOp, mapping);
        } else if (!isa<PolygeistYieldOp>(&op)) {
          rewriter.clone(op, mapping);
        }
      }
    }

    // Remaining new regions: plain copies of the outer regions that do not
    // contain the inner op.
    unsigned oldRegion = 0;
    for (; oldRegion < aop->getNumRegions(); oldRegion++) {
      auto &srcRegion = aop->getRegion(oldRegion);
      if (innerAop->getBlock()->getParent() == &srcRegion)
        continue;
      auto block = &*newAop->getRegion(curRegion).begin();
      rewriter.setInsertionPointToStart(block);
      BlockAndValueMapping mapping;
      for (auto &op : srcRegion.getOps())
        if (!isa<PolygeistYieldOp>(&op))
          rewriter.clone(op, mapping);
      curRegion++;
    }

    rewriter.eraseOp(aop);

    return success();
  }

private:
  /// Returns the first `AlternativesOp` found among the ops located
  /// immediately inside the regions of `outer` (regions are scanned in order),
  /// or a null op when there is none.
  static AlternativesOp findDirectlyNestedAlternatives(AlternativesOp outer) {
    for (auto &region : outer->getRegions())
      for (auto &op : region.getOps())
        if (auto nested = dyn_cast<AlternativesOp>(&op))
          return nested;
    return nullptr;
  }
};
181+
/// Adds the canonicalization patterns of `AlternativesOp` to `results`:
/// `HoistSingleAlternative` first, then `FlattenAlternatives`.
void AlternativesOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                 MLIRContext *context) {
  results.insert<HoistSingleAlternative>(context);
  results.insert<FlattenAlternatives>(context);
}
185+
96186
//===----------------------------------------------------------------------===//
97187
// GPUBlockOp
98188
//===----------------------------------------------------------------------===//

lib/polygeist/Passes/CMakeLists.txt

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ add_mlir_dialect_library(MLIRPolygeistTransforms
1717
ConvertParallelToGPU.cpp
1818
SerializeToCubin.cpp
1919
SerializeToHsaco.cpp
20+
ParallelLoopUnroll.cpp
2021

2122
ADDITIONAL_HEADER_DIRS
2223
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine
@@ -88,9 +89,31 @@ if(POLYGEIST_ENABLE_CUDA)
8889

8990
endif()
9091
if(POLYGEIST_ENABLE_ROCM)
91-
# Enable gpu-to-hsaco pass.
9292
target_compile_definitions(obj.MLIRPolygeistTransforms
9393
PRIVATE
9494
POLYGEIST_ENABLE_ROCM=1
95+
__HIP_PLATFORM_AMD__
9596
)
97+
98+
if (NOT DEFINED ROCM_PATH)
99+
if (NOT DEFINED ENV{ROCM_PATH})
100+
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
101+
else()
102+
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
103+
endif()
104+
endif()
105+
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} "${ROCM_PATH}/hip")
106+
find_package(hip REQUIRED)
107+
108+
# there should be some variable for this
109+
target_include_directories(obj.MLIRPolygeistTransforms
110+
PRIVATE
111+
${ROCM_PATH}/include
112+
)
113+
114+
target_link_libraries(MLIRPolygeistTransforms
115+
PRIVATE
116+
hip::host
117+
)
118+
96119
endif()

0 commit comments

Comments
 (0)