
Commit b894587

Support for __constant__ and __device__ vars in CUDA (#349)
1 parent 3c48b7e commit b894587
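
What this enables, roughly: CUDA translation units that declare module-scope __constant__ or __device__ variables and reference them from both kernels and host code. A minimal sketch of such an input (variable and kernel names are illustrative, not taken from the commit):

#include <cuda_runtime.h>

// Illustrative only: a constant-memory table and a device-side global that the
// compiler must register with the runtime so host code can address them.
__constant__ float coeffs[4];
__device__ int counter;

__global__ void scale(float *out, const float *in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = in[i] * coeffs[i % 4];
}

int main() {
  float host_coeffs[4] = {1.f, 2.f, 3.f, 4.f};
  // Host code refers to the device symbol by name; this only works if the
  // symbol was registered (which is what __cudaRegisterVar-style calls do).
  cudaMemcpyToSymbol(coeffs, host_coeffs, sizeof(host_coeffs));
  // ... launch scale<<<grid, block>>>(...) as usual ...
  return 0;
}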

File tree

12 files changed: +370 −68 lines


include/polygeist/Passes/Passes.h

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ std::unique_ptr<Pass>
 createConvertParallelToGPUPass1(bool useOriginalThreadNums = false);
 std::unique_ptr<Pass>
 createConvertParallelToGPUPass2(bool emitGPUKernelLaunchBounds = true);
+std::unique_ptr<Pass> createMergeGPUModulesPass();
 std::unique_ptr<Pass> createGpuSerializeToCubinPass(
     StringRef arch, StringRef features, int llvmOptLevel, int ptxasOptLevel,
     std::string ptxasPath, std::string libDevicePath, bool outputIntermediate);

include/polygeist/Passes/Passes.td

Lines changed: 6 additions & 0 deletions
@@ -68,6 +68,12 @@ def ConvertParallelToGPU2 : Pass<"convert-parallel-to-gpu2"> {
   let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "memref::MemRefDialect"];
 }

+def MergeGPUModulesPass : Pass<"merge-gpu-modules", "mlir::ModuleOp"> {
+  let summary = "Merge all gpu modules into one";
+  let constructor = "mlir::polygeist::createMergeGPUModulesPass()";
+  let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "gpu::GPUDialect"];
+}
+
 def InnerSerialization : Pass<"inner-serialize"> {
   let summary = "remove scf.barrier";
   let constructor = "mlir::polygeist::createInnerSerializationPass()";

lib/polygeist/ExecutionEngine/CudaRuntimeWrappers.cpp

Lines changed: 14 additions & 2 deletions
@@ -169,6 +169,10 @@ extern "C" void __cudaRegisterFunction(void **fatCubinHandle, void *hostFun,
                                        int32_t thread_limit, void *tid,
                                        void *bid, void *bDim, void *gDim,
                                        void *wSize);
+extern "C" void __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
+                                  char *deviceAddress, const char *deviceName,
+                                  int ext, size_t size, int constant,
+                                  int global);
 extern "C" void **__cudaRegisterFatBinary(void *fatCubin);
 extern "C" void __cudaRegisterFatBinaryEnd(void **fatCubinHandle);
 extern "C" void __cudaUnregisterFatBinary(void **fatCubinHandle);
@@ -181,17 +185,25 @@ __mgpurtRegisterFunction(void **fatCubinHandle, void *hostFun, void *deviceFun,
                          thread_limit, tid, bid, bDim, gDim, wSize);
 }

+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+__mgpurtRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress,
+                    const char *deviceName, int ext, size_t size, int constant,
+                    int global) {
+  __cudaRegisterVar(fatCubinHandle, hostVar, deviceAddress, deviceName, ext,
+                    size, constant, global);
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void **
 __mgpurtRegisterFatBinary(void *fatCubin) {
   return __cudaRegisterFatBinary(fatCubin);
 }

 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 __mgpurtRegisterFatBinaryEnd(void **fatCubinHandle) {
-  return __cudaRegisterFatBinaryEnd(fatCubinHandle);
+  __cudaRegisterFatBinaryEnd(fatCubinHandle);
 }

 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 __mgpurtUnregisterFatBinary(void **fatCubinHandle) {
-  return __cudaUnregisterFatBinary(fatCubinHandle);
+  __cudaUnregisterFatBinary(fatCubinHandle);
 }
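
The new __mgpurtRegisterVar entry point mirrors __mgpurtRegisterFunction: host-side registration code can call it once per __device__/__constant__ global so the CUDA runtime associates the host shadow symbol with the device symbol in the fatbinary. A hypothetical registration stub for a __constant__ float coeffs[4] follows; the names and argument values are illustrative only, not taken from the commit:

// Hypothetical host-side registration stub; nvcc-produced code emits an
// analogous __cudaRegisterVar call per variable. Names/values are illustrative.
extern "C" void __mgpurtRegisterVar(void **fatCubinHandle, char *hostVar,
                                    char *deviceAddress, const char *deviceName,
                                    int ext, size_t size, int constant,
                                    int global);

static float coeffs_shadow[4]; // host shadow of __constant__ float coeffs[4]

static void registerDeviceGlobals(void **fatCubinHandle) {
  __mgpurtRegisterVar(fatCubinHandle,
                      /*hostVar=*/reinterpret_cast<char *>(coeffs_shadow),
                      /*deviceAddress=*/reinterpret_cast<char *>(coeffs_shadow),
                      /*deviceName=*/"coeffs",
                      /*ext=*/0, /*size=*/sizeof(coeffs_shadow),
                      /*constant=*/1, /*global=*/0);
}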

lib/polygeist/ExecutionEngine/RocmRuntimeWrappers.cpp

Lines changed: 11 additions & 0 deletions
@@ -93,6 +93,10 @@ extern "C" void __hipRegisterFunction(void **fatCubinHandle, void *hostFun,
                                       int32_t thread_limit, void *tid,
                                       void *bid, void *bDim, void *gDim,
                                       void *wSize);
+extern "C" void __hipRegisterVar(void **fatCubinHandle, char *hostVar,
+                                 char *deviceAddress, const char *deviceName,
+                                 int ext, size_t size, int constant,
+                                 int global);
 extern "C" void **__hipRegisterFatBinary(void *fatCubin);
 extern "C" void __hipRegisterFatBinaryEnd(void **fatCubinHandle);
 extern "C" void __hipUnregisterFatBinary(void **fatCubinHandle);
@@ -104,6 +108,13 @@ __mgpurtRegisterFunction(void **fatCubinHandle, void *hostFun, void *deviceFun,
   __hipRegisterFunction(fatCubinHandle, hostFun, deviceFun, deviceName,
                         thread_limit, tid, bid, bDim, gDim, wSize);
 }
+extern "C" MLIR_HIP_WRAPPERS_EXPORT void
+__mgpurtRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress,
+                    const char *deviceName, int ext, size_t size, int constant,
+                    int global) {
+  __hipRegisterVar(fatCubinHandle, hostVar, deviceAddress, deviceName, ext,
+                   size, constant, global);
+}

 extern "C" MLIR_HIP_WRAPPERS_EXPORT void **
 __mgpurtRegisterFatBinary(void *fatCubin) {

lib/polygeist/Passes/ConvertParallelToGPU.cpp

Lines changed: 82 additions & 1 deletion
@@ -950,7 +950,15 @@ struct HandleWrapperRootOps : public OpRewritePattern<polygeist::GPUWrapperOp> {
     bool read = hasEffect<MemoryEffects::Read>(effects);
     bool write = hasEffect<MemoryEffects::Write>(effects);
     SmallVector<Value, 1> cloned;
-    if (effects.empty()) {
+    // Special case for get_global because what it actually refers to is the
+    // device-side global, so this must remain in the gpu wrapper
+    if (isa<memref::GetGlobalOp>(op)) {
+      // This is the same as the case for a parallelizable read op
+      rewriter.setInsertionPoint(newWrapper.getBody()->getTerminator());
+      rewriter.clone(*op, splitMapping);
+      rewriter.setInsertionPoint(firstGridOp);
+      cloned = rewriter.clone(*op, parallelizedMapping)->getResults();
+    } else if (effects.empty()) {
       rewriter.setInsertionPoint(firstGridOp);
       rewriter.clone(*op, parallelizedMapping);
       rewriter.setInsertionPoint(newWrapper.getBody()->getTerminator());
@@ -1560,8 +1568,81 @@ struct ConvertParallelToGPU2Pass
   }
 };

+struct MergeGPUModulesPass
+    : public MergeGPUModulesPassBase<MergeGPUModulesPass> {
+  void runOnOperation() override {
+    auto m = getOperation();
+    Region &moduleRegion = m->getRegion(0);
+    OpBuilder mBuilder(moduleRegion);
+    std::string newModuleName = "__polygeist_gpu_module";
+    auto newGpuModule =
+        mBuilder.create<gpu::GPUModuleOp>(m->getLoc(), newModuleName);
+    OpBuilder gpumBuilder(newGpuModule->getRegion(0));
+    std::vector<gpu::GPUModuleOp> toErase;
+    m->walk([&](gpu::GPUModuleOp gpum) {
+      if (gpum == newGpuModule)
+        return;
+      toErase.push_back(gpum);
+      for (auto &op : *gpum.getBody()) {
+        auto cloneIf = [&](auto op) {
+          if (op) {
+            if (!SymbolTable::lookupSymbolIn(newGpuModule, op.getName())) {
+              gpumBuilder.clone(*op.getOperation());
+            }
+            return true;
+          }
+          return false;
+        };
+
+        if (auto f = dyn_cast<gpu::GPUFuncOp>(&op)) {
+          auto newF = cast<gpu::GPUFuncOp>(gpumBuilder.clone(op));
+          if (SymbolTable::lookupSymbolIn(newGpuModule, f.getName())) {
+            auto newKernelName =
+                std::string(f.getName()) +
+                std::to_string(reinterpret_cast<intptr_t>(f.getOperation()));
+            newF.setName(newKernelName);
+          }
+          auto symbolUses = SymbolTable::getSymbolUses(f.getOperation(), m);
+          assert(symbolUses);
+          for (auto symbolUse : *symbolUses) {
+            if (auto launchOp =
+                    dyn_cast<gpu::LaunchFuncOp>(symbolUse.getUser())) {
+              auto kernelSymbol =
+                  SymbolRefAttr::get(newGpuModule.getNameAttr(),
+                                     {SymbolRefAttr::get(newF.getNameAttr())});
+              launchOp->setAttr(
+                  gpu::LaunchFuncOp::getKernelAttrName(launchOp->getName()),
+                  kernelSymbol);
+            } else {
+              f.emitError("Unexpected user of gpu func op");
+              assert(0);
+            }
+          }
+        } else if (!(cloneIf(dyn_cast<memref::GlobalOp>(&op)) ||
+                     cloneIf(dyn_cast<LLVM::GlobalOp>(&op)) ||
+                     cloneIf(dyn_cast<func::FuncOp>(&op)) ||
+                     cloneIf(dyn_cast<LLVM::LLVMFuncOp>(&op)) ||
+                     isa<gpu::ModuleEndOp>(&op))) {
+          op.emitError("Unexpected global type in gpu module");
+          op.dump();
+          assert(0);
+        }
+      }
+    });
+
+    if (toErase.size() == 0)
+      newGpuModule->erase();
+
+    for (auto gpum : toErase)
+      gpum->erase();
+  }
+};
+
 } // namespace

+std::unique_ptr<Pass> mlir::polygeist::createMergeGPUModulesPass() {
+  return std::make_unique<MergeGPUModulesPass>();
+}
 std::unique_ptr<Pass>
 mlir::polygeist::createConvertParallelToGPUPass1(bool useOriginalThreadNums) {
   return std::make_unique<ConvertParallelToGPU1Pass>(useOriginalThreadNums);
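
The merge pass is exposed through createMergeGPUModulesPass() declared in Passes.h above. A plausible place to run it, assumed here rather than taken from the Polygeist driver, is after kernels have been outlined into their own gpu.module ops and before GPU binary serialization, so all kernels and device globals land in the single __polygeist_gpu_module:

// Assumed pipeline placement; "pm" is an mlir::PassManager over the top-level
// module. Merging first means later serialization compiles only one module.
pm.addPass(mlir::polygeist::createMergeGPUModulesPass());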
