Skip to content

Commit e5d8b2b

Browse files
authored
Enable compilation to CUDA (#273)
* Remove device functions from host module * Integrate gpu to nvvm lowering in PolygeistToLLVM * Convert shared mem stack alloca to global * Add missing files * Update readme * Add optimizations to device code * Fix shared mem memref alloca to global * Fix convertpolygeisttollvm gpu module pass api * Correct struct pointer offset calc * Custom GPUFuncOp lowering * Cuda global symbol handling * Ignore cudaFuncSetCacheConfig * Pass DL to kernel outlining pass * get gpu module triple and dl * Do not remove device only declarations * Custom lowering of gpu launch op * gpu launch func lowering * Pass gpu arch to gpu module serialize pass * Clean up gpu launch op lowering * Unload gpu module * Add a custom serialize to cubin pass in polygeist * clang format * Add missing serialize pass file * cmake fix * serialize to cubin pass fix * Update readme * Get libdevice and ptxas paths from clang * WIP Parallel to GPU launch * Move cudart lower * Keep cuda block/grid parallel ops separate * Do not lower cuda rt when we will be emitting cuda (TEMP) * Support blocks with no terminators in canonicalize for * Add terminator for parallel wrapper op * Fix globalctor globaldtor bug * Handle cases where single iteration parallels get optimized away * Get cuda paths from clang toolchain cuda detector * Fix handling of block/grid dim ops when raising parallel to GPU * Convert llvm intrinsic func to cuda libdevice * Better parallel->gpu * Add case for completely optimized away parallel ops * Handle ops other than a parallel in parallel wrapper op * Fix HandleWrapperRootOps pattern * Rename parallel wrapper to gpu wrapper * Support for write ops in grid parallel * remove module dumps in driver.cc * Sink constants in the gpu launch op before outlining it * Output gpu compilation intermediates * Do not unroll loops by default * dump module macro tweak * Add operands for block size to gpu wrapper * Take original block size into account when converting parallel to gpu * Add gpu temp memory 
cache for splitting wrappers and interchanging if/wrapper * Add C style lowering for gpu.alloc * add todo test * Add option to configure whether to use original gpu block sizes * Add kernels with different block sizes for out of resources failover * alternative strategy for failing execution with out of resources * Remove clang-tidy file * Add polygeist specific cuda runtime wrappers * Fix wrong logic in parallel to gpu when original thread num was bigger than default * Use llvm versions of *powerof2 functions
1 parent f7b222b commit e5d8b2b

25 files changed

+3474
-67
lines changed

.clang-tidy

Lines changed: 0 additions & 19 deletions
This file was deleted.

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.10)
22

33
include(CheckCXXSourceCompiles)
44

5+
set(POLYGEIST_ENABLE_CUDA 0 CACHE BOOL "Enable CUDA compilation support")
6+
57
if(POLICY CMP0068)
68
cmake_policy(SET CMP0068 NEW)
79
set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ ninja
3030
ninja check-mlir
3131
```
3232

33+
To enable compilation to CUDA, add `-DMLIR_ENABLE_CUDA_RUNNER=1` and remove `-DLLVM_TARGETS_TO_BUILD="host"` from the CMake arguments. (You may need to specify `CUDACXX`, `CUDA_PATH`, or `CMAKE_CUDA_COMPILER`)
34+
3335
2. Build Polygeist:
3436
```sh
3537
mkdir build
@@ -44,6 +46,8 @@ ninja
4446
ninja check-polygeist-opt && ninja check-cgeist
4547
```
4648

49+
To enable compilation to CUDA, add `-DPOLYGEIST_ENABLE_CUDA=1`
50+
4751
#### Option 2: Using unified LLVM, MLIR, Clang, and Polygeist build
4852

4953
Polygeist can also be built as an external LLVM project using [LLVM_EXTERNAL_PROJECTS](https://llvm.org/docs/CMake.html#llvm-related-variables).

include/polygeist/Ops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define POLYGEISTOPS_H
1111

1212
#include "mlir/Dialect/Affine/IR/AffineOps.h"
13+
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1314
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
1415
#include "mlir/Dialect/SCF/IR/SCF.h"
1516
#include "mlir/IR/BuiltinTypes.h"

include/polygeist/Passes/Passes.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,21 @@ std::unique_ptr<Pass> createCPUifyPass(StringRef method = "");
2222
std::unique_ptr<Pass> createBarrierRemovalContinuation();
2323
std::unique_ptr<Pass> detectReductionPass();
2424
std::unique_ptr<Pass> createRemoveTrivialUsePass();
25-
std::unique_ptr<Pass> createParallelLowerPass();
25+
std::unique_ptr<Pass> createParallelLowerPass(bool wrapParallelOps = false);
26+
std::unique_ptr<Pass> createCudaRTLowerPass();
2627
std::unique_ptr<Pass>
2728
createConvertPolygeistToLLVMPass(const LowerToLLVMOptions &options,
28-
bool useCStyleMemRef);
29+
bool useCStyleMemRef, bool onlyGpuModules);
2930
std::unique_ptr<Pass> createConvertPolygeistToLLVMPass();
3031
std::unique_ptr<Pass> createForBreakToWhilePass();
32+
std::unique_ptr<Pass>
33+
createConvertParallelToGPUPass1(bool useOriginalThreadNums = false);
34+
std::unique_ptr<Pass> createConvertParallelToGPUPass2();
35+
std::unique_ptr<Pass> createGpuSerializeToCubinPass(
36+
StringRef triple, StringRef arch, StringRef features, int llvmOptLevel,
37+
int ptxasOptLevel, std::string ptxasPath, std::string libDevicePath,
38+
bool outputIntermediate);
39+
void registerGpuSerializeToCubinPass();
3140

3241
void populateForBreakToWhilePatterns(RewritePatternSet &patterns);
3342
} // namespace polygeist

include/polygeist/Passes/Passes.td

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,15 @@ def Mem2Reg : Pass<"mem2reg"> {
1313
let constructor = "mlir::polygeist::createMem2RegPass()";
1414
}
1515

16+
def CudaRTLower : Pass<"cudart-lower", "mlir::ModuleOp"> {
17+
let summary = "Lower cudart functions to cpu versions";
18+
let dependentDialects =
19+
["memref::MemRefDialect", "func::FuncDialect", "LLVM::LLVMDialect"];
20+
let constructor = "mlir::polygeist::createCudaRTLowerPass()";
21+
}
22+
1623
def ParallelLower : Pass<"parallel-lower", "mlir::ModuleOp"> {
17-
let summary = "Replace scf.if and similar with affine.if";
24+
let summary = "Lower gpu launch op to parallel ops";
1825
let dependentDialects =
1926
["memref::MemRefDialect", "func::FuncDialect", "LLVM::LLVMDialect"];
2027
let constructor = "mlir::polygeist::createParallelLowerPass()";
@@ -35,6 +42,18 @@ def SCFCPUify : Pass<"cpuify"> {
3542
];
3643
}
3744

45+
def ConvertParallelToGPU1 : Pass<"convert-parallel-to-gpu1"> {
46+
let summary = "Convert parallel loops to gpu";
47+
let constructor = "mlir::polygeist::createConvertParallelToGPUPass1()";
48+
let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "memref::MemRefDialect"];
49+
}
50+
51+
def ConvertParallelToGPU2 : Pass<"convert-parallel-to-gpu2"> {
52+
let summary = "Convert parallel loops to gpu";
53+
let constructor = "mlir::polygeist::createConvertParallelToGPUPass2()";
54+
let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "memref::MemRefDialect"];
55+
}
56+
3857
def InnerSerialization : Pass<"inner-serialize"> {
3958
let summary = "remove scf.barrier";
4059
let constructor = "mlir::polygeist::createInnerSerializationPass()";

include/polygeist/PolygeistOps.td

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
include "Dialect.td"
1313
include "mlir/Interfaces/SideEffectInterfaces.td"
1414
include "mlir/Interfaces/ViewLikeInterface.td"
15+
include "mlir/Interfaces/ControlFlowInterfaces.td"
1516
include "mlir/IR/SymbolInterfaces.td"
1617

1718
include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
@@ -66,6 +67,36 @@ def SubIndexOp : Polygeist_Op<"subindex", [
6667
}];
6768
}
6869

70+
def GPUWrapperOp : Polygeist_Op<"gpu_wrapper", [
71+
RecursiveMemoryEffects,
72+
SingleBlockImplicitTerminator<"polygeist::PolygeistYieldOp">]>,
73+
Arguments<(ins Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ)> {
74+
let summary = "Wraps a parallel op to prevent code motion and transformations";
75+
let results = (outs Index : $result);
76+
let regions = (region SizedRegion<1>:$region);
77+
let skipDefaultBuilders = 1;
78+
let builders = [OpBuilder<(ins
79+
"Value":$blockSizeX, "Value":$blockSizeY, "Value":$blockSizeZ)>];
80+
81+
}
82+
83+
def GPUErrorOp : Polygeist_Op<"gpu_error", [
84+
RecursiveMemoryEffects,
85+
SingleBlockImplicitTerminator<"polygeist::PolygeistYieldOp">]>,
86+
Arguments<(ins)> {
87+
let summary = "Gets the error returned by the gpu operation inside";
88+
// TODO should be i32, not index
89+
let results = (outs Index : $result);
90+
let regions = (region SizedRegion<1>:$region);
91+
let skipDefaultBuilders = 1;
92+
let builders = [OpBuilder<(ins)>];
93+
94+
}
95+
96+
def PolygeistYieldOp : Polygeist_Op<"polygeist_yield", [Pure, ReturnLike, Terminator,
97+
ParentOneOf<["GPUWrapperOp", "GPUErrorOp"]>]> {
98+
let summary = "Polygeist ops terminator";
99+
}
69100

70101
def StreamToTokenOp : Polygeist_Op<"stream2token", [
71102
Pure

lib/polygeist/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
add_subdirectory(ExecutionEngine)
2+
13
add_mlir_dialect_library(MLIRPolygeist
24
Dialect.cpp
35
Ops.cpp
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
# Build the Polygeist CUDA runtime wrapper library, but only when the user
# opted in via POLYGEIST_ENABLE_CUDA (declared in the top-level CMakeLists).
if(POLYGEIST_ENABLE_CUDA)
  find_package(CUDA)
  enable_language(CUDA)

  # Built SHARED and kept out of libMLIR: this is a standalone runtime
  # library, not part of the compiler proper.
  add_mlir_library(polygeist_cuda_runtime
    SHARED
    CudaRuntimeWrappers.cpp

    EXCLUDE_FROM_LIBMLIR
  )

  # The wrappers call the CUDA *driver* API (cu* functions), so link
  # against libcuda rather than the cudart runtime library.
  find_library(CUDA_RUNTIME_LIBRARY cuda)

  set_property(TARGET polygeist_cuda_runtime PROPERTY CXX_STANDARD 14)
  target_include_directories(polygeist_cuda_runtime
    PRIVATE
    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
  )
  target_link_libraries(polygeist_cuda_runtime
    PRIVATE
    ${CUDA_RUNTIME_LIBRARY}
  )

endif()
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
//===- PolygeistCudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ---===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Implements C wrappers around the CUDA library for easy linking in ORC jit.
10+
// Also adds some debugging helpers that are helpful when writing MLIR code to
11+
// run on GPUs.
12+
//
13+
//===----------------------------------------------------------------------===//
14+
15+
#include "mlir/ExecutionEngine/CRunnerUtils.h"
16+
17+
#include <stdio.h>
18+
19+
#include "cuda.h"
20+
21+
#ifdef _WIN32
22+
#define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
23+
#else
24+
#define MLIR_CUDA_WRAPPERS_EXPORT
25+
#endif // _WIN32
26+
27+
// Evaluates `expr` (a CUDA driver API call returning CUresult), prints a
// diagnostic to stderr on failure, and yields the CUresult unchanged so
// callers may still inspect/propagate the error code. Implemented as an
// immediately-invoked lambda so the macro remains a single expression.
#define CUDA_REPORT_IF_ERROR(expr)                                             \
  [](CUresult result) {                                                        \
    if (!result)                                                               \
      return result;                                                           \
    const char *name = nullptr;                                                \
    cuGetErrorName(result, &name);                                             \
    if (!name)                                                                 \
      name = "<unknown>";                                                      \
    fprintf(stderr, "'%s' failed with '%s'\n", #expr, name);                   \
    return result;                                                             \
  }(expr)
38+
39+
// Device ordinal whose primary context ScopedContext makes current.
// Thread-local so, in principle, different host threads could target
// different devices; nothing in this file changes it from 0.
thread_local static int32_t defaultDevice = 0;

// Make the primary context of the current default device current for the
// duration of the instance and restore the previous context on destruction.
class ScopedContext {
public:
  ScopedContext() {
    // Static reference to CUDA primary context for device ordinal
    // defaultDevice. Initialized once (thread-safe function-local static):
    // the first ScopedContext constructed pays for cuInit + device lookup +
    // primary-context retain; later instances only push/pop.
    static CUcontext context = [] {
      CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
      CUdevice device;
      CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
      CUcontext ctx;
      // Note: this does not affect the current context.
      CUDA_REPORT_IF_ERROR(cuDevicePrimaryCtxRetain(&ctx, device));
      return ctx;
    }();

    CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
  }

  // Pop restores whatever context was current before construction.
  ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
};
64+
65+
// C-callable kernel-launch wrapper: launches `function` on `stream` with the
// given grid/block dimensions and `smem` bytes of dynamic shared memory,
// reports any driver error to stderr, and returns the raw CUresult (as
// int32_t) to the caller instead of discarding it — presumably so generated
// code can react to launch failures such as out-of-resources (the commit
// notes mention a block-size failover strategy; confirm against the lowering
// that emits calls to this symbol).
extern "C" MLIR_CUDA_WRAPPERS_EXPORT int32_t mgpuLaunchKernelErr(
    CUfunction function, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
    CUstream stream, void **params, void **extra) {
  // Ensure the primary context is current for the duration of the launch.
  ScopedContext scopedContext;
  return CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ,
                                             blockX, blockY, blockZ, smem,
                                             stream, params, extra));
}

0 commit comments

Comments
 (0)