Skip to content

Commit 48e6f1d

Browse files
Implemented GPU OpenCL runtime
1 parent 8635ad2 commit 48e6f1d

File tree

20 files changed

+1366
-11
lines changed

20 files changed

+1366
-11
lines changed

.github/workflows/clang-tidy.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414

1515
steps:
1616
- name: Install OpenMP
17-
run: "sudo apt install -y libomp-dev"
17+
run: "sudo apt install -y libomp-dev opencl-c-headers"
1818

1919
- name: Fetch sources
2020
uses: actions/checkout@v4

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,12 +98,15 @@ get_property(GC_TOOLS GLOBAL PROPERTY GC_TOOLS)
9898
get_property(GC_MLIR_LIBS GLOBAL PROPERTY GC_MLIR_LIBS)
9999
get_property(GC_PASS_LIBS GLOBAL PROPERTY GC_PASS_LIBS)
100100
get_property(GC_DIALECT_LIBS GLOBAL PROPERTY GC_DIALECT_LIBS)
101+
get_property(IMEX_LIBS GLOBAL PROPERTY IMEX_LIBS)
102+
101103
install(TARGETS
102104
GcInterface
103105
${GC_TOOLS}
104106
${GC_MLIR_LIBS}
105107
${GC_PASS_LIBS}
106108
${GC_DIALECT_LIBS}
109+
${IMEX_LIBS}
107110
EXPORT ${PROJECT_NAME}Targets
108111
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
109112
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}

cmake/imex.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,5 @@ if (NOT DEFINED IMEX_INCLUDES)
2424
${imex_SOURCE_DIR}/src
2525
)
2626
set_property(GLOBAL PROPERTY IMEX_INCLUDES ${IMEX_INCLUDES})
27+
target_compile_options(GcInterface INTERFACE -DGC_USE_IMEX)
2728
endif ()

include/gc/Error.h

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
//===-- Error.h - Error processing functions --------------------*- C++ -*-===//
2+
//
3+
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef GC_ERROR_H
10+
#define GC_ERROR_H
11+
12+
#include <sstream>
13+
14+
#include "gc/Log.h"
15+
16+
#include "llvm/Support/Error.h"
17+
18+
namespace mlir::gc::err {
19+
// Source-location plumbing for the error helpers below.
//
//  GC_ERR_LOC_DECL - leading parameters of a helper's declaration,
//  GC_ERR_LOC_ARGS - forwards those parameters to another helper,
//  GC_ERR_LOC      - the actual location at the point of macro expansion.
//
// Each non-empty expansion ends with a trailing comma so that it can be placed
// directly in front of the remaining parameters/arguments.
//
// NOTE(review): the guard tests `_NDEBUG`, not the standard `NDEBUG`. Unless
// the build defines `_NDEBUG` explicitly, the location-carrying branch is
// always selected. Confirm whether that is intentional (and what gc/Log.h
// expects) before changing it.
#ifdef _NDEBUG
#define GC_ERR_LOC_DECL
#define GC_ERR_LOC_ARGS
#define GC_ERR_LOC
#else
#define GC_ERR_LOC_DECL const char *fileName, int lineNum,
#define GC_ERR_LOC_ARGS fileName, lineNum,
#define GC_ERR_LOC __FILE__, __LINE__,
#endif

// Builds an llvm::Error from the given message parts (see makeLlvmError).
#define gcMakeErr(...) mlir::gc::err::makeLlvmError(GC_ERR_LOC __VA_ARGS__)
// Builds an error from the given message parts and reports it as fatal; does
// not return. The error is a prvalue, so it is passed on without std::move.
#define gcReportErr(...)                                                       \
  mlir::gc::err::report(GC_ERR_LOC gcMakeErr(__VA_ARGS__))
// Unwraps an llvm::Expected<T>, reporting a fatal error if it holds a failure.
#define gcGetOrReport(expected) mlir::gc::err::getOrReport(GC_ERR_LOC expected)
33+
34+
template <typename... Args>
35+
[[nodiscard]] llvm::Error makeLlvmError(GC_ERR_LOC_DECL Args... args) {
36+
log::insetLog(GC_ERR_LOC_ARGS std::cerr, "ERROR", args...);
37+
std::ostringstream oss;
38+
log::insertArgs(oss, args...);
39+
auto msg = oss.str();
40+
return llvm::make_error<llvm::StringError>(msg.substr(0, msg.length() - 1),
41+
llvm::inconvertibleErrorCode());
42+
}
43+
44+
[[noreturn]] static void report(GC_ERR_LOC_DECL llvm::Error err) {
45+
log::insetLog(GC_ERR_LOC_ARGS std::cerr, "ERROR",
46+
"Unrecoverable error! Aborting...");
47+
report_fatal_error(std::move(err));
48+
}
49+
50+
template <typename T>
51+
T getOrReport(GC_ERR_LOC_DECL llvm::Expected<T> expected) {
52+
if (expected) {
53+
return *expected;
54+
}
55+
report(GC_ERR_LOC_ARGS std::move(expected.takeError()));
56+
}
57+
} // namespace mlir::gc::err
58+
59+
#endif

include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h

Lines changed: 286 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,294 @@ constexpr char GPU_OCL_MOD_DESTRUCTOR[] = "gcGpuOclModuleDestructor";
2020
} // namespace mlir::gc::gpu
2121

2222
#ifndef GC_GPU_OCL_CONST_ONLY
23+
#include <cstdarg>
24+
#include <unordered_set>
25+
#include <vector>
2326

24-
// TBD
27+
#include <CL/cl.h>
2528

29+
#include <llvm/ADT/SmallString.h>
30+
31+
#include "mlir/ExecutionEngine/ExecutionEngine.h"
32+
#include "mlir/IR/BuiltinOps.h"
33+
34+
namespace mlir::gc::gpu {
35+
struct OclContext;
36+
struct OclModule;
37+
struct OclModuleBuilder;
38+
39+
struct OclRuntime {
40+
cl_context context;
41+
cl_device_id device;
42+
43+
// Returns the available Intel GPU device ids.
44+
[[nodiscard]] static llvm::Expected<SmallVector<cl_device_id, 2>>
45+
gcIntelDevices(size_t max = std::numeric_limits<size_t>::max());
46+
47+
[[nodiscard]] static llvm::Expected<OclRuntime> get();
48+
49+
[[nodiscard]] static llvm::Expected<OclRuntime> get(cl_device_id device);
50+
51+
[[nodiscard]] static llvm::Expected<OclRuntime> get(cl_command_queue queue);
52+
53+
[[nodiscard]] static llvm::Expected<OclRuntime> get(cl_context context,
54+
cl_device_id device);
55+
56+
static bool isOutOfOrder(cl_command_queue queue);
57+
58+
[[nodiscard]] llvm::Expected<cl_command_queue>
59+
createQueue(bool outOfOrder = false) const;
60+
61+
[[nodiscard]] llvm::Expected<bool> releaseQueue(cl_command_queue queue) const;
62+
63+
[[nodiscard]] llvm::Expected<void *> usmAllocDev(size_t size) const;
64+
65+
[[nodiscard]] llvm::Expected<void *> usmAllocShared(size_t size) const;
66+
67+
[[nodiscard]] llvm::Expected<bool> usmFree(const void *ptr) const;
68+
69+
[[nodiscard]] llvm::Expected<bool> usmCpy(OclContext *ctx, const void *src,
70+
void *dst, size_t size) const;
71+
72+
[[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const void *src,
73+
void *dst, size_t size) const {
74+
return usmCpy(&ctx, src, dst, size);
75+
}
76+
77+
template <typename T>
78+
[[nodiscard]] llvm::Expected<T *> usmNewDev(size_t size) const {
79+
auto expected = usmAllocDev(size * sizeof(T));
80+
if (expected) {
81+
return static_cast<T *>(*expected);
82+
}
83+
return expected.takeError();
84+
}
85+
86+
template <typename T>
87+
[[nodiscard]] llvm::Expected<T *> usmNewShared(size_t size) const {
88+
auto expected = usmAllocShared(size * sizeof(T));
89+
if (expected) {
90+
return static_cast<T *>(*expected);
91+
}
92+
return expected.takeError();
93+
}
94+
95+
template <typename T>
96+
[[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const T *src,
97+
T *dst, size_t size) const {
98+
return usmCpy(ctx, static_cast<const void *>(src), static_cast<void *>(dst),
99+
size * sizeof(T));
100+
}
101+
102+
// Use with caution! This is safe to check validity of USM, but may be false
103+
// positive for any other kinds.
104+
bool isUsm(const void *ptr) const;
105+
106+
bool operator==(const OclRuntime &other) const {
107+
return context == other.context && device == other.device;
108+
}
109+
110+
private:
111+
struct Ext;
112+
struct Exports;
113+
friend OclContext;
114+
friend OclModuleBuilder;
115+
explicit OclRuntime(cl_context context, cl_device_id device, const Ext *ext)
116+
: context(context), device(device), ext(ext) {}
117+
const Ext *ext;
118+
};
119+
} // namespace mlir::gc::gpu
120+
template <> struct std::hash<const mlir::gc::gpu::OclRuntime> {
121+
std::size_t
122+
operator()(const mlir::gc::gpu::OclRuntime &runtime) const noexcept {
123+
return std::hash<cl_context>()(runtime.context) ^
124+
std::hash<cl_device_id>()(runtime.device);
125+
}
126+
}; // namespace std
127+
namespace mlir::gc::gpu {
128+
129+
struct OclContext {
130+
cl_command_queue const queue;
131+
// Preserve the execution order. This is required in case of out-of-order
132+
// execution (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE). When the execution
133+
// is completed, the 'lastEvent' field contains the event of the last enqueued
134+
// command. If this field is false, 'waitList' is ignored.
135+
const bool preserveOrder;
136+
cl_event lastEvent;
137+
138+
explicit OclContext(cl_command_queue queue, cl_uint waitListLen = 0,
139+
cl_event *waitList = nullptr)
140+
: OclContext(queue, OclRuntime::isOutOfOrder(queue), waitListLen,
141+
waitList) {}
142+
143+
explicit OclContext(cl_command_queue queue, bool preserveOrder,
144+
cl_uint waitListLen, cl_event *waitList)
145+
: queue(queue), preserveOrder(preserveOrder), lastEvent(nullptr),
146+
waitListLen(preserveOrder ? waitListLen : 0),
147+
waitList(preserveOrder ? waitList : nullptr), runtime(nullptr),
148+
clPtrs(nullptr) {
149+
assert(!OclRuntime::isOutOfOrder(queue) || preserveOrder);
150+
assert(preserveOrder || (waitListLen == 0 && waitList == nullptr));
151+
}
152+
153+
void finish();
154+
155+
private:
156+
friend OclModule;
157+
friend OclRuntime;
158+
friend OclRuntime::Exports;
159+
cl_uint waitListLen;
160+
cl_event *waitList;
161+
const OclRuntime *runtime;
162+
std::unordered_set<void *> *clPtrs;
163+
164+
void setLastEvent(cl_event event) {
165+
lastEvent = event;
166+
if (event) {
167+
waitListLen = 1;
168+
waitList = &lastEvent;
169+
} else {
170+
waitListLen = 0;
171+
waitList = nullptr;
172+
}
173+
}
174+
};
175+
176+
struct OclModule {
177+
static constexpr int64_t ZERO = 0;
178+
static constexpr auto ZERO_PTR = const_cast<int64_t *>(&ZERO);
179+
180+
// The main function arguments in the following format -
181+
// https://mlir.llvm.org/docs/TargetLLVMIR/#c-compatible-wrapper-emission.
182+
// Note: the values are not copied, only the pointers are stored!
183+
template <unsigned N> struct Args {
184+
185+
void add(void **alignedPtr, size_t rank, const int64_t *shape,
186+
const int64_t *strides, bool isUsm = true) {
187+
add(alignedPtr, alignedPtr, ZERO_PTR, rank, shape, strides, isUsm);
188+
}
189+
190+
void add(void **allocatedPtr, void **alignedPtr, const int64_t *offset,
191+
size_t rank, const int64_t *shape, const int64_t *strides,
192+
bool isUsm = true) {
193+
#ifndef NDEBUG
194+
assert(!isUsm || runtime->isUsm(*alignedPtr));
195+
// It's recommended to have at least 16-byte alignment
196+
assert(reinterpret_cast<std::uintptr_t>(*alignedPtr) % 16 == 0);
197+
#endif
198+
199+
args.emplace_back(allocatedPtr);
200+
args.emplace_back(alignedPtr);
201+
args.emplace_back(const_cast<int64_t *>(offset));
202+
for (size_t i = 0; i < rank; i++) {
203+
args.emplace_back(const_cast<int64_t *>(&shape[i]));
204+
}
205+
for (size_t i = 0; i < rank; i++) {
206+
args.emplace_back(const_cast<int64_t *>(&strides[i]));
207+
}
208+
if (!isUsm) {
209+
clPtrs.insert(alignedPtr);
210+
}
211+
}
212+
213+
template <typename T>
214+
void add(T **alignedPtr, size_t rank, const int64_t *shape,
215+
const int64_t *strides, bool isUsm = true) {
216+
add(reinterpret_cast<void **>(alignedPtr), rank, shape, strides, isUsm);
217+
}
218+
219+
template <typename T>
220+
void add(T **allocatedPtr, T **alignedPtr, const int64_t *offset,
221+
size_t rank, const int64_t *shape, const int64_t *strides,
222+
bool isUsm = true) {
223+
add(reinterpret_cast<void **>(allocatedPtr),
224+
reinterpret_cast<void **>(alignedPtr), offset, rank, shape, strides,
225+
isUsm);
226+
}
227+
228+
void clear() {
229+
args.clear();
230+
clPtrs.clear();
231+
}
232+
233+
private:
234+
friend OclModule;
235+
SmallVector<void *, N + 3> args;
236+
// Contains the pointers of all non-USM arguments. It's expected, that the
237+
// arguments are either USM or CL pointers and most probably are USM, thus,
238+
// in most cases, this set will be empty.
239+
std::unordered_set<void *> clPtrs;
240+
#ifdef NDEBUG
241+
explicit Args(){};
242+
#else
243+
const OclRuntime *runtime;
244+
explicit Args(const OclRuntime *runtime) : runtime(runtime) {}
245+
#endif
246+
};
247+
248+
using MainFunc = void (*)(void **);
249+
250+
explicit OclModule(const OclRuntime &runtime,
251+
std::unique_ptr<ExecutionEngine> engine, MainFunc main)
252+
: runtime(runtime), engine(std::move(engine)), main(main) {}
253+
254+
#ifdef NDEBUG
255+
template <unsigned N = 64> Args<N> args() const { return Args<N>(); }
256+
#else
257+
template <unsigned N = 64> Args<N> args() const { return Args<N>(&runtime); }
258+
#endif
259+
260+
template <unsigned N> void exec(OclContext &ctx, Args<N> &args) const {
261+
#ifndef NDEBUG
262+
auto rt = OclRuntime::get(ctx.queue);
263+
assert(rt);
264+
assert(*rt == this->runtime);
265+
#endif
266+
auto size = args.args.size();
267+
auto ctxPtr = &ctx;
268+
ctx.runtime = &runtime;
269+
ctx.clPtrs = &args.clPtrs;
270+
args.args.emplace_back(&ctxPtr);
271+
args.args.emplace_back(&ctxPtr);
272+
args.args.emplace_back(ZERO_PTR);
273+
main(args.args.data());
274+
args.args.truncate(size);
275+
}
276+
277+
~OclModule();
278+
OclModule(const OclModule &) = delete;
279+
OclModule &operator=(const OclModule &) = delete;
280+
OclModule(const OclModule &&) = delete;
281+
OclModule &operator=(const OclModule &&) = delete;
282+
283+
private:
284+
OclRuntime runtime;
285+
std::unique_ptr<ExecutionEngine> engine;
286+
MainFunc main;
287+
};
288+
289+
struct OclModuleBuilder {
290+
friend OclRuntime;
291+
explicit OclModuleBuilder(ModuleOp module);
292+
explicit OclModuleBuilder(OwningOpRef<ModuleOp> &module)
293+
: OclModuleBuilder(module.release()) {}
294+
295+
llvm::Expected<std::shared_ptr<const OclModule>>
296+
build(const OclRuntime &runtime);
297+
298+
llvm::Expected<std::shared_ptr<const OclModule>>
299+
build(cl_command_queue queue);
300+
301+
llvm::Expected<std::shared_ptr<const OclModule>> build(cl_context context,
302+
cl_device_id device);
303+
304+
private:
305+
std::shared_mutex mux;
306+
ModuleOp mlirModule;
307+
SmallString<32> funcName;
308+
std::unordered_map<const OclRuntime, std::shared_ptr<const OclModule>> cache;
309+
};
310+
}; // namespace mlir::gc::gpu
26311
#else
27312
#undef GC_GPU_OCL_CONST_ONLY
28313
#endif

0 commit comments

Comments
 (0)