@@ -20,9 +20,294 @@ constexpr char GPU_OCL_MOD_DESTRUCTOR[] = "gcGpuOclModuleDestructor";
 } // namespace mlir::gc::gpu

 #ifndef GC_GPU_OCL_CONST_ONLY
+#include <cassert>
+#include <cstdarg>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <shared_mutex>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>

-// TBD
+#include <CL/cl.h>

+#include <llvm/ADT/SmallString.h>
+
+#include "mlir/ExecutionEngine/ExecutionEngine.h"
+#include "mlir/IR/BuiltinOps.h"
+
+namespace mlir::gc::gpu {
+struct OclContext;
+struct OclModule;
+struct OclModuleBuilder;
+
+struct OclRuntime {
+  cl_context context;
+  cl_device_id device;
+
+  // Returns the available Intel GPU device ids.
+  [[nodiscard]] static llvm::Expected<SmallVector<cl_device_id, 2>>
+  gcIntelDevices(size_t max = std::numeric_limits<size_t>::max());
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get();
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_device_id device);
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_command_queue queue);
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_context context,
+                                                      cl_device_id device);
+
+  static bool isOutOfOrder(cl_command_queue queue);
+
+  [[nodiscard]] llvm::Expected<cl_command_queue>
+  createQueue(bool outOfOrder = false) const;
+
+  [[nodiscard]] llvm::Expected<bool> releaseQueue(cl_command_queue queue) const;
+
+  [[nodiscard]] llvm::Expected<void *> usmAllocDev(size_t size) const;
+
+  [[nodiscard]] llvm::Expected<void *> usmAllocShared(size_t size) const;
+
+  [[nodiscard]] llvm::Expected<bool> usmFree(const void *ptr) const;
+
+  [[nodiscard]] llvm::Expected<bool> usmCpy(OclContext *ctx, const void *src,
+                                            void *dst, size_t size) const;
+
+  [[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const void *src,
+                                            void *dst, size_t size) const {
+    return usmCpy(&ctx, src, dst, size);
+  }
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<T *> usmNewDev(size_t size) const {
+    auto expected = usmAllocDev(size * sizeof(T));
+    if (expected) {
+      return static_cast<T *>(*expected);
+    }
+    return expected.takeError();
+  }
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<T *> usmNewShared(size_t size) const {
+    auto expected = usmAllocShared(size * sizeof(T));
+    if (expected) {
+      return static_cast<T *>(*expected);
+    }
+    return expected.takeError();
+  }
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const T *src,
+                                            T *dst, size_t size) const {
+    return usmCpy(ctx, static_cast<const void *>(src),
+                  static_cast<void *>(dst), size * sizeof(T));
+  }
+  // Use with caution! This check is reliable for USM pointers, but may
+  // yield false positives for any other kind of pointer.
+  bool isUsm(const void *ptr) const;
+
+  bool operator==(const OclRuntime &other) const {
+    return context == other.context && device == other.device;
+  }
+
+private:
+  struct Ext;
+  struct Exports;
+  friend OclContext;
+  friend OclModuleBuilder;
+  explicit OclRuntime(cl_context context, cl_device_id device, const Ext *ext)
+      : context(context), device(device), ext(ext) {}
+  const Ext *ext;
+};
+} // namespace mlir::gc::gpu
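+
+// A minimal usage sketch (illustrative only, not part of this header; the
+// buffer size is arbitrary and error handling is abbreviated):
+//
+//   using namespace mlir::gc::gpu;
+//   llvm::Error example() {
+//     auto runtime = OclRuntime::get();            // default Intel GPU device
+//     if (!runtime)
+//       return runtime.takeError();
+//     auto queue = runtime->createQueue();
+//     if (!queue)
+//       return queue.takeError();
+//     auto buf = runtime->usmNewDev<float>(1024);  // 1024 floats in device USM
+//     if (!buf)
+//       return buf.takeError();
+//     assert(runtime->isUsm(*buf));
+//     if (auto freed = runtime->usmFree(*buf); !freed)
+//       return freed.takeError();
+//     if (auto released = runtime->releaseQueue(*queue); !released)
+//       return released.takeError();
+//     return llvm::Error::success();
+//   }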
+template <> struct std::hash<const mlir::gc::gpu::OclRuntime> {
+  std::size_t
+  operator()(const mlir::gc::gpu::OclRuntime &runtime) const noexcept {
+    return std::hash<cl_context>()(runtime.context) ^
+           std::hash<cl_device_id>()(runtime.device);
+  }
+};
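+// This specialization allows OclRuntime to be used as the key type of the
+// OclModuleBuilder::cache map below; XOR-combining the two handle hashes is
+// sufficient, since context and device are independent handles.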
+namespace mlir::gc::gpu {
+
+struct OclContext {
+  cl_command_queue const queue;
+  // Preserve the execution order. This is required in case of out-of-order
+  // execution (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE). When the execution
+  // is completed, the 'lastEvent' field contains the event of the last
+  // enqueued command. If 'preserveOrder' is false, 'waitList' is ignored.
+  const bool preserveOrder;
+  cl_event lastEvent;
+
+  explicit OclContext(cl_command_queue queue, cl_uint waitListLen = 0,
+                      cl_event *waitList = nullptr)
+      : OclContext(queue, OclRuntime::isOutOfOrder(queue), waitListLen,
+                   waitList) {}
+
+  explicit OclContext(cl_command_queue queue, bool preserveOrder,
+                      cl_uint waitListLen, cl_event *waitList)
+      : queue(queue), preserveOrder(preserveOrder), lastEvent(nullptr),
+        waitListLen(preserveOrder ? waitListLen : 0),
+        waitList(preserveOrder ? waitList : nullptr), runtime(nullptr),
+        clPtrs(nullptr) {
+    assert(!OclRuntime::isOutOfOrder(queue) || preserveOrder);
+    assert(preserveOrder || (waitListLen == 0 && waitList == nullptr));
+  }
+
+  void finish();
+
+private:
+  friend OclModule;
+  friend OclRuntime;
+  friend OclRuntime::Exports;
+  cl_uint waitListLen;
+  cl_event *waitList;
+  const OclRuntime *runtime;
+  std::unordered_set<void *> *clPtrs;
+
+  void setLastEvent(cl_event event) {
+    lastEvent = event;
+    if (event) {
+      waitListLen = 1;
+      waitList = &lastEvent;
+    } else {
+      waitListLen = 0;
+      waitList = nullptr;
+    }
+  }
+};
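+
+// Sketch: wrapping an existing command queue (illustrative only; `queue` is
+// assumed to come from OclRuntime::createQueue()):
+//
+//   OclContext ctx(queue);  // order is preserved iff the queue is out-of-order
+//   // ... run kernels via OclModule::exec(ctx, args) ...
+//   ctx.finish();           // wait for the enqueued work to complete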
+
+struct OclModule {
+  static constexpr int64_t ZERO = 0;
+  static constexpr auto ZERO_PTR = const_cast<int64_t *>(&ZERO);
+
+  // The main function arguments, in the format described at
+  // https://mlir.llvm.org/docs/TargetLLVMIR/#c-compatible-wrapper-emission.
+  // Note: the values are not copied, only the pointers are stored!
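+  // For example, a rank-2 memref expands into 3 + 2 * 2 separate arguments:
+  //   allocatedPtr, alignedPtr, offset, shape[0], shape[1], strides[0], strides[1]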
+  template <unsigned N> struct Args {
+
+    void add(void **alignedPtr, size_t rank, const int64_t *shape,
+             const int64_t *strides, bool isUsm = true) {
+      add(alignedPtr, alignedPtr, ZERO_PTR, rank, shape, strides, isUsm);
+    }
+
+    void add(void **allocatedPtr, void **alignedPtr, const int64_t *offset,
+             size_t rank, const int64_t *shape, const int64_t *strides,
+             bool isUsm = true) {
+#ifndef NDEBUG
+      assert(!isUsm || runtime->isUsm(*alignedPtr));
+      // It's recommended to have at least 16-byte alignment
+      assert(reinterpret_cast<std::uintptr_t>(*alignedPtr) % 16 == 0);
+#endif
+
+      args.emplace_back(allocatedPtr);
+      args.emplace_back(alignedPtr);
+      args.emplace_back(const_cast<int64_t *>(offset));
+      for (size_t i = 0; i < rank; i++) {
+        args.emplace_back(const_cast<int64_t *>(&shape[i]));
+      }
+      for (size_t i = 0; i < rank; i++) {
+        args.emplace_back(const_cast<int64_t *>(&strides[i]));
+      }
+      if (!isUsm) {
+        clPtrs.insert(alignedPtr);
+      }
+    }
+
+    template <typename T>
+    void add(T **alignedPtr, size_t rank, const int64_t *shape,
+             const int64_t *strides, bool isUsm = true) {
+      add(reinterpret_cast<void **>(alignedPtr), rank, shape, strides, isUsm);
+    }
+
+    template <typename T>
+    void add(T **allocatedPtr, T **alignedPtr, const int64_t *offset,
+             size_t rank, const int64_t *shape, const int64_t *strides,
+             bool isUsm = true) {
+      add(reinterpret_cast<void **>(allocatedPtr),
+          reinterpret_cast<void **>(alignedPtr), offset, rank, shape, strides,
+          isUsm);
+    }
+
+    void clear() {
+      args.clear();
+      clPtrs.clear();
+    }
+
+  private:
+    friend OclModule;
+    SmallVector<void *, N + 3> args;
+    // Contains the pointers of all non-USM arguments. The arguments are
+    // expected to be either USM or CL pointers, and most probably USM; thus,
+    // in most cases, this set will be empty.
+    std::unordered_set<void *> clPtrs;
+#ifdef NDEBUG
+    explicit Args() = default;
+#else
+    const OclRuntime *runtime;
+    explicit Args(const OclRuntime *runtime) : runtime(runtime) {}
+#endif
+  };
+
+  using MainFunc = void (*)(void **);
+
+  explicit OclModule(const OclRuntime &runtime,
+                     std::unique_ptr<ExecutionEngine> engine, MainFunc main)
+      : runtime(runtime), engine(std::move(engine)), main(main) {}
+
+#ifdef NDEBUG
+  template <unsigned N = 64> Args<N> args() const { return Args<N>(); }
+#else
+  template <unsigned N = 64> Args<N> args() const { return Args<N>(&runtime); }
+#endif
+
+  template <unsigned N> void exec(OclContext &ctx, Args<N> &args) const {
+#ifndef NDEBUG
+    auto rt = OclRuntime::get(ctx.queue);
+    assert(rt);
+    assert(*rt == this->runtime);
+#endif
+    auto size = args.args.size();
+    auto ctxPtr = &ctx;
+    ctx.runtime = &runtime;
+    ctx.clPtrs = &args.clPtrs;
+    args.args.emplace_back(&ctxPtr);
+    args.args.emplace_back(&ctxPtr);
+    args.args.emplace_back(ZERO_PTR);
+    main(args.args.data());
+    args.args.truncate(size);
+  }
+
+  ~OclModule();
+  OclModule(const OclModule &) = delete;
+  OclModule &operator=(const OclModule &) = delete;
+  OclModule(const OclModule &&) = delete;
+  OclModule &operator=(const OclModule &&) = delete;
+
+private:
+  OclRuntime runtime;
+  std::unique_ptr<ExecutionEngine> engine;
+  MainFunc main;
+};
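+
+// Sketch: executing a compiled module (illustrative only; `mod` is a
+// std::shared_ptr<const OclModule> returned by OclModuleBuilder::build(),
+// `in`/`out` are USM pointers, and, since only pointers are stored, the
+// shape/stride arrays must outlive the exec() call):
+//
+//   OclContext ctx(queue);
+//   auto args = mod->args();
+//   int64_t shape[] = {64, 64};
+//   int64_t strides[] = {64, 1};
+//   args.add(&in, 2, shape, strides);
+//   args.add(&out, 2, shape, strides);
+//   mod->exec(ctx, args);
+//   ctx.finish();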
+
+struct OclModuleBuilder {
+  friend OclRuntime;
+  explicit OclModuleBuilder(ModuleOp module);
+  explicit OclModuleBuilder(OwningOpRef<ModuleOp> &module)
+      : OclModuleBuilder(module.release()) {}
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(const OclRuntime &runtime);
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(cl_command_queue queue);
+
+  llvm::Expected<std::shared_ptr<const OclModule>> build(cl_context context,
+                                                         cl_device_id device);
+
+private:
+  std::shared_mutex mux;
+  ModuleOp mlirModule;
+  SmallString<32> funcName;
+  std::unordered_map<const OclRuntime, std::shared_ptr<const OclModule>> cache;
+};
+} // namespace mlir::gc::gpu
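+
+// Sketch: compiling an MLIR module and caching the result per runtime
+// (illustrative only; `mlirModule` is assumed to contain an entry function
+// in the expected format):
+//
+//   mlir::gc::gpu::OclModuleBuilder builder(mlirModule);
+//   auto mod = builder.build(queue);  // the OclRuntime is derived from the queue
+//   if (!mod)
+//     llvm::report_fatal_error(mod.takeError());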
 #else
 #undef GC_GPU_OCL_CONST_ONLY
 #endif