19
19
#include " queue.hpp"
20
20
#include " ur2offload.hpp"
21
21
22
+ namespace {
23
+ ol_result_t waitOnEvents (ol_queue_handle_t Queue,
24
+ const ur_event_handle_t *UrEvents, size_t NumEvents) {
25
+ if (NumEvents) {
26
+ std::vector<ol_event_handle_t > OlEvents;
27
+ OlEvents.reserve (NumEvents);
28
+ for (size_t I = 0 ; I < NumEvents; I++) {
29
+ OlEvents.push_back (UrEvents[I]->OffloadEvent );
30
+ }
31
+
32
+ return olWaitEvents (Queue, OlEvents.data (), NumEvents);
33
+ }
34
+ return OL_SUCCESS;
35
+ }
36
+
37
+ ol_result_t makeEvent (ur_command_t Type, ol_queue_handle_t OlQueue,
38
+ ur_queue_handle_t UrQueue, ur_event_handle_t *UrEvent) {
39
+ if (UrEvent) {
40
+ auto *Event = new ur_event_handle_t_ (Type, UrQueue);
41
+ if (auto Res = olCreateEvent (OlQueue, &Event->OffloadEvent )) {
42
+ delete Event;
43
+ return Res;
44
+ };
45
+ *UrEvent = Event;
46
+ }
47
+ return OL_SUCCESS;
48
+ }
49
+
50
// Shared implementation of urEnqueueEventsWait (Barrier == false) and
// urEnqueueEventsWaitWithBarrier (Barrier == true).
//
// Makes the next work submitted to hQueue wait on phEventWaitList (or, when
// the list is empty, on all previously enqueued work), optionally records a
// completion event through phEvent, and — in the barrier case — also forces
// every other underlying Offload queue to wait on the barrier point.
//
// Thread-safety: the whole operation runs under hQueue->OooMutex so the set
// of underlying queues and hQueue->Barrier are not mutated concurrently.
template <bool Barrier>
ur_result_t doWait(ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
                   const ur_event_handle_t *phEventWaitList,
                   ur_event_handle_t *phEvent) {
  std::lock_guard<std::mutex> Lock(hQueue->OooMutex);
  // The UR command type recorded on any event we hand back to the caller.
  constexpr ur_command_t TYPE =
      Barrier ? UR_COMMAND_EVENTS_WAIT_WITH_BARRIER : UR_COMMAND_EVENTS_WAIT;
  ol_queue_handle_t TargetQueue;
  if (!numEventsInWaitList && hQueue->isInOrder()) {
    // In order queue so all work is done in submission order, so it's a
    // no-op
    if (phEvent) {
      // Still need a real event marking "everything so far" for the caller.
      OL_RETURN_ON_ERR(hQueue->nextQueueNoLock(TargetQueue));
      OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));
    }
    return UR_RESULT_SUCCESS;
  }
  OL_RETURN_ON_ERR(hQueue->nextQueueNoLock(TargetQueue));

  if (!numEventsInWaitList) {
    // "If the event list is empty, it waits for all previously enqueued
    // commands to complete."

    // Create events on each active queue for an arbitrary thread to block on
    // TODO: Can we efficiently check if each thread is "finished" rather than
    // creating an event?
    // NOTE(review): these handles are never explicitly destroyed here —
    // presumably olWaitEvents/queue teardown owns them; confirm no leak.
    std::vector<ol_event_handle_t> OffloadHandles{};
    for (auto *Q : hQueue->OffloadQueues) {
      // A null entry terminates the list of active queues.
      if (Q == nullptr) {
        break;
      }
      // No need for TargetQueue to wait on itself.
      if (Q == TargetQueue) {
        continue;
      }
      OL_RETURN_ON_ERR(olCreateEvent(Q, &OffloadHandles.emplace_back()));
    }
    OL_RETURN_ON_ERR(olWaitEvents(TargetQueue, OffloadHandles.data(),
                                  OffloadHandles.size()));
  } else {
    // Explicit wait list: block TargetQueue on exactly those events.
    OL_RETURN_ON_ERR(
        waitOnEvents(TargetQueue, phEventWaitList, numEventsInWaitList));
  }

  // Record the completion event for the caller, if requested.
  OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));

  if constexpr (Barrier) {
    ol_event_handle_t BarrierEvent;
    if (phEvent) {
      // Reuse the caller-visible event as the barrier marker.
      // NOTE(review): hQueue->Barrier then aliases the UR event's
      // OffloadEvent; confirm the olDestroyEvent below cannot double-free
      // against the UR event's own release path.
      BarrierEvent = (*phEvent)->OffloadEvent;
    } else {
      OL_RETURN_ON_ERR(olCreateEvent(TargetQueue, &BarrierEvent));
    }

    // Ensure any newly created work waits on this barrier
    if (hQueue->Barrier) {
      // Replace (and release) any previously installed barrier.
      OL_RETURN_ON_ERR(olDestroyEvent(hQueue->Barrier));
    }
    hQueue->Barrier = BarrierEvent;

    // Block all existing threads on the barrier
    for (auto *Q : hQueue->OffloadQueues) {
      if (Q == nullptr) {
        break;
      }
      if (Q == TargetQueue) {
        continue;
      }
      OL_RETURN_ON_ERR(olWaitEvents(Q, &BarrierEvent, 1));
    }
  }

  return UR_RESULT_SUCCESS;
}
123
+ } // namespace
124
+
125
+ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait (
126
+ ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
127
+ const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
128
+ return doWait<false >(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
129
+ }
130
+
131
+ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier (
132
+ ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
133
+ const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
134
+ return doWait<true >(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
135
+ }
136
+
22
137
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch (
23
138
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
24
139
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
25
140
const size_t *pLocalWorkSize, uint32_t , const ur_kernel_launch_property_t *,
26
141
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
27
142
ur_event_handle_t *phEvent) {
28
- // Ignore wait list for now
29
- (void )numEventsInWaitList;
30
- (void )phEventWaitList;
31
- //
143
+ ol_queue_handle_t Queue;
144
+ OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
145
+ OL_RETURN_ON_ERR (waitOnEvents (Queue, phEventWaitList, numEventsInWaitList));
32
146
33
147
(void )pGlobalWorkOffset;
34
148
@@ -67,20 +181,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
67
181
LaunchArgs.GroupSize .z = GroupSize[2 ];
68
182
LaunchArgs.DynSharedMemory = 0 ;
69
183
70
- ol_queue_handle_t Queue;
71
- OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
72
184
OL_RETURN_ON_ERR (olLaunchKernel (
73
185
Queue, hQueue->OffloadDevice , hKernel->OffloadKernel ,
74
186
hKernel->Args .getStorage (), hKernel->Args .getStorageSize (), &LaunchArgs));
75
187
76
- if (phEvent) {
77
- auto *Event = new ur_event_handle_t_ (UR_COMMAND_KERNEL_LAUNCH, hQueue);
78
- if (auto Res = olCreateEvent (Queue, &Event->OffloadEvent )) {
79
- delete Event;
80
- return offloadResultToUR (Res);
81
- };
82
- *phEvent = Event;
83
- }
188
+ OL_RETURN_ON_ERR (makeEvent (UR_COMMAND_KERNEL_LAUNCH, Queue, hQueue, phEvent));
84
189
return UR_RESULT_SUCCESS;
85
190
}
86
191
@@ -103,10 +208,9 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
103
208
size_t size, bool blocking, uint32_t numEventsInWaitList,
104
209
const ur_event_handle_t *phEventWaitList,
105
210
ur_event_handle_t *phEvent) {
106
- // Ignore wait list for now
107
- (void )numEventsInWaitList;
108
- (void )phEventWaitList;
109
- //
211
+ ol_queue_handle_t Queue;
212
+ OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
213
+ OL_RETURN_ON_ERR (waitOnEvents (Queue, phEventWaitList, numEventsInWaitList));
110
214
111
215
if (blocking) {
112
216
OL_RETURN_ON_ERR (
@@ -117,8 +221,6 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
117
221
return UR_RESULT_SUCCESS;
118
222
}
119
223
120
- ol_queue_handle_t Queue;
121
- OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
122
224
OL_RETURN_ON_ERR (
123
225
olMemcpy (Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size));
124
226
if (phEvent) {
@@ -192,17 +294,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
192
294
numEventsInWaitList, phEventWaitList, phEvent);
193
295
}
194
296
195
- ur_result_t enqueueNoOp (ur_command_t Type, ur_queue_handle_t hQueue,
196
- ur_event_handle_t *phEvent) {
197
- // This path is a no-op, but we can't output a real event because
198
- // Offload doesn't currently support creating arbitrary events, and we
199
- // don't know the last real event in the queue. Instead we just have to
200
- // wait on the whole queue and then return an empty (implicitly
201
- // finished) event.
202
- *phEvent = ur_event_handle_t_::createEmptyEvent (Type, hQueue);
203
- return urQueueFinish (hQueue);
204
- }
205
-
206
297
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap (
207
298
ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap,
208
299
ur_map_flags_t mapFlags, size_t offset, size_t size,
@@ -226,15 +317,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
226
317
Result = urEnqueueMemBufferRead (hQueue, hBuffer, blockingMap, offset, size,
227
318
MapPtr, numEventsInWaitList,
228
319
phEventWaitList, phEvent);
229
- } else {
230
- if (IsPinned) {
231
- // TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
232
- // implemented we can call it on the wait list.
233
- }
234
-
235
- if (phEvent) {
236
- enqueueNoOp (UR_COMMAND_MEM_BUFFER_MAP, hQueue, phEvent);
320
+ } else if (numEventsInWaitList || phEvent) {
321
+ ol_queue_handle_t Queue;
322
+ OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
323
+ if ((!hQueue->isInOrder () && phEvent) || hQueue->isInOrder ()) {
324
+ // Out-of-order queues running no-op work only have side effects if there
325
+ // is an output event
326
+ waitOnEvents (Queue, phEventWaitList, numEventsInWaitList);
237
327
}
328
+ OL_RETURN_ON_ERR (
329
+ makeEvent (UR_COMMAND_MEM_BUFFER_MAP, Queue, hQueue, phEvent));
238
330
}
239
331
*ppRetMap = MapPtr;
240
332
@@ -260,15 +352,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
260
352
Result = urEnqueueMemBufferWrite (
261
353
hQueue, hMem, true , Map->MapOffset , Map->MapSize , pMappedPtr,
262
354
numEventsInWaitList, phEventWaitList, phEvent);
263
- } else {
264
- if (IsPinned) {
265
- // TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
266
- // implemented we can call it on the wait list.
267
- }
268
-
269
- if (phEvent) {
270
- enqueueNoOp (UR_COMMAND_MEM_UNMAP, hQueue, phEvent);
355
+ } else if (numEventsInWaitList || phEvent) {
356
+ ol_queue_handle_t Queue;
357
+ OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
358
+ if ((!hQueue->isInOrder () && phEvent) || hQueue->isInOrder ()) {
359
+ // Out-of-order queues running no-op work only have side effects if there
360
+ // is an output event
361
+ waitOnEvents (Queue, phEventWaitList, numEventsInWaitList);
271
362
}
363
+ OL_RETURN_ON_ERR (makeEvent (UR_COMMAND_MEM_UNMAP, Queue, hQueue, phEvent));
272
364
}
273
365
BufferImpl.unmap (pMappedPtr);
274
366
0 commit comments