19
19
#include " queue.hpp"
20
20
#include " ur2offload.hpp"
21
21
22
+ namespace {
23
+ ol_result_t waitOnEvents (ol_queue_handle_t Queue,
24
+ const ur_event_handle_t *UrEvents, size_t NumEvents) {
25
+ if (NumEvents) {
26
+ std::vector<ol_event_handle_t > OlEvents;
27
+ OlEvents.reserve (NumEvents);
28
+ for (size_t I = 0 ; I < NumEvents; I++) {
29
+ OlEvents.push_back (UrEvents[I]->OffloadEvent );
30
+ }
31
+
32
+ olWaitEvents (Queue, OlEvents.data (), NumEvents);
33
+ }
34
+ return nullptr ;
35
+ }
36
+
37
+ ol_result_t makeEvent (ur_command_t Type, ol_queue_handle_t OlQueue,
38
+ ur_queue_handle_t UrQueue, ur_event_handle_t *UrEvent) {
39
+ if (UrEvent) {
40
+ auto *Event = new ur_event_handle_t_ (Type, UrQueue);
41
+ if (auto Res = olCreateEvent (OlQueue, &Event->OffloadEvent )) {
42
+ delete Event;
43
+ return Res;
44
+ };
45
+ *UrEvent = Event;
46
+ }
47
+ return nullptr ;
48
+ }
49
+
50
// Shared implementation of urEnqueueEventsWait (Barrier = false) and
// urEnqueueEventsWaitWithBarrier (Barrier = true).
//
// Makes hQueue's next work depend on `phEventWaitList` (or, when the list is
// empty, on all previously enqueued commands), optionally emitting an event in
// *phEvent. In the Barrier case, all of the queue's worker queues are also
// blocked on the barrier event.
template <bool Barrier>
ur_result_t doWait(ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
                   const ur_event_handle_t *phEventWaitList,
                   ur_event_handle_t *phEvent) {
  constexpr ur_command_t TYPE =
      Barrier ? UR_COMMAND_EVENTS_WAIT_WITH_BARRIER : UR_COMMAND_EVENTS_WAIT;
  ol_queue_handle_t TargetQueue;
  if (!numEventsInWaitList && hQueue->isInOrder()) {
    // In order queue so all work is done in submission order, so it's a
    // no-op
    if (phEvent) {
      // Still need a real event marking "everything so far is done".
      OL_RETURN_ON_ERR(hQueue->nextQueue(TargetQueue));
      OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));
    }
    return UR_RESULT_SUCCESS;
  }
  OL_RETURN_ON_ERR(hQueue->nextQueue(TargetQueue));

  if (!numEventsInWaitList) {
    // "If the event list is empty, it waits for all previously enqueued
    // commands to complete."

    // Create events on each active queue for an arbitrary thread to block on
    // TODO: Can we efficiently check if each thread is "finished" rather than
    // creating an event?
    std::vector<ol_event_handle_t> OffloadHandles{};
    for (auto *Q : hQueue->OffloadQueues) {
      if (Q == nullptr) {
        // NOTE(review): assumes OffloadQueues is filled front-to-back so the
        // first null terminates the active set — confirm against queue.hpp.
        break;
      }
      if (Q == TargetQueue) {
        // No point making the target queue wait on itself.
        continue;
      }
      OL_RETURN_ON_ERR(olCreateEvent(Q, &OffloadHandles.emplace_back()));
    }
    OL_RETURN_ON_ERR(olWaitEvents(TargetQueue, OffloadHandles.data(),
                                  OffloadHandles.size()));
  } else {
    // Explicit wait list: block TargetQueue on each listed event.
    OL_RETURN_ON_ERR(
        waitOnEvents(TargetQueue, phEventWaitList, numEventsInWaitList));
  }

  // Emit the caller-visible event (no-op if phEvent is null).
  OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));

  if constexpr (Barrier) {
    // Reuse the caller's event as the barrier when one was requested;
    // otherwise mint a dedicated event on the target queue.
    ol_event_handle_t BarrierEvent;
    if (phEvent) {
      BarrierEvent = (*phEvent)->OffloadEvent;
    } else {
      OL_RETURN_ON_ERR(olCreateEvent(TargetQueue, &BarrierEvent));
    }

    // Ensure any newly created work waits on this barrier
    hQueue->Barrier.store(BarrierEvent);

    // Block all existing threads on the barrier
    for (auto *Q : hQueue->OffloadQueues) {
      if (Q == nullptr) {
        break;
      }
      if (Q == TargetQueue) {
        // TargetQueue produced the barrier event; it need not wait on it.
        continue;
      }
      OL_RETURN_ON_ERR(olWaitEvents(Q, &BarrierEvent, 1));
    }
  }

  return UR_RESULT_SUCCESS;
}
119
+ } // namespace
120
+
121
+ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait (
122
+ ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
123
+ const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
124
+ return doWait<false >(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
125
+ }
126
+
127
+ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier (
128
+ ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
129
+ const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
130
+ return doWait<true >(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
131
+ }
132
+
22
133
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch (
23
134
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
24
135
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
25
136
const size_t *pLocalWorkSize, uint32_t , const ur_kernel_launch_property_t *,
26
137
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
27
138
ur_event_handle_t *phEvent) {
28
- // Ignore wait list for now
29
- (void )numEventsInWaitList;
30
- (void )phEventWaitList;
31
- //
139
+ ol_queue_handle_t Queue;
140
+ OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
141
+ OL_RETURN_ON_ERR (waitOnEvents (Queue, phEventWaitList, numEventsInWaitList));
32
142
33
143
(void )pGlobalWorkOffset;
34
144
@@ -67,20 +177,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
67
177
LaunchArgs.GroupSize .z = GroupSize[2 ];
68
178
LaunchArgs.DynSharedMemory = 0 ;
69
179
70
- ol_queue_handle_t Queue;
71
- OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
72
180
OL_RETURN_ON_ERR (olLaunchKernel (
73
181
Queue, hQueue->OffloadDevice , hKernel->OffloadKernel ,
74
182
hKernel->Args .getStorage (), hKernel->Args .getStorageSize (), &LaunchArgs));
75
183
76
- if (phEvent) {
77
- auto *Event = new ur_event_handle_t_ (UR_COMMAND_KERNEL_LAUNCH, hQueue);
78
- if (auto Res = olCreateEvent (Queue, &Event->OffloadEvent )) {
79
- delete Event;
80
- return offloadResultToUR (Res);
81
- };
82
- *phEvent = Event;
83
- }
184
+ OL_RETURN_ON_ERR (makeEvent (UR_COMMAND_KERNEL_LAUNCH, Queue, hQueue, phEvent));
84
185
return UR_RESULT_SUCCESS;
85
186
}
86
187
@@ -103,10 +204,9 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
103
204
size_t size, bool blocking, uint32_t numEventsInWaitList,
104
205
const ur_event_handle_t *phEventWaitList,
105
206
ur_event_handle_t *phEvent) {
106
- // Ignore wait list for now
107
- (void )numEventsInWaitList;
108
- (void )phEventWaitList;
109
- //
207
+ ol_queue_handle_t Queue;
208
+ OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
209
+ OL_RETURN_ON_ERR (waitOnEvents (Queue, phEventWaitList, numEventsInWaitList));
110
210
111
211
if (blocking) {
112
212
OL_RETURN_ON_ERR (
@@ -117,8 +217,6 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
117
217
return UR_RESULT_SUCCESS;
118
218
}
119
219
120
- ol_queue_handle_t Queue;
121
- OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
122
220
OL_RETURN_ON_ERR (
123
221
olMemcpy (Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size));
124
222
if (phEvent) {
@@ -192,17 +290,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
192
290
numEventsInWaitList, phEventWaitList, phEvent);
193
291
}
194
292
195
- ur_result_t enqueueNoOp (ur_command_t Type, ur_queue_handle_t hQueue,
196
- ur_event_handle_t *phEvent) {
197
- // This path is a no-op, but we can't output a real event because
198
- // Offload doesn't currently support creating arbitrary events, and we
199
- // don't know the last real event in the queue. Instead we just have to
200
- // wait on the whole queue and then return an empty (implicitly
201
- // finished) event.
202
- *phEvent = ur_event_handle_t_::createEmptyEvent (Type, hQueue);
203
- return urQueueFinish (hQueue);
204
- }
205
-
206
293
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap (
207
294
ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap,
208
295
ur_map_flags_t mapFlags, size_t offset, size_t size,
@@ -226,15 +313,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
226
313
Result = urEnqueueMemBufferRead (hQueue, hBuffer, blockingMap, offset, size,
227
314
MapPtr, numEventsInWaitList,
228
315
phEventWaitList, phEvent);
229
- } else {
230
- if (IsPinned) {
231
- // TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
232
- // implemented we can call it on the wait list.
233
- }
234
-
235
- if (phEvent) {
236
- enqueueNoOp (UR_COMMAND_MEM_BUFFER_MAP, hQueue, phEvent);
316
+ } else if (numEventsInWaitList || phEvent) {
317
+ ol_queue_handle_t Queue;
318
+ OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
319
+ if ((!hQueue->isInOrder () && phEvent) || hQueue->isInOrder ()) {
320
+ // Out-of-order queues running no-op work only have side effects if there
321
+ // is an output event
322
+ waitOnEvents (Queue, phEventWaitList, numEventsInWaitList);
237
323
}
324
+ OL_RETURN_ON_ERR (
325
+ makeEvent (UR_COMMAND_MEM_BUFFER_MAP, Queue, hQueue, phEvent));
238
326
}
239
327
*ppRetMap = MapPtr;
240
328
@@ -260,15 +348,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
260
348
Result = urEnqueueMemBufferWrite (
261
349
hQueue, hMem, true , Map->MapOffset , Map->MapSize , pMappedPtr,
262
350
numEventsInWaitList, phEventWaitList, phEvent);
263
- } else {
264
- if (IsPinned) {
265
- // TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
266
- // implemented we can call it on the wait list.
267
- }
268
-
269
- if (phEvent) {
270
- enqueueNoOp (UR_COMMAND_MEM_UNMAP, hQueue, phEvent);
351
+ } else if (numEventsInWaitList || phEvent) {
352
+ ol_queue_handle_t Queue;
353
+ OL_RETURN_ON_ERR (hQueue->nextQueue (Queue));
354
+ if ((!hQueue->isInOrder () && phEvent) || hQueue->isInOrder ()) {
355
+ // Out-of-order queues running no-op work only have side effects if there
356
+ // is an output event
357
+ waitOnEvents (Queue, phEventWaitList, numEventsInWaitList);
271
358
}
359
+ OL_RETURN_ON_ERR (
360
+ makeEvent (UR_COMMAND_MEM_BUFFER_MAP, Queue, hQueue, phEvent));
272
361
}
273
362
BufferImpl.unmap (pMappedPtr);
274
363
0 commit comments