Skip to content

Commit ef3162f

Browse files
committed
[UR][Offload] Event waiting
Implement urEnqueueEventsWait[WithBarrier] and respect the event wait lists of enqueue functions.
1 parent d524f4b commit ef3162f

File tree

3 files changed

+150
-54
lines changed

3 files changed

+150
-54
lines changed

unified-runtime/source/adapters/offload/enqueue.cpp

Lines changed: 135 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,126 @@
1919
#include "queue.hpp"
2020
#include "ur2offload.hpp"
2121

22+
namespace {
23+
// Enqueue a wait on `Queue` for the Offload events backing the given UR
// events.
//
// Queue     - the offload queue the wait is enqueued on.
// UrEvents  - array of NumEvents UR events whose OffloadEvent handles are
//             waited on; may be null when NumEvents is 0.
// NumEvents - number of events; 0 is a no-op.
//
// Returns the olWaitEvents result, or nullptr on success / empty list.
ol_result_t waitOnEvents(ol_queue_handle_t Queue,
                         const ur_event_handle_t *UrEvents, size_t NumEvents) {
  if (NumEvents) {
    std::vector<ol_event_handle_t> OlEvents;
    OlEvents.reserve(NumEvents);
    for (size_t I = 0; I < NumEvents; I++) {
      OlEvents.push_back(UrEvents[I]->OffloadEvent);
    }

    // Bug fix: propagate a failed olWaitEvents to the caller instead of
    // silently discarding the error and reporting success.
    return olWaitEvents(Queue, OlEvents.data(), NumEvents);
  }
  return nullptr;
}
36+
37+
// Create a UR event of command type `Type` recording the current tail of
// `OlQueue`, and store it through `UrEvent`.
//
// Type    - the UR command the event is associated with.
// OlQueue - offload queue the underlying event is created on.
// UrQueue - owning UR queue recorded in the new event.
// UrEvent - optional out-parameter; when null this is a no-op.
//
// Returns the olCreateEvent result, or nullptr on success / no-op.
ol_result_t makeEvent(ur_command_t Type, ol_queue_handle_t OlQueue,
                      ur_queue_handle_t UrQueue, ur_event_handle_t *UrEvent) {
  if (!UrEvent) {
    return nullptr;
  }

  auto *NewEvent = new ur_event_handle_t_(Type, UrQueue);
  if (ol_result_t Res = olCreateEvent(OlQueue, &NewEvent->OffloadEvent)) {
    // Creation failed; don't leak the partially-constructed UR event.
    delete NewEvent;
    return Res;
  }

  *UrEvent = NewEvent;
  return nullptr;
}
49+
50+
// Shared implementation of urEnqueueEventsWait (Barrier == false) and
// urEnqueueEventsWaitWithBarrier (Barrier == true).
//
// hQueue              - UR queue the wait is enqueued on.
// numEventsInWaitList - number of entries in phEventWaitList; 0 means "wait
//                       for all previously enqueued commands".
// phEventWaitList     - events that must complete before the wait finishes.
// phEvent             - optional out-event signalled once the wait completes.
template <bool Barrier>
ur_result_t doWait(ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
                   const ur_event_handle_t *phEventWaitList,
                   ur_event_handle_t *phEvent) {
  constexpr ur_command_t TYPE =
      Barrier ? UR_COMMAND_EVENTS_WAIT_WITH_BARRIER : UR_COMMAND_EVENTS_WAIT;
  ol_queue_handle_t TargetQueue;
  if (!numEventsInWaitList && hQueue->isInOrder()) {
    // In order queue so all work is done in submission order, so it's a
    // no-op
    if (phEvent) {
      // Still need a real event marking the current tail of the queue.
      OL_RETURN_ON_ERR(hQueue->nextQueue(TargetQueue));
      OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));
    }
    return UR_RESULT_SUCCESS;
  }
  OL_RETURN_ON_ERR(hQueue->nextQueue(TargetQueue));

  if (!numEventsInWaitList) {
    // "If the event list is empty, it waits for all previously enqueued
    // commands to complete."

    // Create events on each active queue for an arbitrary thread to block on
    // TODO: Can we efficiently check if each thread is "finished" rather than
    // creating an event?
    std::vector<ol_event_handle_t> OffloadHandles{};
    for (auto *Q : hQueue->OffloadQueues) {
      // Queues are created lazily and in order; a null slot means no later
      // slot is in use either.
      if (Q == nullptr) {
        break;
      }
      // TargetQueue's own prior work is already ordered before this wait.
      if (Q == TargetQueue) {
        continue;
      }
      OL_RETURN_ON_ERR(olCreateEvent(Q, &OffloadHandles.emplace_back()));
    }
    // NOTE(review): OffloadHandles may be empty (only TargetQueue active);
    // assumes olWaitEvents accepts a zero-length list — TODO confirm.
    OL_RETURN_ON_ERR(olWaitEvents(TargetQueue, OffloadHandles.data(),
                                  OffloadHandles.size()));
  } else {
    // Explicit wait list: block TargetQueue on exactly those events.
    OL_RETURN_ON_ERR(
        waitOnEvents(TargetQueue, phEventWaitList, numEventsInWaitList));
  }

  // Optional out-event marking completion of the wait itself.
  OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));

  if constexpr (Barrier) {
    ol_event_handle_t BarrierEvent;
    if (phEvent) {
      // Reuse the out-event just created on TargetQueue as the barrier.
      BarrierEvent = (*phEvent)->OffloadEvent;
    } else {
      // No out-event requested; create a standalone event to act as the
      // barrier. NOTE(review): ownership/release of this event is not
      // visible here — confirm it is not leaked.
      OL_RETURN_ON_ERR(olCreateEvent(TargetQueue, &BarrierEvent));
    }

    // Ensure any newly created work waits on this barrier
    // (nextQueue reads this atomic and makes lazily-created queues wait on
    // it, so queues that don't exist yet are covered too).
    hQueue->Barrier.store(BarrierEvent);

    // Block all existing threads on the barrier
    for (auto *Q : hQueue->OffloadQueues) {
      if (Q == nullptr) {
        break;
      }
      // TargetQueue is where the barrier event lives; it is already ordered
      // after it.
      if (Q == TargetQueue) {
        continue;
      }
      OL_RETURN_ON_ERR(olWaitEvents(Q, &BarrierEvent, 1));
    }
  }

  return UR_RESULT_SUCCESS;
}
119+
} // namespace
120+
121+
// Enqueue a wait on the given event list (or on all previously enqueued
// commands when the list is empty). Thin wrapper over doWait<>.
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
    ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
  // Plain wait: no barrier semantics.
  constexpr bool WithBarrier = false;
  return doWait<WithBarrier>(hQueue, numEventsInWaitList, phEventWaitList,
                             phEvent);
}
126+
127+
// As urEnqueueEventsWait, but additionally acts as a barrier: all work
// enqueued afterwards is ordered after this point. Thin wrapper over doWait<>.
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
    ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
  // Barrier variant: subsequent enqueues must wait on this point.
  constexpr bool WithBarrier = true;
  return doWait<WithBarrier>(hQueue, numEventsInWaitList, phEventWaitList,
                             phEvent);
}
132+
22133
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
23134
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
24135
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
25136
const size_t *pLocalWorkSize, uint32_t, const ur_kernel_launch_property_t *,
26137
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
27138
ur_event_handle_t *phEvent) {
28-
// Ignore wait list for now
29-
(void)numEventsInWaitList;
30-
(void)phEventWaitList;
31-
//
139+
ol_queue_handle_t Queue;
140+
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
141+
OL_RETURN_ON_ERR(waitOnEvents(Queue, phEventWaitList, numEventsInWaitList));
32142

33143
(void)pGlobalWorkOffset;
34144

@@ -67,20 +177,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
67177
LaunchArgs.GroupSize.z = GroupSize[2];
68178
LaunchArgs.DynSharedMemory = 0;
69179

70-
ol_queue_handle_t Queue;
71-
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
72180
OL_RETURN_ON_ERR(olLaunchKernel(
73181
Queue, hQueue->OffloadDevice, hKernel->OffloadKernel,
74182
hKernel->Args.getStorage(), hKernel->Args.getStorageSize(), &LaunchArgs));
75183

76-
if (phEvent) {
77-
auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
78-
if (auto Res = olCreateEvent(Queue, &Event->OffloadEvent)) {
79-
delete Event;
80-
return offloadResultToUR(Res);
81-
};
82-
*phEvent = Event;
83-
}
184+
OL_RETURN_ON_ERR(makeEvent(UR_COMMAND_KERNEL_LAUNCH, Queue, hQueue, phEvent));
84185
return UR_RESULT_SUCCESS;
85186
}
86187

@@ -103,10 +204,9 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
103204
size_t size, bool blocking, uint32_t numEventsInWaitList,
104205
const ur_event_handle_t *phEventWaitList,
105206
ur_event_handle_t *phEvent) {
106-
// Ignore wait list for now
107-
(void)numEventsInWaitList;
108-
(void)phEventWaitList;
109-
//
207+
ol_queue_handle_t Queue;
208+
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
209+
OL_RETURN_ON_ERR(waitOnEvents(Queue, phEventWaitList, numEventsInWaitList));
110210

111211
if (blocking) {
112212
OL_RETURN_ON_ERR(
@@ -117,8 +217,6 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
117217
return UR_RESULT_SUCCESS;
118218
}
119219

120-
ol_queue_handle_t Queue;
121-
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
122220
OL_RETURN_ON_ERR(
123221
olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size));
124222
if (phEvent) {
@@ -192,17 +290,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
192290
numEventsInWaitList, phEventWaitList, phEvent);
193291
}
194292

195-
ur_result_t enqueueNoOp(ur_command_t Type, ur_queue_handle_t hQueue,
196-
ur_event_handle_t *phEvent) {
197-
// This path is a no-op, but we can't output a real event because
198-
// Offload doesn't currently support creating arbitrary events, and we
199-
// don't know the last real event in the queue. Instead we just have to
200-
// wait on the whole queue and then return an empty (implicitly
201-
// finished) event.
202-
*phEvent = ur_event_handle_t_::createEmptyEvent(Type, hQueue);
203-
return urQueueFinish(hQueue);
204-
}
205-
206293
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
207294
ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap,
208295
ur_map_flags_t mapFlags, size_t offset, size_t size,
@@ -226,15 +313,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
226313
Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size,
227314
MapPtr, numEventsInWaitList,
228315
phEventWaitList, phEvent);
229-
} else {
230-
if (IsPinned) {
231-
// TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
232-
// implemented we can call it on the wait list.
233-
}
234-
235-
if (phEvent) {
236-
enqueueNoOp(UR_COMMAND_MEM_BUFFER_MAP, hQueue, phEvent);
316+
} else if (numEventsInWaitList || phEvent) {
317+
ol_queue_handle_t Queue;
318+
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
319+
if ((!hQueue->isInOrder() && phEvent) || hQueue->isInOrder()) {
320+
// Out-of-order queues running no-op work only have side effects if there
321+
// is an output event
322+
waitOnEvents(Queue, phEventWaitList, numEventsInWaitList);
237323
}
324+
OL_RETURN_ON_ERR(
325+
makeEvent(UR_COMMAND_MEM_BUFFER_MAP, Queue, hQueue, phEvent));
238326
}
239327
*ppRetMap = MapPtr;
240328

@@ -260,15 +348,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
260348
Result = urEnqueueMemBufferWrite(
261349
hQueue, hMem, true, Map->MapOffset, Map->MapSize, pMappedPtr,
262350
numEventsInWaitList, phEventWaitList, phEvent);
263-
} else {
264-
if (IsPinned) {
265-
// TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
266-
// implemented we can call it on the wait list.
267-
}
268-
269-
if (phEvent) {
270-
enqueueNoOp(UR_COMMAND_MEM_UNMAP, hQueue, phEvent);
351+
} else if (numEventsInWaitList || phEvent) {
352+
ol_queue_handle_t Queue;
353+
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
354+
if ((!hQueue->isInOrder() && phEvent) || hQueue->isInOrder()) {
355+
// Out-of-order queues running no-op work only have side effects if there
356+
// is an output event
357+
waitOnEvents(Queue, phEventWaitList, numEventsInWaitList);
271358
}
359+
OL_RETURN_ON_ERR(makeEvent(UR_COMMAND_MEM_UNMAP, Queue, hQueue, phEvent));
272360
}
273361
BufferImpl.unmap(pMappedPtr);
274362

unified-runtime/source/adapters/offload/queue.hpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ struct ur_queue_handle_t_ : RefCounted {
2323
: OffloadQueues((Flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)
2424
? 1
2525
: OOO_QUEUE_POOL_SIZE),
26-
QueueOffset(0), OffloadDevice(Device), UrContext(UrContext),
27-
Flags(Flags) {}
26+
QueueOffset(0), Barrier(nullptr), OffloadDevice(Device),
27+
UrContext(UrContext), Flags(Flags) {}
2828

2929
// In-order queues only have one element here, while out of order queues have
3030
// a bank of queues to use. We rotate through them round robin instead of
@@ -35,19 +35,27 @@ struct ur_queue_handle_t_ : RefCounted {
3535
// `stream_queue_t`. In the future, if we want more performance or it
3636
// simplifies the implementation of a feature, we can consider using it.
3737
std::vector<ol_queue_handle_t> OffloadQueues;
38-
size_t QueueOffset;
38+
std::atomic<size_t> QueueOffset;
39+
std::atomic<ol_event_handle_t> Barrier;
3940
ol_device_handle_t OffloadDevice;
4041
ur_context_handle_t UrContext;
4142
ur_queue_flags_t Flags;
4243

44+
bool isInOrder() const { return OffloadQueues.size() == 1; }
45+
4346
ol_result_t nextQueue(ol_queue_handle_t &Handle) {
44-
auto &Slot = OffloadQueues[QueueOffset++];
45-
QueueOffset %= OffloadQueues.size();
47+
auto &Slot = OffloadQueues[(QueueOffset++) % OffloadQueues.size()];
4648

4749
if (!Slot) {
4850
if (auto Res = olCreateQueue(OffloadDevice, &Slot)) {
4951
return Res;
5052
}
53+
54+
if (auto Event = Barrier.load()) {
55+
if (auto Res = olWaitEvents(Slot, &Event, 1)) {
56+
return Res;
57+
}
58+
}
5159
}
5260

5361
Handle = Slot;

unified-runtime/source/adapters/offload/ur_interface_loader.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
170170
}
171171
pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead;
172172
pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite;
173-
pDdiTable->pfnEventsWait = nullptr;
174-
pDdiTable->pfnEventsWaitWithBarrier = nullptr;
173+
pDdiTable->pfnEventsWait = urEnqueueEventsWait;
174+
pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier;
175175
pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch;
176176
pDdiTable->pfnMemBufferCopy = nullptr;
177177
pDdiTable->pfnMemBufferCopyRect = nullptr;

0 commit comments

Comments
 (0)