Skip to content

Commit 05538e0

Browse files
authored
[UR][Offload] Queue flag and out-of-order queue support (#19531)
Out-of-order support is similar to the CUDA and HIP adapters: each UR queue holds a pool of ol queues, and incoming tasks are assigned to them round-robin. The memcpy helper function now creates and destroys an ol queue of its own, since ol queues are cheap to create and there is no need to keep one around. Queue flags are now parsed by `urQueueCreate` and can be queried with `urQueueGetInfo`; the flags determine whether the queue is in-order or out-of-order.
1 parent 72919a0 commit 05538e0

File tree

4 files changed

+89
-23
lines changed

4 files changed

+89
-23
lines changed

unified-runtime/source/adapters/offload/device.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
121121
case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT:
122122
return ReturnValue(uint32_t{0});
123123
case UR_DEVICE_INFO_QUEUE_PROPERTIES:
124+
case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES:
125+
case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES:
126+
return ReturnValue(
127+
ur_queue_flags_t{UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE});
124128
case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES:
125129
return ReturnValue(0);
126130
case UR_DEVICE_INFO_SUPPORTED_PARTITIONS: {

unified-runtime/source/adapters/offload/enqueue.cpp

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
6868
LaunchArgs.DynSharedMemory = 0;
6969

7070
ol_event_handle_t EventOut;
71+
ol_queue_handle_t Queue;
72+
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
7173
OL_RETURN_ON_ERR(
72-
olLaunchKernel(hQueue->OffloadQueue, hQueue->OffloadDevice,
73-
hKernel->OffloadKernel, hKernel->Args.getStorage(),
74-
hKernel->Args.getStorageSize(), &LaunchArgs, &EventOut));
74+
olLaunchKernel(Queue, hQueue->OffloadDevice, hKernel->OffloadKernel,
75+
hKernel->Args.getStorage(), hKernel->Args.getStorageSize(),
76+
&LaunchArgs, &EventOut));
7577

7678
if (phEvent) {
7779
auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
@@ -105,15 +107,20 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
105107
(void)phEventWaitList;
106108
//
107109

108-
ol_event_handle_t EventOut = nullptr;
109-
110-
OL_RETURN_ON_ERR(olMemcpy(hQueue->OffloadQueue, DestPtr, DestDevice, SrcPtr,
111-
SrcDevice, size, phEvent ? &EventOut : nullptr));
112-
113110
if (blocking) {
114-
OL_RETURN_ON_ERR(olSyncQueue(hQueue->OffloadQueue));
111+
OL_RETURN_ON_ERR(olMemcpy(nullptr, DestPtr, DestDevice, SrcPtr, SrcDevice,
112+
size, nullptr));
113+
if (phEvent) {
114+
*phEvent = ur_event_handle_t_::createEmptyEvent(Command, hQueue);
115+
}
116+
return UR_RESULT_SUCCESS;
115117
}
116118

119+
ol_event_handle_t EventOut = nullptr;
120+
ol_queue_handle_t Queue;
121+
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
122+
OL_RETURN_ON_ERR(olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size,
123+
phEvent ? &EventOut : nullptr));
117124
if (phEvent) {
118125
auto *Event = new ur_event_handle_t_(Command, hQueue);
119126
Event->OffloadEvent = EventOut;

unified-runtime/source/adapters/offload/queue.cpp

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,30 @@
1717
#include "queue.hpp"
1818
#include "ur2offload.hpp"
1919

20-
UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext,
21-
ur_device_handle_t hDevice,
22-
const ur_queue_properties_t *,
23-
ur_queue_handle_t *phQueue) {
20+
UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
21+
[[maybe_unused]] ur_context_handle_t hContext, ur_device_handle_t hDevice,
22+
const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) {
2423

2524
assert(hContext->Device == hDevice);
2625

27-
ur_queue_handle_t Queue = new ur_queue_handle_t_();
28-
auto Res = olCreateQueue(hDevice->OffloadDevice, &Queue->OffloadQueue);
29-
if (Res != OL_SUCCESS) {
30-
delete Queue;
31-
return offloadResultToUR(Res);
26+
ur_queue_flags_t URFlags = 0;
27+
if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) {
28+
URFlags = pProps->flags;
3229
}
3330

34-
Queue->OffloadDevice = hDevice->OffloadDevice;
35-
Queue->UrContext = hContext;
31+
ur_queue_handle_t Queue =
32+
new ur_queue_handle_t_(hDevice->OffloadDevice, hContext, URFlags);
33+
34+
// For in-order queues, create the ol queue on construction so we can report
35+
// any errors earlier
36+
if (!(URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) {
37+
[[maybe_unused]] ol_queue_handle_t InitQueue;
38+
auto Res = Queue->nextQueue(InitQueue);
39+
if (Res != OL_SUCCESS) {
40+
delete Queue;
41+
return offloadResultToUR(Res);
42+
}
43+
}
3644

3745
*phQueue = Queue;
3846

@@ -47,6 +55,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
4755
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
4856

4957
switch (propName) {
58+
case UR_QUEUE_INFO_FLAGS:
59+
return ReturnValue(hQueue->Flags);
5060
case UR_QUEUE_INFO_REFERENCE_COUNT:
5161
return ReturnValue(hQueue->RefCount.load());
5262
default:
@@ -63,15 +73,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) {
6373

6474
/// Drops one reference to \p hQueue. When the reference count reaches zero,
/// every ol queue that was created for it is destroyed and the handle itself
/// is deleted.
///
/// Teardown is best-effort: once the count has hit zero the handle must not be
/// left half-destroyed, so a failing olDestroyQueue no longer aborts the loop.
/// All remaining ol queues are still destroyed, the handle is still freed, and
/// the first error encountered is the one reported.
///
/// \param hQueue Queue whose reference count is decremented.
/// \return UR_RESULT_SUCCESS, or the UR translation of the first
///         olDestroyQueue failure.
UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
  if (--hQueue->RefCount != 0) {
    return UR_RESULT_SUCCESS;
  }

  ur_result_t Result = UR_RESULT_SUCCESS;
  for (auto *Q : hQueue->OffloadQueues) {
    // Slots are filled lazily, so an empty slot is skipped rather than treated
    // as the end of the pool: a slot whose creation failed can be followed by
    // slots that were created successfully.
    if (!Q) {
      continue;
    }
    if (auto Res = olDestroyQueue(Q)) {
      if (Result == UR_RESULT_SUCCESS) {
        Result = offloadResultToUR(Res);
      }
    }
  }

  delete hQueue;
  return Result;
}
7287

7388
/// Blocks until the work submitted to every ol queue backing \p hQueue has
/// been synchronised with olSyncQueue.
///
/// \param hQueue Queue to wait on.
/// \return UR_RESULT_SUCCESS, or the error produced by OL_RETURN_ON_ERR for
///         the first olSyncQueue call that fails (later queues are then not
///         synchronised).
UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
  for (auto *Q : hQueue->OffloadQueues) {
    // Slots are filled lazily. Skip empty ones instead of stopping at the
    // first, so a slot whose creation failed cannot hide live queues that
    // come after it.
    if (Q == nullptr) {
      continue;
    }
    OL_RETURN_ON_ERR(olSyncQueue(Q));
  }
  return UR_RESULT_SUCCESS;
}
7697

7798
UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(

unified-runtime/source/adapters/offload/queue.hpp

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,42 @@
1515

1616
#include "common.hpp"
1717

18+
// Number of ol queue slots in the pool that a ur queue rotates through when it
// is backed by more than one ol queue (see ur_queue_handle_t_).
constexpr size_t OOO_QUEUE_POOL_SIZE{32};
19+
1820
struct ur_queue_handle_t_ : RefCounted {
19-
ol_queue_handle_t OffloadQueue;
21+
ur_queue_handle_t_(ol_device_handle_t Device, ur_context_handle_t UrContext,
22+
ur_queue_flags_t Flags)
23+
: OffloadQueues((Flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)
24+
? 1
25+
: OOO_QUEUE_POOL_SIZE),
26+
QueueOffset(0), OffloadDevice(Device), UrContext(UrContext),
27+
Flags(Flags) {}
28+
29+
// In-order queues only have one element here, while out of order queues have
30+
// a bank of queues to use. We rotate through them round robin instead of
31+
// constantly creating new ones in case there is a long-running program that
32+
// never destroys the ur queue. Out-of-order queues create ol queues when
33+
// needed; any queues that are not yet created are nullptr.
34+
// This is a simpler implementation of the HIP/Cuda queue pooling logic in
35+
// `stream_queue_t`. In the future, if we want more performance or it
36+
// simplifies the implementation of a feature, we can consider using it.
37+
std::vector<ol_queue_handle_t> OffloadQueues;
38+
size_t QueueOffset;
2039
ol_device_handle_t OffloadDevice;
2140
ur_context_handle_t UrContext;
41+
ur_queue_flags_t Flags;
42+
43+
ol_result_t nextQueue(ol_queue_handle_t &Handle) {
44+
auto &Slot = OffloadQueues[QueueOffset++];
45+
QueueOffset %= OffloadQueues.size();
46+
47+
if (!Slot) {
48+
if (auto Res = olCreateQueue(OffloadDevice, &Slot)) {
49+
return Res;
50+
}
51+
}
52+
53+
Handle = Slot;
54+
return nullptr;
55+
}
2256
};

0 commit comments

Comments
 (0)