Skip to content

Commit 4888933

Browse files
committed
[Offload] Design for async error handling
This updates the spec to provide a way for asynchronous errors to be signaled from, for example, kernels. The error is stored on the queue and can be queried with `olGetQueueError`. In addition, any other queues that are waiting on the errored queue will also enter the error state. With this design, both `olSyncEvent` and `olSyncQueue` now exit early on error. More specifically, unless a kernel gets trapped in an infinite loop, both sync functions will always return in a finite amount of time.
1 parent 8687ef7 commit 4888933

File tree

4 files changed

+39
-4
lines changed

4 files changed

+39
-4
lines changed

offload/liboffload/API/Common.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ def ol_errc_t : Enum {
9090
Etor<"COMPILE_FAILURE", "jit compile failure while processing binary image">,
9191
Etor<"LINK_FAILURE", "linker failure while processing binary image">,
9292
Etor<"BACKEND_FAILURE", "the plugin backend is in an invalid or unsupported state">,
93+
Etor<"QUEUE_ERROR", "the queue entered an error state">,
9394
Etor<"UNINITIALIZED", "not initialized">,
9495

9596
// Handle related errors - only makes sense for liboffload

offload/liboffload/API/Event.td

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,15 @@ def olDestroyEvent : Function {
3333

3434
def olSyncEvent : Function {
3535
let desc = "Block the calling thread until the event is complete.";
36-
let details = [];
36+
let details = [
37+
"If the queue or any dependencies encounter an error, this returns early and no work after the error will be complete."
38+
];
3739
let params = [
3840
Param<"ol_event_handle_t", "Event", "handle of the event", PARAM_IN>
3941
];
40-
let returns = [];
42+
let returns = [
43+
Return<"OL_ERRC_QUEUE_ERROR", ["The queue associated with this event or any of its dependencies encountered an error"]>,
44+
];
4145
}
4246

4347
def ol_event_info_t : Enum {

offload/liboffload/API/Queue.td

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,18 +33,23 @@ def olDestroyQueue : Function {
3333

3434
def olSyncQueue : Function {
3535
let desc = "Block the calling thread until the enqueued work on a queue is complete.";
36-
let details = [];
36+
let details = [
37+
"If the queue or any dependencies encounter an error, this returns early and no work after the error will be complete."
38+
];
3739
let params = [
3840
Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>
3941
];
40-
let returns = [];
42+
let returns = [
43+
Return<"OL_ERRC_QUEUE_ERROR", ["The queue or any of its dependencies encountered an error"]>,
44+
];
4145
}
4246

4347
def olWaitEvents : Function {
4448
let desc = "Make any future work submitted to this queue wait until the provided events are complete.";
4549
let details = [
4650
"All events in `Events` must complete before the queue is unblocked.",
4751
"The input events can be from any queue on any device provided by the same platform as `Queue`.",
52+
"If the queue of an event in `Events` is different from `Queue`, a dependency is created. If that event's queue enters the error state, then `Queue` will also enter the error state.",
4853
];
4954
let params = [
5055
Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
@@ -125,3 +130,21 @@ def olLaunchHostFunction : Function {
125130
];
126131
let returns = [];
127132
}
133+
134+
def olGetQueueError : Function {
135+
let desc = "Gets the error from a queue or any of its dependencies in the error state.";
136+
let details = [
137+
"If the queue is not in the error state, OL_SUCCESS is written",
138+
"Dependencies are created using `olWaitEvents`; if any waited-on queue enters the error state, then this queue will also be in the error state",
139+
"The error is not cleared; there is no way to recover a queue in the error state",
140+
];
141+
let params = [
142+
Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
143+
Param<"ol_result_t *", "Error", "output location to put the queue error", PARAM_IN>,
144+
Param<"ol_queue_handle_t *", "FailingQueue", "output location to put the queue that encountered an error", PARAM_IN_OPTIONAL>,
145+
146+
];
147+
let returns = [
148+
Return<"OL_ERRC_INVALID_QUEUE">
149+
];
150+
}

offload/liboffload/src/OffloadImpl.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,13 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName,
644644
return olGetQueueInfoImplDetail(Queue, PropName, 0, nullptr, PropSizeRet);
645645
}
646646

647+
Error olGetQueueError_impl(ol_queue_handle_t Queue, ol_result_t *Error,
648+
ol_queue_handle_t *ErrQueue) {
649+
// TODO
650+
*Error = nullptr;
651+
return Error::success();
652+
}
653+
647654
Error olSyncEvent_impl(ol_event_handle_t Event) {
648655
// No event info means that this event was complete on creation
649656
if (!Event->EventInfo)

0 commit comments

Comments
 (0)