Skip to content

Commit 6fd6098

Browse files
[SYCL] Enable post-enqueue execution graph cleanup (#5070)
This patch contains the initial implementation of post-enqueue cleanup of command nodes. This is primarily motivated by significant overhead of post-wait cleanup in queue::wait when lowered to piQueueFinish, which is due to the fact that we cannot alternate between waiting for singular events and cleaning up their commands while other tasks are still executing on device. Post-enqueue cleanup is performed for enqueued non-leaf nodes, so it can be triggered by the enqueue process or by removing an enqueued node from leaves. There are multiple exceptions in the initial implementation: host tasks (currently cannot be cleaned up after enqueue), kernels with streams (stream handling is tied to finished command cleanup) and CGs without dependencies (deleted in addCG like before for now). Because of this, finished command cleanup is still triggered in event::wait() unconditionally and in queue::wait() for host tasks and kernels with streams. In addition, this patch removes queue::wait workarounds for Level Zero that were required to bypass finished command cleanup overhead.
1 parent ec97c57 commit 6fd6098

22 files changed

+766
-185
lines changed

sycl/doc/EnvironmentVariables.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ variables in production code.</span>
9292
| `SYCL_DEVICELIB_NO_FALLBACK` | Any(\*) | Disable loading and linking of device library images |
9393
| `SYCL_PRINT_EXECUTION_GRAPH` | Described [below](#sycl_print_execution_graph-options) | Print execution graph to DOT text file. |
9494
| `SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP` | Any(\*) | Disable cleanup of finished command nodes at host-device synchronization points. |
95+
| `SYCL_DISABLE_POST_ENQUEUE_CLEANUP` | Any(\*) | Disable cleanup of enqueued command nodes during submission. |
9596
| `SYCL_THROW_ON_BLOCK` | Any(\*) | Throw an exception on attempt to wait for a blocked command. |
9697
| `SYCL_DEVICELIB_INHIBIT_NATIVE` | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. |
9798
| `SYCL_PROGRAM_COMPILE_OPTIONS` | String of valid OpenCL compile options | Override compile options for all programs. |

sycl/include/CL/sycl/detail/cg.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,7 @@ class CGExecKernel : public CG {
290290
}
291291

292292
void clearStreams() { MStreams.clear(); }
293+
bool hasStreams() { return !MStreams.empty(); }
293294
};
294295

295296
/// "Copy memory" command group class.

sycl/source/detail/config.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
CONFIG(SYCL_PRINT_EXECUTION_GRAPH, 32, __SYCL_PRINT_EXECUTION_GRAPH)
1414
CONFIG(SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP, 1, __SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP)
15+
CONFIG(SYCL_DISABLE_POST_ENQUEUE_CLEANUP, 1, __SYCL_DISABLE_POST_ENQUEUE_CLEANUP)
1516
CONFIG(SYCL_DEVICE_ALLOWLIST, 1024, __SYCL_DEVICE_ALLOWLIST)
1617
CONFIG(SYCL_BE, 16, __SYCL_BE)
1718
CONFIG(SYCL_PI_TRACE, 16, __SYCL_PI_TRACE)

sycl/source/detail/device_image_impl.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,14 @@ class device_image_impl {
186186
std::lock_guard<std::mutex> Lock{MSpecConstAccessMtx};
187187
if (nullptr == MSpecConstsBuffer && !MSpecConstsBlob.empty()) {
188188
const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin();
189+
// Uses PI_MEM_FLAGS_HOST_PTR_COPY instead of PI_MEM_FLAGS_HOST_PTR_USE
190+
// since post-enqueue cleanup might trigger destruction of
191+
// device_image_impl and, as a result, destruction of MSpecConstsBlob
192+
// while MSpecConstsBuffer is still in use.
193+
// TODO consider changing the lifetime of device_image_impl instead
189194
memBufferCreateHelper(Plugin,
190195
detail::getSyclObjImpl(MContext)->getHandleRef(),
191-
PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_USE,
196+
PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY,
192197
MSpecConstsBlob.size(), MSpecConstsBlob.data(),
193198
&MSpecConstsBuffer, nullptr);
194199
}

sycl/source/detail/event_impl.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,6 @@ std::vector<EventImplPtr> event_impl::getWaitList() {
364364
}
365365

366366
void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) {
367-
assert(MEvent != nullptr);
368367
if (MIsFlushed)
369368
return;
370369

@@ -379,6 +378,7 @@ void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) {
379378
return;
380379

381380
// Check if the task for this event has already been submitted.
381+
assert(MEvent != nullptr);
382382
pi_event_status Status = PI_EVENT_QUEUED;
383383
getPlugin().call<PiApiKind::piEventGetInfo>(
384384
MEvent, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(pi_int32), &Status,

sycl/source/detail/event_impl.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,11 @@ class event_impl {
201201
/// \return true if this event is discarded.
202202
bool isDiscarded() const { return MState == HES_Discarded; }
203203

204+
void setNeedsCleanupAfterWait(bool NeedsCleanupAfterWait) {
205+
MNeedsCleanupAfterWait = NeedsCleanupAfterWait;
206+
}
207+
bool needsCleanupAfterWait() { return MNeedsCleanupAfterWait; }
208+
204209
private:
205210
// When instrumentation is enabled emits trace event for event wait begin and
206211
// returns the telemetry event generated for the wait
@@ -231,6 +236,12 @@ class event_impl {
231236
// HostEventState enum.
232237
std::atomic<int> MState;
233238

239+
// A temporary workaround for the current limitations of post enqueue graph
240+
// cleanup. Indicates that the command associated with this event isn't
241+
// handled by post enqueue cleanup yet and has to be deleted by cleanup after
242+
// wait.
243+
bool MNeedsCleanupAfterWait = false;
244+
234245
std::mutex MMutex;
235246
};
236247

sycl/source/detail/queue_impl.cpp

Lines changed: 37 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,7 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
7474

7575
event ResEvent = prepareUSMEvent(Self, NativeEvent);
7676
// Track only if we won't be able to handle it with piQueueFinish.
77-
// FIXME these events are stored for level zero until as a workaround, remove
78-
// once piEventRelease no longer calls wait on the event in the plugin.
79-
if (!MSupportOOO ||
80-
getPlugin().getBackend() == backend::ext_oneapi_level_zero)
77+
if (!MSupportOOO)
8178
addSharedEvent(ResEvent);
8279
return MDiscardEvents ? createDiscardedEvent() : ResEvent;
8380
}
@@ -99,10 +96,7 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
9996

10097
event ResEvent = prepareUSMEvent(Self, NativeEvent);
10198
// Track only if we won't be able to handle it with piQueueFinish.
102-
// FIXME these events are stored for level zero until as a workaround, remove
103-
// once piEventRelease no longer calls wait on the event in the plugin.
104-
if (!MSupportOOO ||
105-
getPlugin().getBackend() == backend::ext_oneapi_level_zero)
99+
if (!MSupportOOO)
106100
addSharedEvent(ResEvent);
107101
return MDiscardEvents ? createDiscardedEvent() : ResEvent;
108102
}
@@ -125,29 +119,30 @@ event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
125119

126120
event ResEvent = prepareUSMEvent(Self, NativeEvent);
127121
// Track only if we won't be able to handle it with piQueueFinish.
128-
// FIXME these events are stored for level zero until as a workaround, remove
129-
// once piEventRelease no longer calls wait on the event in the plugin.
130-
if (!MSupportOOO ||
131-
getPlugin().getBackend() == backend::ext_oneapi_level_zero)
122+
if (!MSupportOOO)
132123
addSharedEvent(ResEvent);
133124
return MDiscardEvents ? createDiscardedEvent() : ResEvent;
134125
}
135126

136127
void queue_impl::addEvent(const event &Event) {
137-
EventImplPtr Eimpl = getSyclObjImpl(Event);
138-
Command *Cmd = (Command *)(Eimpl->getCommand());
128+
EventImplPtr EImpl = getSyclObjImpl(Event);
129+
auto *Cmd = static_cast<Command *>(EImpl->getCommand());
139130
if (!Cmd) {
140131
// if there is no command on the event, we cannot track it with MEventsWeak
141132
// as that will leave it with no owner. Track in MEventsShared only if we're
142133
// unable to call piQueueFinish during wait.
143-
// FIXME these events are stored for level zero until as a workaround,
144-
// remove once piEventRelease no longer calls wait on the event in the
145-
// plugin.
146-
if (is_host() || !MSupportOOO ||
147-
getPlugin().getBackend() == backend::ext_oneapi_level_zero)
134+
if (is_host() || !MSupportOOO)
148135
addSharedEvent(Event);
149-
} else {
150-
std::weak_ptr<event_impl> EventWeakPtr{Eimpl};
136+
}
137+
// As long as the queue supports piQueueFinish we only need to store events
138+
// with command nodes in the following cases:
139+
// 1. Unenqueued commands, since they aren't covered by piQueueFinish.
140+
// 2. Kernels with streams, since they are not supported by post enqueue
141+
// cleanup.
142+
// 3. Host tasks, for both reasons.
143+
else if (is_host() || !MSupportOOO || EImpl->getHandleRef() == nullptr ||
144+
EImpl->needsCleanupAfterWait()) {
145+
std::weak_ptr<event_impl> EventWeakPtr{EImpl};
151146
std::lock_guard<std::mutex> Lock{MMutex};
152147
MEventsWeak.push_back(std::move(EventWeakPtr));
153148
}
@@ -157,10 +152,7 @@ void queue_impl::addEvent(const event &Event) {
157152
/// but some events have no other owner. In this case,
158153
/// addSharedEvent will have the queue track the events via a shared pointer.
159154
void queue_impl::addSharedEvent(const event &Event) {
160-
// FIXME The assertion should be corrected once the Level Zero workaround is
161-
// removed.
162-
assert(is_host() || !MSupportOOO ||
163-
getPlugin().getBackend() == backend::ext_oneapi_level_zero);
155+
assert(is_host() || !MSupportOOO);
164156
std::lock_guard<std::mutex> Lock(MMutex);
165157
// Events stored in MEventsShared are not released anywhere else aside from
166158
// calls to queue::wait/wait_and_throw, which a user application might not
@@ -293,50 +285,31 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
293285
// directly. Otherwise, only wait for unenqueued or host task events, starting
294286
// from the latest submitted task in order to minimize total amount of calls,
295287
// then handle the rest with piQueueFinish.
296-
// TODO the new workflow has worse performance with Level Zero, keep the old
297-
// behavior until this is addressed
298-
if (!is_host() &&
299-
getPlugin().getBackend() == backend::ext_oneapi_level_zero) {
288+
const bool SupportsPiFinish = !is_host() && MSupportOOO;
289+
for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
290+
EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
291+
if (std::shared_ptr<event_impl> EventImplSharedPtr =
292+
EventImplWeakPtrIt->lock()) {
293+
// A nullptr PI event indicates that piQueueFinish will not cover it,
294+
// either because it's a host task event or an unenqueued one.
295+
if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) {
296+
EventImplSharedPtr->wait(EventImplSharedPtr);
297+
}
298+
}
299+
}
300+
if (SupportsPiFinish) {
301+
const detail::plugin &Plugin = getPlugin();
302+
Plugin.call<detail::PiApiKind::piQueueFinish>(getHandleRef());
300303
for (std::weak_ptr<event_impl> &EventImplWeakPtr : WeakEvents)
301304
if (std::shared_ptr<event_impl> EventImplSharedPtr =
302305
EventImplWeakPtr.lock())
303-
EventImplSharedPtr->wait(EventImplSharedPtr);
306+
if (EventImplSharedPtr->needsCleanupAfterWait())
307+
EventImplSharedPtr->cleanupCommand(EventImplSharedPtr);
308+
assert(SharedEvents.empty() && "Queues that support calling piQueueFinish "
309+
"shouldn't have shared events");
310+
} else {
304311
for (event &Event : SharedEvents)
305312
Event.wait();
306-
} else {
307-
bool SupportsPiFinish = !is_host() && MSupportOOO;
308-
for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
309-
EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
310-
if (std::shared_ptr<event_impl> EventImplSharedPtr =
311-
EventImplWeakPtrIt->lock()) {
312-
// A nullptr PI event indicates that piQueueFinish will not cover it,
313-
// either because it's a host task event or an unenqueued one.
314-
if (!SupportsPiFinish ||
315-
nullptr == EventImplSharedPtr->getHandleRef()) {
316-
EventImplSharedPtr->wait(EventImplSharedPtr);
317-
}
318-
}
319-
}
320-
if (SupportsPiFinish) {
321-
const detail::plugin &Plugin = getPlugin();
322-
Plugin.call<detail::PiApiKind::piQueueFinish>(getHandleRef());
323-
for (std::weak_ptr<event_impl> &EventImplWeakPtr : WeakEvents)
324-
if (std::shared_ptr<event_impl> EventImplSharedPtr =
325-
EventImplWeakPtr.lock())
326-
EventImplSharedPtr->cleanupCommand(EventImplSharedPtr);
327-
// FIXME these events are stored for level zero until as a workaround,
328-
// remove once piEventRelease no longer calls wait on the event in the
329-
// plugin.
330-
if (Plugin.getBackend() == backend::ext_oneapi_level_zero) {
331-
SharedEvents.clear();
332-
}
333-
assert(SharedEvents.empty() &&
334-
"Queues that support calling piQueueFinish "
335-
"shouldn't have shared events");
336-
} else {
337-
for (event &Event : SharedEvents)
338-
Event.wait();
339-
}
340313
}
341314
#ifdef XPTI_ENABLE_INSTRUMENTATION
342315
instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);

0 commit comments

Comments
 (0)