Skip to content

Commit 5dc81c5

Browse files
committed
Add support for handler-less graph submission
1 parent 6fcb77d commit 5dc81c5

File tree

11 files changed

+340
-71
lines changed

11 files changed

+340
-71
lines changed

sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,8 @@ inline void execute_graph(handler &CGH,
450450
inline void execute_graph(queue Q, command_graph<graph_state::executable> &G,
451451
const sycl::detail::code_location &CodeLoc =
452452
sycl::detail::code_location::current()) {
453-
submit(std::move(Q), [&](handler &CGH) { execute_graph(CGH, G); }, CodeLoc);
453+
submit_graph_direct_without_event_impl(std::move(Q), G, /*DepEvents*/ {},
454+
CodeLoc);
454455
}
455456

456457
} // namespace ext::oneapi::experimental

sycl/include/sycl/queue.hpp

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,20 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
8282
const detail::KernelPropertyHolderStructTy &Props,
8383
const detail::code_location &CodeLoc, bool IsTopCodeLoc);
8484

85+
event __SYCL_EXPORT submit_graph_direct_with_event_impl(
86+
const queue &Queue,
87+
ext::oneapi::experimental::command_graph<
88+
ext::oneapi::experimental::graph_state::executable> &G,
89+
sycl::span<const event> DepEvents,
90+
const detail::code_location &CodeLoc = detail::code_location::current());
91+
92+
void __SYCL_EXPORT submit_graph_direct_without_event_impl(
93+
const queue &Queue,
94+
ext::oneapi::experimental::command_graph<
95+
ext::oneapi::experimental::graph_state::executable> &G,
96+
sycl::span<const event> DepEvents,
97+
const detail::code_location &CodeLoc = detail::code_location::current());
98+
8599
namespace detail {
86100
class queue_impl;
87101

@@ -3706,7 +3720,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
37063720
ext::oneapi::experimental::graph_state::executable>
37073721
Graph,
37083722
const detail::code_location &CodeLoc = detail::code_location::current()) {
3709-
return submit([&](handler &CGH) { CGH.ext_oneapi_graph(Graph); }, CodeLoc);
3723+
return submit_graph_direct_with_event_impl(*this, Graph, /*DepEvents*/ {},
3724+
CodeLoc);
37103725
}
37113726

37123727
/// Shortcut for executing a graph of commands with a single dependency.
@@ -3721,12 +3736,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
37213736
Graph,
37223737
event DepEvent,
37233738
const detail::code_location &CodeLoc = detail::code_location::current()) {
3724-
return submit(
3725-
[&](handler &CGH) {
3726-
CGH.depends_on(DepEvent);
3727-
CGH.ext_oneapi_graph(Graph);
3728-
},
3729-
CodeLoc);
3739+
return submit_graph_direct_with_event_impl(
3740+
*this, Graph, sycl::span<const event>(&DepEvent, 1), CodeLoc);
37303741
}
37313742

37323743
/// Shortcut for executing a graph of commands with multiple dependencies.
@@ -3741,12 +3752,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
37413752
Graph,
37423753
const std::vector<event> &DepEvents,
37433754
const detail::code_location &CodeLoc = detail::code_location::current()) {
3744-
return submit(
3745-
[&](handler &CGH) {
3746-
CGH.depends_on(DepEvents);
3747-
CGH.ext_oneapi_graph(Graph);
3748-
},
3749-
CodeLoc);
3755+
return submit_graph_direct_with_event_impl(*this, Graph, DepEvents,
3756+
CodeLoc);
37503757
}
37513758

37523759
/// Provides a hint to the runtime that previously issued commands to this

sycl/source/detail/graph/graph_impl.cpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,7 +1204,7 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
12041204
return SignalEvent;
12051205
}
12061206

1207-
EventImplPtr
1207+
std::pair<EventImplPtr, bool>
12081208
exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
12091209
sycl::detail::CG::StorageInitHelper CGData,
12101210
bool EventNeeded) {
@@ -1213,19 +1213,17 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
12131213
cleanupExecutionEvents(MSchedulerDependencies);
12141214
CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(),
12151215
MSchedulerDependencies.end());
1216-
12171216
bool IsCGDataSafeForSchedulerBypass =
12181217
detail::Scheduler::areEventsSafeForSchedulerBypass(
12191218
CGData.MEvents, Queue.getContextImpl()) &&
12201219
CGData.MRequirements.empty();
1220+
bool SkipScheduler = IsCGDataSafeForSchedulerBypass && !MContainsHostTask;
12211221

12221222
// This variable represents the returned event. It will always be nullptr if
12231223
// EventNeeded is false.
12241224
EventImplPtr SignalEvent;
1225-
12261225
if (!MContainsHostTask) {
1227-
bool SkipScheduler =
1228-
IsCGDataSafeForSchedulerBypass && MPartitions[0]->MRequirements.empty();
1226+
SkipScheduler = SkipScheduler && MPartitions[0]->MRequirements.empty();
12291227
if (SkipScheduler) {
12301228
SignalEvent = enqueuePartitionDirectly(MPartitions[0], Queue,
12311229
CGData.MEvents, EventNeeded);
@@ -1258,7 +1256,7 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
12581256
SignalEvent->setProfilingEnabled(MEnableProfiling);
12591257
}
12601258

1261-
return SignalEvent;
1259+
return {SignalEvent, SkipScheduler};
12621260
}
12631261

12641262
void exec_graph_impl::duplicateNodes() {

sycl/source/detail/graph/graph_impl.hpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -640,11 +640,14 @@ class exec_graph_impl {
640640
/// @param CGData Command-group data provided by the sycl::handler
641641
/// @param EventNeeded Whether an event signalling the completion of this
642642
/// operation needs to be returned.
643-
/// @return Returns an event if EventNeeded is true. Returns nullptr
644-
/// otherwise.
645-
EventImplPtr enqueue(sycl::detail::queue_impl &Queue,
646-
sycl::detail::CG::StorageInitHelper CGData,
647-
bool EventNeeded);
643+
/// @return Returns a pair of an event and a boolean indicating whether the
644+
/// scheduler was bypassed. If an event is required, then the first element of
645+
/// the pair is the event representing the execution of the graph. If no event
646+
/// is required, the first element is nullptr. The second element is true if
647+
/// the scheduler was bypassed, false otherwise.
648+
std::pair<EventImplPtr, bool>
649+
enqueue(sycl::detail::queue_impl &Queue,
650+
sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);
648651

649652
/// Iterates through all the nodes in the graph to build the list of
650653
/// accessor requirements for the whole graph and for each partition.

sycl/source/detail/queue_impl.cpp

Lines changed: 101 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,16 @@ void queue_impl::addEvent(const detail::EventImplPtr &EventImpl) {
326326
}
327327
}
328328

329+
void queue_impl::addEventUnlocked(const detail::EventImplPtr &EventImpl) {
330+
if (!EventImpl)
331+
return;
332+
Command *Cmd = EventImpl->getCommand();
333+
if (Cmd != nullptr && EventImpl->getHandle() == nullptr) {
334+
std::weak_ptr<event_impl> EventWeakPtr{EventImpl};
335+
MEventsWeak.push_back(std::move(EventWeakPtr));
336+
}
337+
}
338+
329339
detail::EventImplPtr
330340
queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
331341
bool CallerNeedsEvent, const detail::code_location &Loc,
@@ -574,16 +584,23 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
574584
KData.validateAndSetKernelLaunchProperties(Props, hasCommandGraph(),
575585
getDeviceImpl());
576586

577-
auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
578-
bool SchedulerBypass) -> EventImplPtr {
587+
auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData)
588+
-> std::pair<EventImplPtr, bool> {
589+
bool SchedulerBypass =
590+
(CGData.MEvents.size() > 0
591+
? detail::Scheduler::areEventsSafeForSchedulerBypass(
592+
CGData.MEvents, getContextImpl())
593+
: true) &&
594+
!hasCommandGraph();
579595
if (SchedulerBypass) {
580596
// No need to copy/move the kernel function, so we set
581597
// the function pointer to the original function
582598
KData.setKernelFunc(HostKernel.getPtr());
583599

584-
return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
585-
CallerNeedsEvent, nullptr, nullptr,
586-
CodeLoc, IsTopCodeLoc);
600+
return {submit_kernel_scheduler_bypass(KData, CGData.MEvents,
601+
CallerNeedsEvent, nullptr, nullptr,
602+
CodeLoc, IsTopCodeLoc),
603+
SchedulerBypass};
587604
}
588605
std::unique_ptr<detail::CG> CommandGroup;
589606
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
@@ -611,24 +628,63 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
611628
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
612629

613630
if (auto GraphImpl = getCommandGraph(); GraphImpl) {
614-
return submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
615-
detail::CGType::Kernel);
631+
return {submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
632+
detail::CGType::Kernel),
633+
SchedulerBypass};
616634
}
617635

618-
return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
619-
*this, true);
636+
return {detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
637+
*this, true),
638+
SchedulerBypass};
620639
};
621640

622-
return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc);
641+
return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc,
642+
detail::CGType::Kernel,
643+
/*CommandFuncContainsHostTask*/ false);
644+
}
645+
646+
EventImplPtr queue_impl::submit_graph_direct_impl(
647+
std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
648+
ExecGraph,
649+
bool CallerNeedsEvent, sycl::span<const event> DepEvents,
650+
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
651+
bool EventNeeded = CallerNeedsEvent || ExecGraph->containsHostTask() ||
652+
!supportsDiscardingPiEvents();
653+
auto SubmitGraphFunc = [&](detail::CG::StorageInitHelper CGData)
654+
-> std::pair<EventImplPtr, bool> {
655+
if (auto ParentGraph = getCommandGraph(); ParentGraph) {
656+
std::unique_ptr<detail::CG> CommandGroup;
657+
{
658+
ext::oneapi::experimental::detail::graph_impl::ReadLock ParentLock(
659+
ParentGraph->MMutex);
660+
CGData.MRequirements = ExecGraph->getRequirements();
661+
// Here we are using the CommandGroup without passing a CommandBuffer to
662+
// pass the exec_graph_impl and event dependencies. Since this subgraph
663+
// CG will not be executed this is fine.
664+
CommandGroup.reset(
665+
new sycl::detail::CGExecCommandBuffer(nullptr, ExecGraph, CGData));
666+
}
667+
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
668+
return {submit_command_to_graph(*ParentGraph, std::move(CommandGroup),
669+
detail::CGType::ExecCommandBuffer),
670+
/*BypassScheduler*/ false};
671+
} else {
672+
return ExecGraph->enqueue(*this, CGData, EventNeeded);
673+
}
674+
};
675+
return submit_direct(CallerNeedsEvent, DepEvents, SubmitGraphFunc,
676+
detail::CGType::ExecCommandBuffer,
677+
ExecGraph->containsHostTask());
623678
}
624679

625680
template <typename SubmitCommandFuncType>
626-
detail::EventImplPtr
627-
queue_impl::submit_direct(bool CallerNeedsEvent,
628-
sycl::span<const event> DepEvents,
629-
SubmitCommandFuncType &SubmitCommandFunc) {
681+
detail::EventImplPtr queue_impl::submit_direct(
682+
bool CallerNeedsEvent, sycl::span<const event> DepEvents,
683+
SubmitCommandFuncType &SubmitCommandFunc, detail::CGType Type,
684+
bool CommandFuncContainsHostTask) {
630685
detail::CG::StorageInitHelper CGData;
631686
std::unique_lock<std::mutex> Lock(MMutex);
687+
const bool inOrder = isInOrder();
632688

633689
// Used by queue_empty() and getLastEvent()
634690
MEmpty.store(false, std::memory_order_release);
@@ -639,29 +695,35 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
639695
registerEventDependency</*LockQueue*/ false>(
640696
getSyclObjImpl(*ExternalEvent), CGData.MEvents, this, getContextImpl(),
641697
getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
642-
detail::CGType::Kernel);
698+
Type);
643699
}
644700

645701
auto &Deps = hasCommandGraph() ? MExtGraphDeps : MDefaultGraphDeps;
646702

647703
// Sync with the last event for in order queue
648704
EventImplPtr &LastEvent = Deps.LastEventPtr;
649-
if (isInOrder() && LastEvent) {
705+
if (inOrder && LastEvent) {
650706
registerEventDependency</*LockQueue*/ false>(
651707
LastEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
652-
hasCommandGraph() ? getCommandGraph().get() : nullptr,
653-
detail::CGType::Kernel);
708+
hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
709+
} else if (inOrder && MNoLastEventMode && CommandFuncContainsHostTask) {
710+
// If we have a host task in an in-order queue with no last event mode, then
711+
// we must add a barrier to ensure ordering.
712+
auto ResEvent = insertHelperBarrier();
713+
registerEventDependency</*LockQueue*/ false>(
714+
ResEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
715+
hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
654716
}
655717

656718
for (event e : DepEvents) {
657719
registerEventDependency</*LockQueue*/ false>(
658720
getSyclObjImpl(e), CGData.MEvents, this, getContextImpl(),
659721
getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
660-
detail::CGType::Kernel);
722+
Type);
661723
}
662724

663725
// Barrier and un-enqueued commands synchronization for out or order queue
664-
if (!isInOrder()) {
726+
if (!inOrder) {
665727
MMissedCleanupRequests.unset(
666728
[&](MissedCleanupRequestsType &MissedCleanupRequests) {
667729
for (auto &UpdatedGraph : MissedCleanupRequests)
@@ -674,31 +736,27 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
674736
}
675737
}
676738

677-
bool SchedulerBypass =
678-
(CGData.MEvents.size() > 0
679-
? detail::Scheduler::areEventsSafeForSchedulerBypass(
680-
CGData.MEvents, getContextImpl())
681-
: true) &&
682-
!hasCommandGraph();
739+
auto [EventImpl, SchedulerBypass] = SubmitCommandFunc(CGData);
683740

684741
// Synchronize with the "no last event mode", used by the handler-based
685742
// kernel submit path
686-
MNoLastEventMode.store(isInOrder() && SchedulerBypass,
687-
std::memory_order_relaxed);
688-
689-
EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);
743+
MNoLastEventMode.store(inOrder && SchedulerBypass, std::memory_order_relaxed);
690744

691745
// Sync with the last event for in order queue. For scheduler-bypass flow,
692746
// the ordering is done at the layers below the SYCL runtime,
693747
// but for the scheduler-based flow, it needs to be done here, as the
694748
// scheduler handles host task submissions.
695-
if (isInOrder()) {
749+
if (inOrder) {
696750
LastEvent = SchedulerBypass ? nullptr : EventImpl;
697751
}
698752

699-
// Barrier and un-enqueued commands synchronization for out or order queue
700-
if (!isInOrder() && !EventImpl->isEnqueued()) {
701-
Deps.UnenqueuedCmdEvents.push_back(EventImpl);
753+
// Barrier and un-enqueued commands synchronization for out or order queue.
754+
// The event must also be stored for future wait calls.
755+
if (!inOrder) {
756+
if (!EventImpl->isEnqueued()) {
757+
Deps.UnenqueuedCmdEvents.push_back(EventImpl);
758+
}
759+
addEventUnlocked(EventImpl);
702760
}
703761

704762
return CallerNeedsEvent ? EventImpl : nullptr;
@@ -1149,6 +1207,15 @@ void queue_impl::verifyProps(const property_list &Props) const {
11491207
CheckPropertiesWithData);
11501208
}
11511209

1210+
EventImplPtr queue_impl::insertHelperBarrier() {
1211+
auto ResEvent = detail::event_impl::create_device_event(*this);
1212+
ur_event_handle_t UREvent = nullptr;
1213+
getAdapter().call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
1214+
getHandleRef(), 0, nullptr, &UREvent);
1215+
ResEvent->setHandle(UREvent);
1216+
return ResEvent;
1217+
}
1218+
11521219
} // namespace detail
11531220
} // namespace _V1
11541221
} // namespace sycl

0 commit comments

Comments
 (0)