Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,8 @@ inline void execute_graph(handler &CGH,
inline void execute_graph(queue Q, command_graph<graph_state::executable> &G,
const sycl::detail::code_location &CodeLoc =
sycl::detail::code_location::current()) {
submit(std::move(Q), [&](handler &CGH) { execute_graph(CGH, G); }, CodeLoc);
submit_graph_direct_without_event_impl(std::move(Q), G, /*DepEvents*/ {},
CodeLoc);
}

} // namespace ext::oneapi::experimental
Expand Down
33 changes: 20 additions & 13 deletions sycl/include/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,20 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
const detail::KernelPropertyHolderStructTy &Props,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

event __SYCL_EXPORT submit_graph_direct_with_event_impl(
const queue &Queue,
ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::executable> &G,
sycl::span<const event> DepEvents,
const detail::code_location &CodeLoc = detail::code_location::current());

void __SYCL_EXPORT submit_graph_direct_without_event_impl(
const queue &Queue,
ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::executable> &G,
sycl::span<const event> DepEvents,
const detail::code_location &CodeLoc = detail::code_location::current());

namespace detail {
class queue_impl;

Expand Down Expand Up @@ -3706,7 +3720,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
ext::oneapi::experimental::graph_state::executable>
Graph,
const detail::code_location &CodeLoc = detail::code_location::current()) {
return submit([&](handler &CGH) { CGH.ext_oneapi_graph(Graph); }, CodeLoc);
return submit_graph_direct_with_event_impl(*this, Graph, /*DepEvents*/ {},
CodeLoc);
}

/// Shortcut for executing a graph of commands with a single dependency.
Expand All @@ -3721,12 +3736,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
Graph,
event DepEvent,
const detail::code_location &CodeLoc = detail::code_location::current()) {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.ext_oneapi_graph(Graph);
},
CodeLoc);
return submit_graph_direct_with_event_impl(
*this, Graph, sycl::span<const event>(&DepEvent, 1), CodeLoc);
}

/// Shortcut for executing a graph of commands with multiple dependencies.
Expand All @@ -3741,12 +3752,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
Graph,
const std::vector<event> &DepEvents,
const detail::code_location &CodeLoc = detail::code_location::current()) {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.ext_oneapi_graph(Graph);
},
CodeLoc);
return submit_graph_direct_with_event_impl(*this, Graph, DepEvents,
CodeLoc);
}

/// Provides a hint to the runtime that previously issued commands to this
Expand Down
10 changes: 4 additions & 6 deletions sycl/source/detail/graph/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1204,7 +1204,7 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
return SignalEvent;
}

EventImplPtr
std::pair<EventImplPtr, bool>
exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper CGData,
bool EventNeeded) {
Expand All @@ -1213,19 +1213,17 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
cleanupExecutionEvents(MSchedulerDependencies);
CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(),
MSchedulerDependencies.end());

bool IsCGDataSafeForSchedulerBypass =
detail::Scheduler::areEventsSafeForSchedulerBypass(
CGData.MEvents, Queue.getContextImpl()) &&
CGData.MRequirements.empty();
bool SkipScheduler = IsCGDataSafeForSchedulerBypass && !MContainsHostTask;

// This variable represents the returned event. It will always be nullptr if
// EventNeeded is false.
EventImplPtr SignalEvent;

if (!MContainsHostTask) {
bool SkipScheduler =
IsCGDataSafeForSchedulerBypass && MPartitions[0]->MRequirements.empty();
SkipScheduler = SkipScheduler && MPartitions[0]->MRequirements.empty();
if (SkipScheduler) {
SignalEvent = enqueuePartitionDirectly(MPartitions[0], Queue,
CGData.MEvents, EventNeeded);
Expand Down Expand Up @@ -1258,7 +1256,7 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
SignalEvent->setProfilingEnabled(MEnableProfiling);
}

return SignalEvent;
return {SignalEvent, SkipScheduler};
}

void exec_graph_impl::duplicateNodes() {
Expand Down
13 changes: 8 additions & 5 deletions sycl/source/detail/graph/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -640,11 +640,14 @@ class exec_graph_impl {
/// @param CGData Command-group data provided by the sycl::handler
/// @param EventNeeded Whether an event signalling the completion of this
/// operation needs to be returned.
/// @return Returns an event if EventNeeded is true. Returns nullptr
/// otherwise.
EventImplPtr enqueue(sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper CGData,
bool EventNeeded);
/// @return Returns a pair of an event and a boolean indicating whether the
/// scheduler was bypassed. If an event is required, then the first element of
/// the pair is the event representing the execution of the graph. If no event
/// is required, the first element is nullptr. The second element is true if
/// the scheduler was bypassed, false otherwise.
std::pair<EventImplPtr, bool>
enqueue(sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);

/// Iterates through all the nodes in the graph to build the list of
/// accessor requirements for the whole graph and for each partition.
Expand Down
135 changes: 101 additions & 34 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,16 @@ void queue_impl::addEvent(const detail::EventImplPtr &EventImpl) {
}
}

void queue_impl::addEventUnlocked(const detail::EventImplPtr &EventImpl) {
if (!EventImpl)
return;
Command *Cmd = EventImpl->getCommand();
if (Cmd != nullptr && EventImpl->getHandle() == nullptr) {
std::weak_ptr<event_impl> EventWeakPtr{EventImpl};
MEventsWeak.push_back(std::move(EventWeakPtr));
}
}

detail::EventImplPtr
queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
bool CallerNeedsEvent, const detail::code_location &Loc,
Expand Down Expand Up @@ -574,16 +584,23 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
KData.validateAndSetKernelLaunchProperties(Props, hasCommandGraph(),
getDeviceImpl());

auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
bool SchedulerBypass) -> EventImplPtr {
auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData)
-> std::pair<EventImplPtr, bool> {
bool SchedulerBypass =
(CGData.MEvents.size() > 0
? detail::Scheduler::areEventsSafeForSchedulerBypass(
CGData.MEvents, getContextImpl())
: true) &&
!hasCommandGraph();
if (SchedulerBypass) {
// No need to copy/move the kernel function, so we set
// the function pointer to the original function
KData.setKernelFunc(HostKernel.getPtr());

return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
CallerNeedsEvent, nullptr, nullptr,
CodeLoc, IsTopCodeLoc);
return {submit_kernel_scheduler_bypass(KData, CGData.MEvents,
CallerNeedsEvent, nullptr, nullptr,
CodeLoc, IsTopCodeLoc),
SchedulerBypass};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be cleaner to just return true here (and false in the other two cases)?

}
std::unique_ptr<detail::CG> CommandGroup;
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
Expand Down Expand Up @@ -611,24 +628,63 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;

if (auto GraphImpl = getCommandGraph(); GraphImpl) {
return submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
detail::CGType::Kernel);
return {submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
detail::CGType::Kernel),
SchedulerBypass};
}

return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
*this, true);
return {detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
*this, true),
SchedulerBypass};
};

return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc);
return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc,
detail::CGType::Kernel,
/*CommandFuncContainsHostTask*/ false);
}

EventImplPtr queue_impl::submit_graph_direct_impl(
std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
ExecGraph,
bool CallerNeedsEvent, sycl::span<const event> DepEvents,
[[maybe_unused]] const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
bool EventNeeded = CallerNeedsEvent || ExecGraph->containsHostTask() ||
!supportsDiscardingPiEvents();
auto SubmitGraphFunc = [&](detail::CG::StorageInitHelper CGData)
-> std::pair<EventImplPtr, bool> {
if (auto ParentGraph = getCommandGraph(); ParentGraph) {
std::unique_ptr<detail::CG> CommandGroup;
{
ext::oneapi::experimental::detail::graph_impl::ReadLock ExecLock(
ExecGraph->MMutex);
CGData.MRequirements = ExecGraph->getRequirements();
}
// Here we are using the CommandGroup without passing a CommandBuffer to
// pass the exec_graph_impl and event dependencies. Since this subgraph
// CG will not be executed this is fine.
CommandGroup.reset(
new sycl::detail::CGExecCommandBuffer(nullptr, ExecGraph, CGData));
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
return {submit_command_to_graph(*ParentGraph, std::move(CommandGroup),
detail::CGType::ExecCommandBuffer),
/*SchedulerBypass*/ false};
} else {
return ExecGraph->enqueue(*this, CGData, EventNeeded);
}
};
return submit_direct(CallerNeedsEvent, DepEvents, SubmitGraphFunc,
detail::CGType::ExecCommandBuffer,
ExecGraph->containsHostTask());
}

template <typename SubmitCommandFuncType>
detail::EventImplPtr
queue_impl::submit_direct(bool CallerNeedsEvent,
sycl::span<const event> DepEvents,
SubmitCommandFuncType &SubmitCommandFunc) {
detail::EventImplPtr queue_impl::submit_direct(
bool CallerNeedsEvent, sycl::span<const event> DepEvents,
SubmitCommandFuncType &SubmitCommandFunc, detail::CGType Type,
bool CommandFuncContainsHostTask) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this argument could be called something like "InsertBarrierForCommandOrdering", which would make submit_direct more generic.

detail::CG::StorageInitHelper CGData;
std::unique_lock<std::mutex> Lock(MMutex);
const bool inOrder = isInOrder();

// Used by queue_empty() and getLastEvent()
MEmpty.store(false, std::memory_order_release);
Expand All @@ -639,29 +695,35 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
registerEventDependency</*LockQueue*/ false>(
getSyclObjImpl(*ExternalEvent), CGData.MEvents, this, getContextImpl(),
getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
detail::CGType::Kernel);
Type);
}

auto &Deps = hasCommandGraph() ? MExtGraphDeps : MDefaultGraphDeps;

// Sync with the last event for in order queue
EventImplPtr &LastEvent = Deps.LastEventPtr;
if (isInOrder() && LastEvent) {
if (inOrder && LastEvent) {
registerEventDependency</*LockQueue*/ false>(
LastEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
hasCommandGraph() ? getCommandGraph().get() : nullptr,
detail::CGType::Kernel);
hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
} else if (inOrder && MNoLastEventMode && CommandFuncContainsHostTask) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that checking MNoLastEventMode here works (as this is what the handler path does). One of my longer term goals for the handler-less path however, was to avoid using MNoLastEventMode synchronization flag. Instead of checking that flag, my plan was to rely on the LastEvent being set. If SchedulerBypass is true, then we just unset it, since the lower layers take care of the ordering. If it is false, then we have to set it, since the kernel submission to the scheduler requires ordering on the SYCL layer.
The MNoLastEventMode is still used in queue_impl::wait, which is generic, so this may be a subject to future changes. The current handler-less path however updates that flag only for compatibility with the handler path.
I think there are three cases here:

  1. The LastEvent is set, then we simply add the dependency
  2. The LastEvent is not set - we are in the No Last Event Mode - we have to insert the barrier
  3. The LastEvent is not set - nothing was submitted to this queue yet - then we might use the MEmpty flag? But probably we have to move the MEmpty.store() to after this check.

Please let me know if this thinking makes sense and if it would be possible to replace the use of the MNoLastEventMode flag here.

// If we have a host task in an in-order queue with no last event mode, then
// we must add a barrier to ensure ordering.
auto ResEvent = insertHelperBarrier();
registerEventDependency</*LockQueue*/ false>(
ResEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
}

for (event e : DepEvents) {
registerEventDependency</*LockQueue*/ false>(
getSyclObjImpl(e), CGData.MEvents, this, getContextImpl(),
getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
detail::CGType::Kernel);
Type);
}

// Barrier and un-enqueued commands synchronization for out or order queue
if (!isInOrder()) {
if (!inOrder) {
MMissedCleanupRequests.unset(
[&](MissedCleanupRequestsType &MissedCleanupRequests) {
for (auto &UpdatedGraph : MissedCleanupRequests)
Expand All @@ -674,31 +736,27 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
}
}

bool SchedulerBypass =
(CGData.MEvents.size() > 0
? detail::Scheduler::areEventsSafeForSchedulerBypass(
CGData.MEvents, getContextImpl())
: true) &&
!hasCommandGraph();
auto [EventImpl, SchedulerBypass] = SubmitCommandFunc(CGData);

// Synchronize with the "no last event mode", used by the handler-based
// kernel submit path
MNoLastEventMode.store(isInOrder() && SchedulerBypass,
std::memory_order_relaxed);

EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);
MNoLastEventMode.store(inOrder && SchedulerBypass, std::memory_order_relaxed);

// Sync with the last event for in order queue. For scheduler-bypass flow,
// the ordering is done at the layers below the SYCL runtime,
// but for the scheduler-based flow, it needs to be done here, as the
// scheduler handles host task submissions.
if (isInOrder()) {
if (inOrder) {
LastEvent = SchedulerBypass ? nullptr : EventImpl;
}

// Barrier and un-enqueued commands synchronization for out or order queue
if (!isInOrder() && !EventImpl->isEnqueued()) {
Deps.UnenqueuedCmdEvents.push_back(EventImpl);
// Barrier and un-enqueued commands synchronization for out or order queue.
// The event must also be stored for future wait calls.
if (!inOrder) {
if (!EventImpl->isEnqueued()) {
Deps.UnenqueuedCmdEvents.push_back(EventImpl);
}
addEventUnlocked(EventImpl);
}

return CallerNeedsEvent ? EventImpl : nullptr;
Expand Down Expand Up @@ -1149,6 +1207,15 @@ void queue_impl::verifyProps(const property_list &Props) const {
CheckPropertiesWithData);
}

EventImplPtr queue_impl::insertHelperBarrier() {
auto ResEvent = detail::event_impl::create_device_event(*this);
ur_event_handle_t UREvent = nullptr;
getAdapter().call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
getHandleRef(), 0, nullptr, &UREvent);
ResEvent->setHandle(UREvent);
return ResEvent;
}

} // namespace detail
} // namespace _V1
} // namespace sycl
Loading
Loading