131 changes: 116 additions & 15 deletions sycl/source/detail/queue_impl.cpp
@@ -420,6 +420,94 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
return EventImpl;
}

EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
bool EventNeeded, detail::kernel_impl *KernelImplPtr,
detail::kernel_bundle_impl *KernelBundleImpPtr,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
std::vector<ur_event_handle_t> RawEvents;

// TODO: checking the size of the events vector and avoiding the call
// would be more efficient at this point
if (DepEvents.size() > 0) {
RawEvents = detail::Command::getUrEvents(DepEvents, this, false);
}

bool DiscardEvent = !EventNeeded && supportsDiscardingPiEvents();
if (DiscardEvent) {
// The kernel only uses assert if it's a non-interop one
bool KernelUsesAssert =
!(KernelImplPtr && KernelImplPtr->isInterop()) && KData.usesAssert();
DiscardEvent = !KernelUsesAssert;
}

std::shared_ptr<detail::event_impl> ResultEvent =
DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this);

auto EnqueueKernel = [&]() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
xpti_td *CmdTraceEvent = nullptr;
uint64_t InstanceID = 0;
auto StreamID = detail::getActiveXPTIStreamID();
// Only enable instrumentation if there are subscribers to the SYCL
// stream
const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
if (xptiEnabled) {
std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
StreamID, KernelImplPtr, CodeLoc, IsTopCodeLoc,
*KData.getDeviceKernelInfoPtr(), this, KData.getNDRDesc(),
KernelBundleImpPtr, KData.getArgs());
detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
xpti::trace_task_begin, nullptr);
}
#endif
const detail::RTDeviceBinaryImage *BinImage = nullptr;
if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName());
assert(BinImage && "Failed to obtain a binary image.");
}
enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(),
KernelBundleImpPtr, KernelImplPtr,
*KData.getDeviceKernelInfoPtr(), RawEvents,
ResultEvent.get(), nullptr, KData.getKernelCacheConfig(),
KData.isCooperative(), KData.usesClusterLaunch(),
KData.getKernelWorkGroupMemorySize(), BinImage,
KData.getKernelFuncPtr());
#ifdef XPTI_ENABLE_INSTRUMENTATION
if (xptiEnabled) {
// Emit the signal only when an event is created
if (!DiscardEvent) {
detail::emitInstrumentationGeneral(
StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
static_cast<const void *>(ResultEvent->getHandle()));
}
detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
xpti::trace_task_end, nullptr);
}
#endif
};

if (DiscardEvent) {
EnqueueKernel();
} else {
ResultEvent->setWorkerQueue(weak_from_this());
ResultEvent->setStateIncomplete();
ResultEvent->setSubmissionTime();

EnqueueKernel();
ResultEvent->setEnqueued();
// Connect the returned event with the dependent events.
if (!isInOrder()) {
// DepEvents is not used anymore, so it can be moved.
ResultEvent->getPreparedDepsEvents() = std::move(DepEvents);
// ResultEvent is local to the current thread, so no lock is needed.
ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
}
}

return ResultEvent;
}

EventImplPtr queue_impl::submit_command_to_graph(
ext::oneapi::experimental::detail::graph_impl &GraphImpl,
std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
@@ -475,26 +563,31 @@ EventImplPtr queue_impl::submit_command_to_graph(
return EventImpl;
}

detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
EventImplPtr queue_impl::submit_kernel_direct_impl(
const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {

KernelData KData;

std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
HostKernel.takeOrCopyOwnership();

KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
KData.setKernelFunc(HostKernelPtr->getPtr());
KData.setKernelFunc(HostKernel.getPtr());
KData.setNDRDesc(NDRDesc);

auto SubmitKernelFunc =
[&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr {
auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
bool SchedulerBypass) -> EventImplPtr {
if (SchedulerBypass) {
return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
CallerNeedsEvent, nullptr, nullptr,
CodeLoc, IsTopCodeLoc);
}
std::unique_ptr<detail::CG> CommandGroup;
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
std::vector<std::shared_ptr<const void>> AuxiliaryResources;

std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
HostKernel.takeOrCopyOwnership();

KData.extractArgsAndReqsFromLambda();

CommandGroup.reset(new detail::CGExecKernel(
@@ -504,10 +597,8 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
std::move(CGData), std::move(KData).getArgs(),
*KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
std::move(AuxiliaryResources), detail::CGType::Kernel,
UR_KERNEL_CACHE_CONFIG_DEFAULT,
false, // KernelIsCooperative
false, // KernelUsesClusterLaunch
0, // KernelWorkGroupMemorySize
KData.getKernelCacheConfig(), KData.isCooperative(),
KData.usesClusterLaunch(), KData.getKernelWorkGroupMemorySize(),
CodeLoc));
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;

@@ -567,11 +658,21 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
}
}

EventImplPtr EventImpl = SubmitCommandFunc(CGData);
bool SchedulerBypass =
(CGData.MEvents.size() > 0
? detail::Scheduler::areEventsSafeForSchedulerBypass(
CGData.MEvents, getContextImpl())
: true) &&
!hasCommandGraph();

// Sync with the last event for in order queue
if (isInOrder() && !EventImpl->isDiscarded()) {
LastEvent = EventImpl;
EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);

// Sync with the last event for an in-order queue. For the scheduler-bypass
// flow, ordering is done at the layers below the SYCL runtime, but for the
// scheduler-based flow it needs to be done here, as the scheduler handles
// host task submissions.
if (isInOrder()) {
LastEvent = SchedulerBypass ? nullptr : EventImpl;
}

// Barrier and un-enqueued commands synchronization for an out-of-order queue
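For orientation, here is a condensed, hypothetical sketch of the decision flow that the reworked submit_direct() above implements. Queue, Event, and the helpers are simplified stand-ins for queue_impl, event_impl, and Scheduler::areEventsSafeForSchedulerBypass(); this is a reading aid under those assumptions, not the real runtime code.

#include <memory>
#include <vector>

struct Event {};
using EventPtr = std::shared_ptr<Event>;

struct Queue {
  std::vector<EventPtr> DepEvents; // CGData.MEvents in the real code
  EventPtr LastEvent;              // in-order queues track the last event
  bool InOrder = false;
  bool HasGraph = false; // true while recording to a command graph

  // Stand-ins for the scheduler query and the two submission paths.
  bool eventsSafeForBypass() const { return true; }
  EventPtr submitBypass() { return std::make_shared<Event>(); }
  EventPtr submitViaScheduler() { return std::make_shared<Event>(); }

  EventPtr submitDirect() {
    // Bypass only when every dependency can be handled below the SYCL
    // runtime and the queue is not recording to a command graph.
    bool SchedulerBypass =
        (DepEvents.empty() || eventsSafeForBypass()) && !HasGraph;

    EventPtr Ev = SchedulerBypass ? submitBypass() : submitViaScheduler();

    // On the bypass path the layers below already serialize in-order
    // submissions, so LastEvent is only kept for the scheduler path.
    if (InOrder)
      LastEvent = SchedulerBypass ? nullptr : Ev;
    return Ev;
  }
};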
24 changes: 21 additions & 3 deletions sycl/source/detail/queue_impl.hpp
@@ -386,6 +386,24 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
submit_impl(CGF, /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo);
}

/// Submits a kernel using the scheduler bypass fast path
///
/// \param KData is an object storing data related to the kernel.
/// \param DepEvents is a list of event dependencies.
/// \param EventNeeded should be true if the resulting event is needed.
/// \param KernelImplPtr is the kernel to be used, if the kernel is defined
/// as a kernel object.
/// \param KernelBundleImpPtr is the kernel bundle to be used, if a kernel
/// bundle is defined.
/// \param CodeLoc is the code location of the submit call.
/// \param IsTopCodeLoc is used to determine if the object is in a local
/// scope or in the top level scope.
///
/// \return a SYCL event representing the submitted command, or nullptr.
EventImplPtr submit_kernel_scheduler_bypass(
KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
bool EventNeeded, detail::kernel_impl *KernelImplPtr,
detail::kernel_bundle_impl *KernelBundleImpPtr,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

/// Performs a blocking wait for the completion of all enqueued tasks in the
/// queue.
///
@@ -908,14 +926,14 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
/// scope or in the top level scope.
///
/// \return a SYCL event representing the submitted command group, or nullptr.
detail::EventImplPtr submit_kernel_direct_impl(
EventImplPtr submit_kernel_direct_impl(
const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

template <typename SubmitCommandFuncType>
detail::EventImplPtr submit_direct(bool CallerNeedsEvent,
SubmitCommandFuncType &SubmitCommandFunc);
EventImplPtr submit_direct(bool CallerNeedsEvent,
SubmitCommandFuncType &SubmitCommandFunc);

/// Helper function for submitting a memory operation with a handler.
/// \param DepEvents is a vector of dependencies of the operation.
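As a companion to the declaration above, the following self-contained sketch restates the event-discarding rule from the queue_impl.cpp hunk: the result event is dropped only when the caller does not need it, the backend supports discarding, and the kernel does not use the assert path. All names here are illustrative stand-ins, not runtime types.

#include <cstdio>

struct KernelInfo {
  bool IsInterop = false; // interop kernels never use the SYCL assert path
  bool UsesAssert = false;
};

// Mirrors the DiscardEvent computation in submit_kernel_scheduler_bypass().
bool shouldDiscardEvent(bool EventNeeded, bool BackendCanDiscard,
                        const KernelInfo &K) {
  if (EventNeeded || !BackendCanDiscard)
    return false;
  // The kernel only uses assert if it's a non-interop one.
  bool KernelUsesAssert = !K.IsInterop && K.UsesAssert;
  return !KernelUsesAssert;
}

int main() {
  KernelInfo K{/*IsInterop=*/false, /*UsesAssert=*/true};
  std::printf("%d\n", shouldDiscardEvent(false, true, K)); // 0: assert needs an event
  K.UsesAssert = false;
  std::printf("%d\n", shouldDiscardEvent(false, true, K)); // 1: safe to discard
}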
94 changes: 4 additions & 90 deletions sycl/source/handler.cpp
@@ -638,97 +638,11 @@ event handler::finalize() {
// the graph is not changed, then this faster path is used to submit
// kernel bypassing scheduler and avoiding CommandGroup, Command objects
// creation.
std::vector<ur_event_handle_t> RawEvents;
// TODO checking the size of the events vector and avoiding the call is
// more efficient here at this point
if (impl->CGData.MEvents.size() > 0) {
RawEvents = detail::Command::getUrEvents(
impl->CGData.MEvents, impl->get_queue_or_null(), false);
}

bool DiscardEvent =
!impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents();
if (DiscardEvent) {
// Kernel only uses assert if it's non interop one
bool KernelUsesAssert = !(MKernel && MKernel->isInterop()) &&
impl->MKernelData.usesAssert();
DiscardEvent = !KernelUsesAssert;
}

std::shared_ptr<detail::event_impl> ResultEvent =
DiscardEvent
? nullptr
: detail::event_impl::create_device_event(impl->get_queue());

auto EnqueueKernel = [&]() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
xpti_td *CmdTraceEvent = nullptr;
uint64_t InstanceID = 0;
auto StreamID = detail::getActiveXPTIStreamID();
// Only enable instrumentation if there are subscribes to the SYCL
// stream
const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
if (xptiEnabled) {
std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
StreamID, MKernel.get(), MCodeLoc, impl->MIsTopCodeLoc,
*impl->MKernelData.getDeviceKernelInfoPtr(),
impl->get_queue_or_null(), impl->MKernelData.getNDRDesc(),
KernelBundleImpPtr, impl->MKernelData.getArgs());
detail::emitInstrumentationGeneral(StreamID, InstanceID,
CmdTraceEvent,
xpti::trace_task_begin, nullptr);
}
#endif
const detail::RTDeviceBinaryImage *BinImage = nullptr;
if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
BinImage = detail::retrieveKernelBinary(impl->get_queue(),
impl->getKernelName());
assert(BinImage && "Failed to obtain a binary image.");
}
enqueueImpKernel(impl->get_queue(), impl->MKernelData.getNDRDesc(),
impl->MKernelData.getArgs(), KernelBundleImpPtr,
MKernel.get(),
*impl->MKernelData.getDeviceKernelInfoPtr(), RawEvents,
ResultEvent.get(), nullptr,
impl->MKernelData.getKernelCacheConfig(),
impl->MKernelData.isCooperative(),
impl->MKernelData.usesClusterLaunch(),
impl->MKernelData.getKernelWorkGroupMemorySize(),
BinImage, impl->MKernelData.getKernelFuncPtr());
#ifdef XPTI_ENABLE_INSTRUMENTATION
if (xptiEnabled) {
// Emit signal only when event is created
if (!DiscardEvent) {
detail::emitInstrumentationGeneral(
StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
static_cast<const void *>(ResultEvent->getHandle()));
}
detail::emitInstrumentationGeneral(StreamID, InstanceID,
CmdTraceEvent,
xpti::trace_task_end, nullptr);
}
#endif
};

if (DiscardEvent) {
EnqueueKernel();
} else {
detail::queue_impl &Queue = impl->get_queue();
ResultEvent->setWorkerQueue(Queue.weak_from_this());
ResultEvent->setStateIncomplete();
ResultEvent->setSubmissionTime();

EnqueueKernel();
ResultEvent->setEnqueued();
// connect returned event with dependent events
if (!Queue.isInOrder()) {
// MEvents is not used anymore, so can move.
ResultEvent->getPreparedDepsEvents() =
std::move(impl->CGData.MEvents);
// ResultEvent is local for current thread, no need to lock.
ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
}
}
detail::EventImplPtr ResultEvent =
impl->get_queue().submit_kernel_scheduler_bypass(
impl->MKernelData, impl->CGData.MEvents, impl->MEventNeeded,
MKernel.get(), KernelBundleImpPtr, MCodeLoc, impl->MIsTopCodeLoc);
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
return ResultEvent;
#else
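The block removed above now lives in queue_impl.cpp; its non-discard branch hands the dependency list over to the freshly created result event so one level of completed dependencies can be pruned without locking. A minimal stand-alone sketch of that hand-off, with illustrative types only (C++20 for std::erase_if):

#include <memory>
#include <utility>
#include <vector>

struct EventNode {
  std::vector<std::shared_ptr<EventNode>> PreparedDeps;
  bool Complete = false;

  // Drop references to dependencies that already completed. Unlocked is
  // fine here: the event is not visible to any other thread yet.
  void cleanDepsOneLevelUnlocked() {
    std::erase_if(PreparedDeps,
                  [](const auto &Dep) { return Dep->Complete; });
  }
};

std::shared_ptr<EventNode>
finishSubmission(std::vector<std::shared_ptr<EventNode>> &&DepEvents,
                 bool InOrder) {
  auto Result = std::make_shared<EventNode>();
  if (!InOrder) {
    // The caller no longer uses DepEvents, so it can be moved.
    Result->PreparedDeps = std::move(DepEvents);
    Result->cleanDepsOneLevelUnlocked();
  }
  return Result;
}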
@@ -235,14 +235,46 @@ TEST_F(FreeFunctionCommandsEventsTests,

TestMoveFunctor::MoveCtorCalls = 0;
TestMoveFunctor MoveOnly;
std::mutex CvMutex;
std::condition_variable Cv;
bool ready = false;

// This kernel submission uses the scheduler-bypass path, so the HostKernel
// shouldn't be constructed.

sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32},
std::move(MoveOnly));

ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 0);
ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});

// Another kernel submission is queued behind a host task to force the
// scheduler-based submission. In this case, the HostKernel should be
// constructed.

Queue.submit([&](sycl::handler &CGH) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
});
});

sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32},
std::move(MoveOnly));

{
std::unique_lock<std::mutex> lk(CvMutex);
ready = true;
}
Cv.notify_one();

Queue.wait();

// The move ctor for TestMoveFunctor is called during the move construction
// of HostKernel. The copy ctor is called by InstantiateKernelOnHost and
// can't be deleted.
ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 1);

ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{2});
}
#endif
