171 changes: 142 additions & 29 deletions sycl/source/detail/queue_impl.cpp
@@ -420,42 +420,146 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
return EventImpl;
}

detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
bool EventNeeded, std::shared_ptr<detail::kernel_impl> &Kernel,
detail::kernel_bundle_impl *KernelBundleImpPtr,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
std::vector<ur_event_handle_t> RawEvents;

// TODO: Checking the size of the events vector and avoiding the call is
// more efficient at this point.
if (DepEvents.size() > 0) {
RawEvents = detail::Command::getUrEvents(DepEvents, this, false);
}

bool DiscardEvent = !EventNeeded && supportsDiscardingPiEvents();
if (DiscardEvent) {
// The kernel only uses assert if it is not an interop kernel.
bool KernelUsesAssert =
!(Kernel && Kernel->isInterop()) && KData.usesAssert();
DiscardEvent = !KernelUsesAssert;
}

std::shared_ptr<detail::event_impl> ResultEvent =
DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this);

auto EnqueueKernel = [&]() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
xpti_td *CmdTraceEvent = nullptr;
uint64_t InstanceID = 0;
auto StreamID = detail::getActiveXPTIStreamID();
// Only enable instrumentation if there are subscribers to the SYCL
// stream.
const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
if (xptiEnabled) {
std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
StreamID, Kernel, CodeLoc, IsTopCodeLoc,
*KData.getDeviceKernelInfoPtr(), this, KData.getNDRDesc(),
KernelBundleImpPtr, KData.getArgs());
detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
xpti::trace_task_begin, nullptr);
}
#endif
const detail::RTDeviceBinaryImage *BinImage = nullptr;
if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName());
assert(BinImage && "Failed to obtain a binary image.");
}
enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(),
KernelBundleImpPtr, Kernel.get(),
*KData.getDeviceKernelInfoPtr(), RawEvents,
ResultEvent.get(), nullptr, KData.getKernelCacheConfig(),
KData.isCooperative(), KData.usesClusterLaunch(),
KData.getKernelWorkGroupMemorySize(), BinImage,
KData.getKernelFuncPtr());
#ifdef XPTI_ENABLE_INSTRUMENTATION
if (xptiEnabled) {
// Emit the signal only when an event is created.
if (!DiscardEvent) {
detail::emitInstrumentationGeneral(
StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
static_cast<const void *>(ResultEvent->getHandle()));
}
detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
xpti::trace_task_end, nullptr);
}
#endif
};

if (DiscardEvent) {
EnqueueKernel();
} else {
ResultEvent->setWorkerQueue(weak_from_this());
ResultEvent->setStateIncomplete();
ResultEvent->setSubmissionTime();

EnqueueKernel();
ResultEvent->setEnqueued();
// Connect the returned event with its dependent events.
if (!isInOrder()) {
// MEvents is not used anymore, so it can be moved.
ResultEvent->getPreparedDepsEvents() = std::move(DepEvents);
// ResultEvent is local to the current thread, so no lock is needed.
ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
}
}

return ResultEvent;
}
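
// A minimal sketch of the event-discard decision above, with hypothetical
// helper names (not part of this patch): the fast path skips event creation
// only when the caller does not need an event, the backend supports
// discarding events, and the kernel does not rely on the assert machinery.
//
//   bool canDiscardEvent(bool EventNeeded, bool SupportsDiscard,
//                        bool IsInterop, bool UsesAssert) {
//     if (EventNeeded || !SupportsDiscard)
//       return false;
//     // Interop kernels never use the SYCL assert machinery.
//     return IsInterop || !UsesAssert;
//   }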

EventImplPtr queue_impl::submit_kernel_direct_impl(
const NDRDescT &NDRDesc,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {

KernelData KData;
detail::code_location CLoc;
bool IsTopCLoc = true;

#ifdef XPTI_ENABLE_INSTRUMENTATION
if (xptiTraceEnabled()) {
CLoc = CodeLoc;
IsTopCLoc = IsTopCodeLoc;
}
#endif

KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
KData.setKernelFunc(HostKernel->getPtr());
KData.setNDRDesc(NDRDesc);

auto SubmitKernelFunc =
[&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr {
std::unique_ptr<detail::CG> CommandGroup;
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
std::vector<std::shared_ptr<const void>> AuxiliaryResources;

KData.extractArgsAndReqsFromLambda();

CommandGroup.reset(new detail::CGExecKernel(
KData.getNDRDesc(), HostKernel,
nullptr, // Kernel
nullptr, // KernelBundle
std::move(CGData), std::move(KData).getArgs(),
*KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
std::move(AuxiliaryResources), detail::CGType::Kernel,
UR_KERNEL_CACHE_CONFIG_DEFAULT,
false, // KernelIsCooperative
false, // KernelUsesClusterLaunch
0, // KernelWorkGroupMemorySize
CodeLoc));
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;

return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
*this, true);
auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
bool SchedulerBypass) -> EventImplPtr {
if (SchedulerBypass) {
std::shared_ptr<detail::kernel_impl> Kernel;
return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
CallerNeedsEvent, Kernel, nullptr,
CLoc, IsTopCLoc);
} else {
std::unique_ptr<detail::CG> CommandGroup;
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
std::vector<std::shared_ptr<const void>> AuxiliaryResources;

KData.extractArgsAndReqsFromLambda();

CommandGroup.reset(new detail::CGExecKernel(
KData.getNDRDesc(), HostKernel,
nullptr, // Kernel
nullptr, // KernelBundle
std::move(CGData), std::move(KData).getArgs(),
*KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
std::move(AuxiliaryResources), detail::CGType::Kernel,
UR_KERNEL_CACHE_CONFIG_DEFAULT,
false, // KernelIsCooperative
false, // KernelUsesClusterLaunch
0, // KernelWorkGroupMemorySize
CLoc));
CommandGroup->MIsTopCodeLoc = IsTopCLoc;

return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
*this, true);
}
};

return submit_direct(CallerNeedsEvent, SubmitKernelFunc);
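
// An illustrative user-level trigger for this path (an assumption, not part
// of this patch): a plain lambda kernel submitted to a queue reaches
// submit_kernel_direct_impl and, when its dependencies are safe, takes the
// SchedulerBypass branch above.
//
//   sycl::queue Q;
//   Q.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> I) { /* ... */ });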
@@ -505,11 +609,20 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
}
}

EventImplPtr EventImpl = SubmitCommandFunc(CGData);
bool SchedulerBypass =
CGData.MEvents.size() > 0
? detail::Scheduler::areEventsSafeForSchedulerBypass(CGData.MEvents,
getContextImpl())
: true;

// Sync with the last event for in order queue
if (isInOrder() && !EventImpl->isDiscarded()) {
LastEvent = EventImpl;
EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);

// Sync with the last event for an in-order queue. For the scheduler-bypass
// flow, ordering is handled by the layers below the SYCL runtime, but for
// the scheduler-based flow it must be done here, since the scheduler also
// handles host task submissions.
if (isInOrder()) {
LastEvent = SchedulerBypass ? nullptr : EventImpl;
}
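
// An equivalent, slightly more compact formulation of the bypass decision
// above (a sketch, not part of this patch):
//
//   bool SchedulerBypass =
//       CGData.MEvents.empty() ||
//       detail::Scheduler::areEventsSafeForSchedulerBypass(CGData.MEvents,
//                                                          getContextImpl());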

// Barrier and un-enqueued commands synchronization for an out-of-order queue
24 changes: 21 additions & 3 deletions sycl/source/detail/queue_impl.hpp
@@ -388,6 +388,24 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
submit_impl(CGF, /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo);
}

/// Submits a kernel using the scheduler-bypass fast path.
///
/// \param KData an object storing data related to the kernel.
/// \param DepEvents list of event dependencies.
/// \param EventNeeded true if the resulting event is needed.
/// \param Kernel the kernel object; used if the kernel is defined as a
/// kernel object.
/// \param KernelBundleImpPtr the kernel bundle; used if a kernel bundle is
/// defined.
/// \param CodeLoc is the code location of the submit call.
/// \param IsTopCodeLoc used to determine if the object is in a local
/// scope or in the top-level scope.
///
/// \return a SYCL event representing the submitted command, or nullptr.
EventImplPtr submit_kernel_scheduler_bypass(
KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
bool EventNeeded, std::shared_ptr<detail::kernel_impl> &Kernel,
detail::kernel_bundle_impl *KernelBundleImpPtr,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);
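
// A hypothetical call-site sketch (names assumed, not part of this patch):
// a caller that has already populated KernelData and gathered dependency
// events can invoke the fast path directly. Kernel may be empty and
// KernelBundleImpPtr null when the kernel comes from a lambda.
//
//   std::shared_ptr<detail::kernel_impl> Kernel; // no kernel object
//   EventImplPtr Ev = Queue.submit_kernel_scheduler_bypass(
//       KData, DepEvents, /*EventNeeded=*/true, Kernel,
//       /*KernelBundleImpPtr=*/nullptr, CodeLoc, /*IsTopCodeLoc=*/true);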

/// Performs a blocking wait for the completion of all enqueued tasks in the
/// queue.
///
@@ -904,15 +922,15 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
/// scope or in the top-level scope.
///
/// \return a SYCL event representing the submitted command group or nullptr.
detail::EventImplPtr submit_kernel_direct_impl(
EventImplPtr submit_kernel_direct_impl(
const NDRDescT &NDRDesc,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

template <typename SubmitCommandFuncType>
detail::EventImplPtr submit_direct(bool CallerNeedsEvent,
SubmitCommandFuncType &SubmitCommandFunc);
EventImplPtr submit_direct(bool CallerNeedsEvent,
SubmitCommandFuncType &SubmitCommandFunc);

/// Helper function for submitting a memory operation with a handler.
/// \param DepEvents is a vector of dependencies of the operation.
94 changes: 4 additions & 90 deletions sycl/source/handler.cpp
@@ -638,97 +638,11 @@ event handler::finalize() {
// the graph is not changed, then this faster path is used to submit the
// kernel, bypassing the scheduler and avoiding the creation of CommandGroup
// and Command objects.
std::vector<ur_event_handle_t> RawEvents;
// TODO: Checking the size of the events vector and avoiding the call is
// more efficient at this point.
if (impl->CGData.MEvents.size() > 0) {
RawEvents = detail::Command::getUrEvents(
impl->CGData.MEvents, impl->get_queue_or_null(), false);
}

bool DiscardEvent =
!impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents();
if (DiscardEvent) {
// The kernel only uses assert if it is not an interop kernel.
bool KernelUsesAssert = !(MKernel && MKernel->isInterop()) &&
impl->MKernelData.usesAssert();
DiscardEvent = !KernelUsesAssert;
}

std::shared_ptr<detail::event_impl> ResultEvent =
DiscardEvent
? nullptr
: detail::event_impl::create_device_event(impl->get_queue());

auto EnqueueKernel = [&]() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
xpti_td *CmdTraceEvent = nullptr;
uint64_t InstanceID = 0;
auto StreamID = detail::getActiveXPTIStreamID();
// Only enable instrumentation if there are subscribers to the SYCL
// stream.
const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
if (xptiEnabled) {
std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
StreamID, MKernel, MCodeLoc, impl->MIsTopCodeLoc,
*impl->MKernelData.getDeviceKernelInfoPtr(),
impl->get_queue_or_null(), impl->MKernelData.getNDRDesc(),
KernelBundleImpPtr, impl->MKernelData.getArgs());
detail::emitInstrumentationGeneral(StreamID, InstanceID,
CmdTraceEvent,
xpti::trace_task_begin, nullptr);
}
#endif
const detail::RTDeviceBinaryImage *BinImage = nullptr;
if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
BinImage = detail::retrieveKernelBinary(impl->get_queue(),
impl->getKernelName());
assert(BinImage && "Failed to obtain a binary image.");
}
enqueueImpKernel(impl->get_queue(), impl->MKernelData.getNDRDesc(),
impl->MKernelData.getArgs(), KernelBundleImpPtr,
MKernel.get(),
*impl->MKernelData.getDeviceKernelInfoPtr(), RawEvents,
ResultEvent.get(), nullptr,
impl->MKernelData.getKernelCacheConfig(),
impl->MKernelData.isCooperative(),
impl->MKernelData.usesClusterLaunch(),
impl->MKernelData.getKernelWorkGroupMemorySize(),
BinImage, impl->MKernelData.getKernelFuncPtr());
#ifdef XPTI_ENABLE_INSTRUMENTATION
if (xptiEnabled) {
// Emit the signal only when an event is created.
if (!DiscardEvent) {
detail::emitInstrumentationGeneral(
StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
static_cast<const void *>(ResultEvent->getHandle()));
}
detail::emitInstrumentationGeneral(StreamID, InstanceID,
CmdTraceEvent,
xpti::trace_task_end, nullptr);
}
#endif
};

if (DiscardEvent) {
EnqueueKernel();
} else {
detail::queue_impl &Queue = impl->get_queue();
ResultEvent->setWorkerQueue(Queue.weak_from_this());
ResultEvent->setStateIncomplete();
ResultEvent->setSubmissionTime();

EnqueueKernel();
ResultEvent->setEnqueued();
// Connect the returned event with its dependent events.
if (!Queue.isInOrder()) {
// MEvents is not used anymore, so it can be moved.
ResultEvent->getPreparedDepsEvents() =
std::move(impl->CGData.MEvents);
// ResultEvent is local to the current thread, so no lock is needed.
ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
}
}
detail::EventImplPtr ResultEvent =
impl->get_queue().submit_kernel_scheduler_bypass(
impl->MKernelData, impl->CGData.MEvents, impl->MEventNeeded,
MKernel, KernelBundleImpPtr, MCodeLoc, impl->MIsTopCodeLoc);
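
// Note (follows from the DiscardEvent logic in the bypass path): when
// impl->MEventNeeded is false, the call above may return nullptr, so the
// code below must tolerate a discarded event.
//
//   if (!ResultEvent) { /* event was discarded; nothing to wait on */ }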
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
return ResultEvent;
#else