Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
1d1bf16
[SYCL] Add scheduler-bypass for handler-less kernel submission path
slawekptak Sep 29, 2025
91ad6dd
Remove unnecessary EventImpl check
slawekptak Sep 29, 2025
4d06579
Extract the scheduler bypass logic into a separate function
slawekptak Sep 30, 2025
03deefe
Change the EventImpl var name back to original
slawekptak Sep 30, 2025
956d27a
Address review comments
slawekptak Oct 1, 2025
f0c9da5
Merge branch 'sycl' into no_handler_scheduler_bypass
slawekptak Oct 1, 2025
795375a
Merge branch 'sycl' into no_handler_scheduler_bypass
slawekptak Oct 3, 2025
f15f842
Allocate HostKernel on the scheduler path only
slawekptak Oct 3, 2025
e8dc229
Fix formatting
slawekptak Oct 3, 2025
bcf270f
Address review comments
slawekptak Oct 3, 2025
43b4b3a
Change the LaunchGroupedShortcutMoveKernelNoEvent unit test,
slawekptak Oct 3, 2025
2f0280d
[SYCL] Fallback path for handler-less kernel properties
slawekptak Oct 6, 2025
4832419
Add properties check to free function extension
slawekptak Oct 6, 2025
798c1ca
Merge branch 'sycl' into no_handler_scheduler_bypass
slawekptak Oct 6, 2025
2bd29d0
[SYCL] Remove assertion for graph support for handler-less kernel submit
slawekptak Oct 6, 2025
36fa311
Use scheduler bypass path only if no graph associated with the queue
slawekptak Oct 6, 2025
6500bf0
Merge branch 'no_handler_properties_fallback' into temp_no_handler_in…
slawekptak Oct 7, 2025
2ed4cb7
Merge branch 'no_handler_graph_record_followup' into temp_no_handler_…
slawekptak Oct 7, 2025
3b867c2
Temp - Remove the no-handler macro
slawekptak Oct 7, 2025
fa18fcd
Comment out the event check
slawekptak Oct 7, 2025
f464e17
Change the expected copy_count in test_num_kernel_copies
slawekptak Oct 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 11 additions & 23 deletions sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,14 +259,15 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
typename KernelType, typename... ReductionsT>
void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
ReductionsT &&...Reductions) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
// TODO The handler-less path does not support reductions yet.
if constexpr (sizeof...(ReductionsT) == 0) {
// TODO The handler-less path does not support reductions and kernel function
// properties yet.
if constexpr (sizeof...(ReductionsT) == 0 &&
!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value)) {
detail::submit_kernel_direct<KernelName>(std::move(Q), empty_properties_t{},
Range, KernelObj);
} else
#endif
{
} else {
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Range, KernelObj,
std::forward<ReductionsT>(Reductions)...);
Expand All @@ -292,23 +293,10 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
typename Properties, typename KernelType, typename... ReductionsT>
void nd_launch(queue Q, launch_config<nd_range<Dimensions>, Properties> Config,
const KernelType &KernelObj, ReductionsT &&...Reductions) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
// TODO The handler-less path does not support reductions yet.
if constexpr (sizeof...(ReductionsT) == 0) {
ext::oneapi::experimental::detail::LaunchConfigAccess<nd_range<Dimensions>,
Properties>
ConfigAccess(Config);
detail::submit_kernel_direct<KernelName>(
std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(),
KernelObj);
} else
#endif
{
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Config, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
}
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Config, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
}

template <int Dimensions, typename... ArgsT>
Expand Down
63 changes: 36 additions & 27 deletions sycl/include/sycl/khr/free_function_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,45 +157,54 @@ template <typename KernelType, typename = typename std::enable_if_t<
void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
detail::submit_kernel_direct(
q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size),
std::forward<KernelType>(k));
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
// TODO The handler-less path does not support kernel function properties yet.
if constexpr (!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value)) {
detail::submit_kernel_direct(
q, ext::oneapi::experimental::empty_properties_t{},
nd_range<1>(r, size), std::forward<KernelType>(k));
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
}
}
template <typename KernelType, typename = typename std::enable_if_t<
enable_kernel_function_overload<KernelType>>>
void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
detail::submit_kernel_direct(
q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size),
std::forward<KernelType>(k));
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
// TODO The handler-less path does not support kernel function properties yet.
if constexpr (!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value)) {
detail::submit_kernel_direct(
q, ext::oneapi::experimental::empty_properties_t{},
nd_range<2>(r, size), std::forward<KernelType>(k));
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
}
}
template <typename KernelType, typename = typename std::enable_if_t<
enable_kernel_function_overload<KernelType>>>
void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
detail::submit_kernel_direct(
q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size),
std::forward<KernelType>(k));
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
// TODO The handler-less path does not support kernel function properties yet.
if constexpr (!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value)) {
detail::submit_kernel_direct(
q, ext::oneapi::experimental::empty_properties_t{},
nd_range<3>(r, size), std::forward<KernelType>(k));
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
}
}

template <typename... Args>
Expand Down
16 changes: 10 additions & 6 deletions sycl/include/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3275,15 +3275,19 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
// TODO The handler-less path does not support reductions yet.
if constexpr (sizeof...(RestT) == 1) {

using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;

// TODO The handler-less path does not support reductions and kernel
// function properties yet.
if constexpr (sizeof...(RestT) == 1 &&
!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value)) {
return detail::submit_kernel_direct<KernelName, true>(
*this, ext::oneapi::experimental::empty_properties_t{}, Range,
Rest...);
} else
#endif
{
} else {
return submit(
[&](handler &CGH) {
CGH.template parallel_for<KernelName>(Range, Rest...);
Expand Down
134 changes: 116 additions & 18 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,94 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
return EventImpl;
}

EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
bool EventNeeded, detail::kernel_impl *KernelImplPtr,
detail::kernel_bundle_impl *KernelBundleImpPtr,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
std::vector<ur_event_handle_t> RawEvents;

// TODO checking the size of the events vector and avoiding the call is
// more efficient here at this point
if (DepEvents.size() > 0) {
RawEvents = detail::Command::getUrEvents(DepEvents, this, false);
}

bool DiscardEvent = !EventNeeded && supportsDiscardingPiEvents();
if (DiscardEvent) {
// Kernel only uses assert if it's non interop one
bool KernelUsesAssert =
!(KernelImplPtr && KernelImplPtr->isInterop()) && KData.usesAssert();
DiscardEvent = !KernelUsesAssert;
}

std::shared_ptr<detail::event_impl> ResultEvent =
DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this);

auto EnqueueKernel = [&]() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
xpti_td *CmdTraceEvent = nullptr;
uint64_t InstanceID = 0;
auto StreamID = detail::getActiveXPTIStreamID();
// Only enable instrumentation if there are subscribes to the SYCL
// stream
const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
if (xptiEnabled) {
std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
StreamID, KernelImplPtr, CodeLoc, IsTopCodeLoc,
*KData.getDeviceKernelInfoPtr(), this, KData.getNDRDesc(),
KernelBundleImpPtr, KData.getArgs());
detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
xpti::trace_task_begin, nullptr);
}
#endif
const detail::RTDeviceBinaryImage *BinImage = nullptr;
if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName());
assert(BinImage && "Failed to obtain a binary image.");
}
enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(),
KernelBundleImpPtr, KernelImplPtr,
*KData.getDeviceKernelInfoPtr(), RawEvents,
ResultEvent.get(), nullptr, KData.getKernelCacheConfig(),
KData.isCooperative(), KData.usesClusterLaunch(),
KData.getKernelWorkGroupMemorySize(), BinImage,
KData.getKernelFuncPtr());
#ifdef XPTI_ENABLE_INSTRUMENTATION
if (xptiEnabled) {
// Emit signal only when event is created
if (!DiscardEvent) {
detail::emitInstrumentationGeneral(
StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
static_cast<const void *>(ResultEvent->getHandle()));
}
detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
xpti::trace_task_end, nullptr);
}
#endif
};

if (DiscardEvent) {
EnqueueKernel();
} else {
ResultEvent->setWorkerQueue(weak_from_this());
ResultEvent->setStateIncomplete();
ResultEvent->setSubmissionTime();

EnqueueKernel();
ResultEvent->setEnqueued();
// connect returned event with dependent events
if (!isInOrder()) {
// DepEvents is not used anymore, so can move.
ResultEvent->getPreparedDepsEvents() = std::move(DepEvents);
// ResultEvent is local for current thread, no need to lock.
ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
}
}

return ResultEvent;
}

EventImplPtr queue_impl::submit_command_to_graph(
ext::oneapi::experimental::detail::graph_impl &GraphImpl,
std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
Expand Down Expand Up @@ -475,26 +563,31 @@ EventImplPtr queue_impl::submit_command_to_graph(
return EventImpl;
}

detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
EventImplPtr queue_impl::submit_kernel_direct_impl(
const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {

KernelData KData;

std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
HostKernel.takeOrCopyOwnership();

KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
KData.setKernelFunc(HostKernelPtr->getPtr());
KData.setKernelFunc(HostKernel.getPtr());
KData.setNDRDesc(NDRDesc);

auto SubmitKernelFunc =
[&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr {
auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
bool SchedulerBypass) -> EventImplPtr {
if (SchedulerBypass) {
return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
CallerNeedsEvent, nullptr, nullptr,
CodeLoc, IsTopCodeLoc);
}
std::unique_ptr<detail::CG> CommandGroup;
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
std::vector<std::shared_ptr<const void>> AuxiliaryResources;

std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
HostKernel.takeOrCopyOwnership();

KData.extractArgsAndReqsFromLambda();

CommandGroup.reset(new detail::CGExecKernel(
Expand All @@ -504,10 +597,8 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
std::move(CGData), std::move(KData).getArgs(),
*KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
std::move(AuxiliaryResources), detail::CGType::Kernel,
UR_KERNEL_CACHE_CONFIG_DEFAULT,
false, // KernelIsCooperative
false, // KernelUsesClusterLaunch
0, // KernelWorkGroupMemorySize
KData.getKernelCacheConfig(), KData.isCooperative(),
KData.usesClusterLaunch(), KData.getKernelWorkGroupMemorySize(),
CodeLoc));
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;

Expand All @@ -530,9 +621,6 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
detail::CG::StorageInitHelper CGData;
std::unique_lock<std::mutex> Lock(MMutex);

// Graphs are not supported yet for the no-handler path
assert(!hasCommandGraph());

// Set the No Last Event Mode to false, since the no-handler path
// does not support it yet.
MNoLastEventMode.store(false, std::memory_order_relaxed);
Expand Down Expand Up @@ -567,11 +655,21 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
}
}

EventImplPtr EventImpl = SubmitCommandFunc(CGData);
bool SchedulerBypass =
(CGData.MEvents.size() > 0
? detail::Scheduler::areEventsSafeForSchedulerBypass(
CGData.MEvents, getContextImpl())
: true) &&
!hasCommandGraph();

// Sync with the last event for in order queue
if (isInOrder() && !EventImpl->isDiscarded()) {
LastEvent = EventImpl;
EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);

// Sync with the last event for in order queue. For scheduler-bypass flow,
// the ordering is done at the layers below the SYCL runtime,
// but for the scheduler-based flow, it needs to be done here, as the
// scheduler handles host task submissions.
if (isInOrder()) {
LastEvent = SchedulerBypass ? nullptr : EventImpl;
}

// Barrier and un-enqueued commands synchronization for out or order queue
Expand Down
24 changes: 21 additions & 3 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,24 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
submit_impl(CGF, /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo);
}

/// Submits a kernel using the scheduler bypass fast path
///
/// \param KData is an object storing data related to the kernel.
/// \param DepEvents is a list of event dependencies.
/// \param EventNeeded should be true, if the resulting event is needed.
/// \param Kernel to be used, if kernel defined as a kernel object.
/// \param KernelBundleImpPtr to be used, if kernel bundle defined.
/// \param CodeLoc is the code location of the submit call.
/// \param IsTopCodeLoc is used to determine if the object is in a local
/// scope or in the top level scope.
///
/// \return a SYCL event representing submitted command or nullptr.
EventImplPtr submit_kernel_scheduler_bypass(
KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
bool EventNeeded, detail::kernel_impl *KernelImplPtr,
detail::kernel_bundle_impl *KernelBundleImpPtr,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

/// Performs a blocking wait for the completion of all enqueued tasks in the
/// queue.
///
Expand Down Expand Up @@ -908,14 +926,14 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
/// scope or in the top level scope.
///
/// \return a SYCL event representing submitted command group or nullptr.
detail::EventImplPtr submit_kernel_direct_impl(
EventImplPtr submit_kernel_direct_impl(
const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

template <typename SubmitCommandFuncType>
detail::EventImplPtr submit_direct(bool CallerNeedsEvent,
SubmitCommandFuncType &SubmitCommandFunc);
EventImplPtr submit_direct(bool CallerNeedsEvent,
SubmitCommandFuncType &SubmitCommandFunc);

/// Helper function for submitting a memory operation with a handler.
/// \param DepEvents is a vector of dependencies of the operation.
Expand Down
Loading
Loading