Commit 9895530

[SYCL] Add scheduler-bypass for handler-less kernel submission path (#20234)
The handler-less kernel submission path has been extended to support fast, scheduler-bypass submission.
1 parent: 25d9de0
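
For orientation, here is a minimal user-level sketch of the handler-less (free-function) kernel submission this change targets. It is a hedged example, assuming the khr free-function commands API exercised by the unit test below; the kernel body and launch sizes are placeholders.

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;

  // Handler-less submission: no sycl::handler and no explicit command group.
  // With this change, such a launch can take the scheduler-bypass fast path
  // when all of its dependencies are safe to bypass and the queue is not
  // recording into a command graph.
  sycl::khr::launch_grouped(Q, sycl::range<1>{64}, sycl::range<1>{32},
                            [](sycl::nd_item<1>) { /* kernel body */ });

  Q.wait();
  return 0;
}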

File tree: 4 files changed (+175, -110 lines)

sycl/source/detail/queue_impl.cpp

Lines changed: 116 additions & 15 deletions
@@ -420,6 +420,94 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
   return EventImpl;
 }
 
+EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
+    KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
+    bool EventNeeded, detail::kernel_impl *KernelImplPtr,
+    detail::kernel_bundle_impl *KernelBundleImpPtr,
+    const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
+  std::vector<ur_event_handle_t> RawEvents;
+
+  // TODO checking the size of the events vector and avoiding the call is
+  // more efficient here at this point
+  if (DepEvents.size() > 0) {
+    RawEvents = detail::Command::getUrEvents(DepEvents, this, false);
+  }
+
+  bool DiscardEvent = !EventNeeded && supportsDiscardingPiEvents();
+  if (DiscardEvent) {
+    // Kernel only uses assert if it's non interop one
+    bool KernelUsesAssert =
+        !(KernelImplPtr && KernelImplPtr->isInterop()) && KData.usesAssert();
+    DiscardEvent = !KernelUsesAssert;
+  }
+
+  std::shared_ptr<detail::event_impl> ResultEvent =
+      DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this);
+
+  auto EnqueueKernel = [&]() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    xpti_td *CmdTraceEvent = nullptr;
+    uint64_t InstanceID = 0;
+    auto StreamID = detail::getActiveXPTIStreamID();
+    // Only enable instrumentation if there are subscribes to the SYCL
+    // stream
+    const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
+    if (xptiEnabled) {
+      std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
+          StreamID, KernelImplPtr, CodeLoc, IsTopCodeLoc,
+          *KData.getDeviceKernelInfoPtr(), this, KData.getNDRDesc(),
+          KernelBundleImpPtr, KData.getArgs());
+      detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
+                                         xpti::trace_task_begin, nullptr);
+    }
+#endif
+    const detail::RTDeviceBinaryImage *BinImage = nullptr;
+    if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
+      BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName());
+      assert(BinImage && "Failed to obtain a binary image.");
+    }
+    enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(),
+                     KernelBundleImpPtr, KernelImplPtr,
+                     *KData.getDeviceKernelInfoPtr(), RawEvents,
+                     ResultEvent.get(), nullptr, KData.getKernelCacheConfig(),
+                     KData.isCooperative(), KData.usesClusterLaunch(),
+                     KData.getKernelWorkGroupMemorySize(), BinImage,
+                     KData.getKernelFuncPtr());
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    if (xptiEnabled) {
+      // Emit signal only when event is created
+      if (!DiscardEvent) {
+        detail::emitInstrumentationGeneral(
+            StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
+            static_cast<const void *>(ResultEvent->getHandle()));
+      }
+      detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
+                                         xpti::trace_task_end, nullptr);
+    }
+#endif
+  };
+
+  if (DiscardEvent) {
+    EnqueueKernel();
+  } else {
+    ResultEvent->setWorkerQueue(weak_from_this());
+    ResultEvent->setStateIncomplete();
+    ResultEvent->setSubmissionTime();
+
+    EnqueueKernel();
+    ResultEvent->setEnqueued();
+    // connect returned event with dependent events
+    if (!isInOrder()) {
+      // DepEvents is not used anymore, so can move.
+      ResultEvent->getPreparedDepsEvents() = std::move(DepEvents);
+      // ResultEvent is local for current thread, no need to lock.
+      ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
+    }
+  }
+
+  return ResultEvent;
+}
+
 EventImplPtr queue_impl::submit_command_to_graph(
     ext::oneapi::experimental::detail::graph_impl &GraphImpl,
     std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
@@ -475,26 +563,31 @@ EventImplPtr queue_impl::submit_command_to_graph(
   return EventImpl;
 }
 
-detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
+EventImplPtr queue_impl::submit_kernel_direct_impl(
     const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
     detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
    const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
 
   KernelData KData;
 
-  std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
-      HostKernel.takeOrCopyOwnership();
-
   KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
-  KData.setKernelFunc(HostKernelPtr->getPtr());
+  KData.setKernelFunc(HostKernel.getPtr());
   KData.setNDRDesc(NDRDesc);
 
-  auto SubmitKernelFunc =
-      [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr {
+  auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
+                              bool SchedulerBypass) -> EventImplPtr {
+    if (SchedulerBypass) {
+      return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
+                                            CallerNeedsEvent, nullptr, nullptr,
+                                            CodeLoc, IsTopCodeLoc);
+    }
     std::unique_ptr<detail::CG> CommandGroup;
     std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
     std::vector<std::shared_ptr<const void>> AuxiliaryResources;
 
+    std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
+        HostKernel.takeOrCopyOwnership();
+
     KData.extractArgsAndReqsFromLambda();
 
     CommandGroup.reset(new detail::CGExecKernel(
@@ -504,10 +597,8 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
         std::move(CGData), std::move(KData).getArgs(),
         *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
         std::move(AuxiliaryResources), detail::CGType::Kernel,
-        UR_KERNEL_CACHE_CONFIG_DEFAULT,
-        false, // KernelIsCooperative
-        false, // KernelUsesClusterLaunch
-        0,     // KernelWorkGroupMemorySize
+        KData.getKernelCacheConfig(), KData.isCooperative(),
+        KData.usesClusterLaunch(), KData.getKernelWorkGroupMemorySize(),
         CodeLoc));
     CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
 
@@ -567,11 +658,21 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
     }
   }
 
-  EventImplPtr EventImpl = SubmitCommandFunc(CGData);
+  bool SchedulerBypass =
+      (CGData.MEvents.size() > 0
+           ? detail::Scheduler::areEventsSafeForSchedulerBypass(
+                 CGData.MEvents, getContextImpl())
+           : true) &&
+      !hasCommandGraph();
 
-  // Sync with the last event for in order queue
-  if (isInOrder() && !EventImpl->isDiscarded()) {
-    LastEvent = EventImpl;
+  EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);
+
+  // Sync with the last event for in order queue. For scheduler-bypass flow,
+  // the ordering is done at the layers below the SYCL runtime,
+  // but for the scheduler-based flow, it needs to be done here, as the
+  // scheduler handles host task submissions.
+  if (isInOrder()) {
+    LastEvent = SchedulerBypass ? nullptr : EventImpl;
   }
 
   // Barrier and un-enqueued commands synchronization for out or order queue
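
As a user-level illustration of the in-order handling above (a hedged sketch, again assuming the khr free-function launch API from the unit test below; sizes and data are placeholders): a handler-less kernel submitted to an in-order queue behind a host task still observes the host task's effects. Per the test added in this commit, a host-task dependency forces the scheduler-based flow, while an unblocked submission may bypass it; the visible ordering is the same either way.

#include <sycl/sycl.hpp>
#include <cassert>

int main() {
  sycl::queue Q{sycl::property::queue::in_order{}};
  int *Data = sycl::malloc_shared<int>(1, Q);
  Data[0] = 0;

  // Host task: runs on the host and is handled by the scheduler.
  Q.submit([&](sycl::handler &CGH) { CGH.host_task([=] { Data[0] = 1; }); });

  // Handler-less kernel queued behind the host task. A host-task dependency
  // is not safe for the bypass, so this goes through the scheduler; in-order
  // semantics are preserved on both paths.
  sycl::khr::launch_grouped(Q, sycl::range<1>{1}, sycl::range<1>{1},
                            [=](sycl::nd_item<1>) { Data[0] += 1; });

  Q.wait();
  assert(Data[0] == 2);
  sycl::free(Data, Q);
  return 0;
}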

sycl/source/detail/queue_impl.hpp

Lines changed: 21 additions & 3 deletions
@@ -386,6 +386,24 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
     submit_impl(CGF, /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo);
   }
 
+  /// Submits a kernel using the scheduler bypass fast path
+  ///
+  /// \param KData is an object storing data related to the kernel.
+  /// \param DepEvents is a list of event dependencies.
+  /// \param EventNeeded should be true, if the resulting event is needed.
+  /// \param Kernel to be used, if kernel defined as a kernel object.
+  /// \param KernelBundleImpPtr to be used, if kernel bundle defined.
+  /// \param CodeLoc is the code location of the submit call.
+  /// \param IsTopCodeLoc is used to determine if the object is in a local
+  /// scope or in the top level scope.
+  ///
+  /// \return a SYCL event representing submitted command or nullptr.
+  EventImplPtr submit_kernel_scheduler_bypass(
+      KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
+      bool EventNeeded, detail::kernel_impl *KernelImplPtr,
+      detail::kernel_bundle_impl *KernelBundleImpPtr,
+      const detail::code_location &CodeLoc, bool IsTopCodeLoc);
+
   /// Performs a blocking wait for the completion of all enqueued tasks in the
   /// queue.
   ///
@@ -908,14 +926,14 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
   /// scope or in the top level scope.
   ///
   /// \return a SYCL event representing submitted command group or nullptr.
-  detail::EventImplPtr submit_kernel_direct_impl(
+  EventImplPtr submit_kernel_direct_impl(
       const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
       detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
       const detail::code_location &CodeLoc, bool IsTopCodeLoc);
 
   template <typename SubmitCommandFuncType>
-  detail::EventImplPtr submit_direct(bool CallerNeedsEvent,
-                                     SubmitCommandFuncType &SubmitCommandFunc);
+  EventImplPtr submit_direct(bool CallerNeedsEvent,
+                             SubmitCommandFuncType &SubmitCommandFunc);
 
   /// Helper function for submitting a memory operation with a handler.
   /// \param DepEvents is a vector of dependencies of the operation.

sycl/source/handler.cpp

Lines changed: 4 additions & 90 deletions
@@ -638,97 +638,11 @@ event handler::finalize() {
       // the graph is not changed, then this faster path is used to submit
      // kernel bypassing scheduler and avoiding CommandGroup, Command objects
      // creation.
-      std::vector<ur_event_handle_t> RawEvents;
-      // TODO checking the size of the events vector and avoiding the call is
-      // more efficient here at this point
-      if (impl->CGData.MEvents.size() > 0) {
-        RawEvents = detail::Command::getUrEvents(
-            impl->CGData.MEvents, impl->get_queue_or_null(), false);
-      }
-
-      bool DiscardEvent =
-          !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents();
-      if (DiscardEvent) {
-        // Kernel only uses assert if it's non interop one
-        bool KernelUsesAssert = !(MKernel && MKernel->isInterop()) &&
-                                impl->MKernelData.usesAssert();
-        DiscardEvent = !KernelUsesAssert;
-      }
-
-      std::shared_ptr<detail::event_impl> ResultEvent =
-          DiscardEvent
-              ? nullptr
-              : detail::event_impl::create_device_event(impl->get_queue());
 
-      auto EnqueueKernel = [&]() {
-#ifdef XPTI_ENABLE_INSTRUMENTATION
-        xpti_td *CmdTraceEvent = nullptr;
-        uint64_t InstanceID = 0;
-        auto StreamID = detail::getActiveXPTIStreamID();
-        // Only enable instrumentation if there are subscribes to the SYCL
-        // stream
-        const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
-        if (xptiEnabled) {
-          std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
-              StreamID, MKernel.get(), MCodeLoc, impl->MIsTopCodeLoc,
-              *impl->MKernelData.getDeviceKernelInfoPtr(),
-              impl->get_queue_or_null(), impl->MKernelData.getNDRDesc(),
-              KernelBundleImpPtr, impl->MKernelData.getArgs());
-          detail::emitInstrumentationGeneral(StreamID, InstanceID,
-                                             CmdTraceEvent,
-                                             xpti::trace_task_begin, nullptr);
-        }
-#endif
-        const detail::RTDeviceBinaryImage *BinImage = nullptr;
-        if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
-          BinImage = detail::retrieveKernelBinary(impl->get_queue(),
-                                                  impl->getKernelName());
-          assert(BinImage && "Failed to obtain a binary image.");
-        }
-        enqueueImpKernel(impl->get_queue(), impl->MKernelData.getNDRDesc(),
-                         impl->MKernelData.getArgs(), KernelBundleImpPtr,
-                         MKernel.get(),
-                         *impl->MKernelData.getDeviceKernelInfoPtr(), RawEvents,
-                         ResultEvent.get(), nullptr,
-                         impl->MKernelData.getKernelCacheConfig(),
-                         impl->MKernelData.isCooperative(),
-                         impl->MKernelData.usesClusterLaunch(),
-                         impl->MKernelData.getKernelWorkGroupMemorySize(),
-                         BinImage, impl->MKernelData.getKernelFuncPtr());
-#ifdef XPTI_ENABLE_INSTRUMENTATION
-        if (xptiEnabled) {
-          // Emit signal only when event is created
-          if (!DiscardEvent) {
-            detail::emitInstrumentationGeneral(
-                StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
-                static_cast<const void *>(ResultEvent->getHandle()));
-          }
-          detail::emitInstrumentationGeneral(StreamID, InstanceID,
-                                             CmdTraceEvent,
-                                             xpti::trace_task_end, nullptr);
-        }
-#endif
-      };
-
-      if (DiscardEvent) {
-        EnqueueKernel();
-      } else {
-        detail::queue_impl &Queue = impl->get_queue();
-        ResultEvent->setWorkerQueue(Queue.weak_from_this());
-        ResultEvent->setStateIncomplete();
-        ResultEvent->setSubmissionTime();
-
-        EnqueueKernel();
-        ResultEvent->setEnqueued();
-        // connect returned event with dependent events
-        if (!Queue.isInOrder()) {
-          // MEvents is not used anymore, so can move.
-          ResultEvent->getPreparedDepsEvents() =
-              std::move(impl->CGData.MEvents);
-          // ResultEvent is local for current thread, no need to lock.
-          ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
-        }
-      }
+      detail::EventImplPtr ResultEvent =
+          impl->get_queue().submit_kernel_scheduler_bypass(
+              impl->MKernelData, impl->CGData.MEvents, impl->MEventNeeded,
+              MKernel.get(), KernelBundleImpPtr, MCodeLoc, impl->MIsTopCodeLoc);
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
       return ResultEvent;
 #else

sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp

Lines changed: 34 additions & 2 deletions
@@ -235,14 +235,46 @@ TEST_F(FreeFunctionCommandsEventsTests,
 
   TestMoveFunctor::MoveCtorCalls = 0;
   TestMoveFunctor MoveOnly;
+  std::mutex CvMutex;
+  std::condition_variable Cv;
+  bool ready = false;
+
+  // This kernel submission uses scheduler-bypass path, so the HostKernel
+  // shouldn't be constructed.
+
+  sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32},
+                            std::move(MoveOnly));
+
+  ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 0);
+  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+
+  // Another kernel submission is queued behind a host task,
+  // to force the scheduler-based submission. In this case, the HostKernel
+  // should be constructed.
+
+  Queue.submit([&](sycl::handler &CGH) {
+    CGH.host_task([&] {
+      std::unique_lock<std::mutex> lk(CvMutex);
+      Cv.wait(lk, [&ready] { return ready; });
+    });
+  });
+
   sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32},
                             std::move(MoveOnly));
+
+  {
+    std::unique_lock<std::mutex> lk(CvMutex);
+    ready = true;
+  }
+  Cv.notify_one();
+
+  Queue.wait();
+
   // Move ctor for TestMoveFunctor is called during move construction of
   // HostKernel. Copy ctor is called by InstantiateKernelOnHost, can't delete
   // it.
   ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 1);
-
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{2});
 }
 #endif
 
