@@ -420,6 +420,94 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
   return EventImpl;
 }
 
+EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
+    KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
+    bool EventNeeded, detail::kernel_impl *KernelImplPtr,
+    detail::kernel_bundle_impl *KernelBundleImpPtr,
+    const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
+  std::vector<ur_event_handle_t> RawEvents;
+
+  // TODO: checking the size of the events vector and avoiding the call is
+  // more efficient here at this point
+  if (DepEvents.size() > 0) {
+    RawEvents = detail::Command::getUrEvents(DepEvents, this, false);
+  }
+
+  bool DiscardEvent = !EventNeeded && supportsDiscardingPiEvents();
+  if (DiscardEvent) {
+    // The kernel only uses assert if it is a non-interop one
+    bool KernelUsesAssert =
+        !(KernelImplPtr && KernelImplPtr->isInterop()) && KData.usesAssert();
+    DiscardEvent = !KernelUsesAssert;
+  }
+
+  std::shared_ptr<detail::event_impl> ResultEvent =
+      DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this);
+
+  auto EnqueueKernel = [&]() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    xpti_td *CmdTraceEvent = nullptr;
+    uint64_t InstanceID = 0;
+    auto StreamID = detail::getActiveXPTIStreamID();
+    // Only enable instrumentation if there are subscribers to the SYCL
+    // stream
+    const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
+    if (xptiEnabled) {
+      std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
+          StreamID, KernelImplPtr, CodeLoc, IsTopCodeLoc,
+          *KData.getDeviceKernelInfoPtr(), this, KData.getNDRDesc(),
+          KernelBundleImpPtr, KData.getArgs());
+      detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
+                                         xpti::trace_task_begin, nullptr);
+    }
+#endif
+    const detail::RTDeviceBinaryImage *BinImage = nullptr;
+    if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
+      BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName());
+      assert(BinImage && "Failed to obtain a binary image.");
+    }
+    enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(),
+                     KernelBundleImpPtr, KernelImplPtr,
+                     *KData.getDeviceKernelInfoPtr(), RawEvents,
+                     ResultEvent.get(), nullptr, KData.getKernelCacheConfig(),
+                     KData.isCooperative(), KData.usesClusterLaunch(),
+                     KData.getKernelWorkGroupMemorySize(), BinImage,
+                     KData.getKernelFuncPtr());
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    if (xptiEnabled) {
+      // Emit the signal only when an event is created
+      if (!DiscardEvent) {
+        detail::emitInstrumentationGeneral(
+            StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
+            static_cast<const void *>(ResultEvent->getHandle()));
+      }
+      detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
+                                         xpti::trace_task_end, nullptr);
+    }
+#endif
+  };
+
+  if (DiscardEvent) {
+    EnqueueKernel();
+  } else {
+    ResultEvent->setWorkerQueue(weak_from_this());
+    ResultEvent->setStateIncomplete();
+    ResultEvent->setSubmissionTime();
+
+    EnqueueKernel();
+    ResultEvent->setEnqueued();
+    // Connect the returned event with its dependent events
+    if (!isInOrder()) {
+      // DepEvents is not used anymore, so it can be moved.
+      ResultEvent->getPreparedDepsEvents() = std::move(DepEvents);
+      // ResultEvent is local to the current thread, no need to lock.
+      ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
+    }
+  }
+
+  return ResultEvent;
+}
+
 EventImplPtr queue_impl::submit_command_to_graph(
     ext::oneapi::experimental::detail::graph_impl &GraphImpl,
     std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
@@ -475,26 +563,31 @@ EventImplPtr queue_impl::submit_command_to_graph(
   return EventImpl;
 }
 
-detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
+EventImplPtr queue_impl::submit_kernel_direct_impl(
     const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
     detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
     const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
 
   KernelData KData;
 
-  std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
-      HostKernel.takeOrCopyOwnership();
-
   KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
-  KData.setKernelFunc(HostKernelPtr->getPtr());
+  KData.setKernelFunc(HostKernel.getPtr());
   KData.setNDRDesc(NDRDesc);
 
-  auto SubmitKernelFunc =
-      [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr {
+  auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
+                              bool SchedulerBypass) -> EventImplPtr {
+    if (SchedulerBypass) {
+      return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
+                                            CallerNeedsEvent, nullptr, nullptr,
+                                            CodeLoc, IsTopCodeLoc);
+    }
     std::unique_ptr<detail::CG> CommandGroup;
     std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
     std::vector<std::shared_ptr<const void>> AuxiliaryResources;
 
+    std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
+        HostKernel.takeOrCopyOwnership();
+
     KData.extractArgsAndReqsFromLambda();
 
     CommandGroup.reset(new detail::CGExecKernel(
@@ -504,10 +597,8 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
         std::move(CGData), std::move(KData).getArgs(),
         *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
         std::move(AuxiliaryResources), detail::CGType::Kernel,
-        UR_KERNEL_CACHE_CONFIG_DEFAULT,
-        false, // KernelIsCooperative
-        false, // KernelUsesClusterLaunch
-        0,     // KernelWorkGroupMemorySize
+        KData.getKernelCacheConfig(), KData.isCooperative(),
+        KData.usesClusterLaunch(), KData.getKernelWorkGroupMemorySize(),
         CodeLoc));
     CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
@@ -567,11 +658,21 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
     }
   }
 
-  EventImplPtr EventImpl = SubmitCommandFunc(CGData);
+  bool SchedulerBypass =
+      (CGData.MEvents.size() > 0
+           ? detail::Scheduler::areEventsSafeForSchedulerBypass(
+                 CGData.MEvents, getContextImpl())
+           : true) &&
+      !hasCommandGraph();
 
-  // Sync with the last event for in order queue
-  if (isInOrder() && !EventImpl->isDiscarded()) {
-    LastEvent = EventImpl;
+  EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);
+
+  // Sync with the last event for an in-order queue. For the scheduler-bypass
+  // flow, ordering is handled by the layers below the SYCL runtime; for the
+  // scheduler-based flow it has to be done here, since the scheduler also
+  // handles host task submissions.
+  if (isInOrder()) {
+    LastEvent = SchedulerBypass ? nullptr : EventImpl;
   }
 
   // Barrier and un-enqueued commands synchronization for out-of-order queue
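For context, a minimal sketch (not part of the patch) of the kind of user-level submission the new scheduler-bypass path targets. Whether a particular launch actually reaches submit_kernel_direct_impl depends on the runtime's fast-path eligibility checks, so treat this only as an illustration: a USM kernel launch with no accessor requirements, no dependent events that need scheduler handling, and no command-graph recording leaves CGData.MEvents empty and hasCommandGraph() false, so SchedulerBypass evaluates to true and the launch can go through submit_kernel_scheduler_bypass; a host-task dependency or graph recording would force the scheduler-based flow instead. The sketch assumes a default device with shared USM support; the queue property and sizes are arbitrary.

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q{sycl::property::queue::in_order{}};
  int *Data = sycl::malloc_shared<int>(64, Q);

  // No accessors, no explicit dependencies, no graph recording: this
  // submission is a candidate for the scheduler-bypass path.
  Q.parallel_for(sycl::range<1>{64},
                 [=](sycl::id<1> I) { Data[I] = static_cast<int>(I[0]); });

  Q.wait();
  sycl::free(Data, Q);
  return 0;
}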