intel · slawekptak · Sep 29, 2025 · Sep 29, 2025 · Sep 30, 2025 · Sep 30, 2025
@@ -259,14 +259,15 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
           typename KernelType, typename... ReductionsT>
 void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
                ReductionsT &&...Reductions) {
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  // TODO The handler-less path does not support reductions yet.
-  if constexpr (sizeof...(ReductionsT) == 0) {
+  // TODO The handler-less path does not support reductions and kernel function
+  // properties yet.
+  if constexpr (sizeof...(ReductionsT) == 0 &&
+                !(ext::oneapi::experimental::detail::
+                      HasKernelPropertiesGetMethod<
+                          const KernelType &>::value)) {
     detail::submit_kernel_direct<KernelName>(std::move(Q), empty_properties_t{},
                                              Range, KernelObj);
-  } else
-#endif
-  {
+  } else {
     submit(std::move(Q), [&](handler &CGH) {
       nd_launch<KernelName>(CGH, Range, KernelObj,
                             std::forward<ReductionsT>(Reductions)...);
@@ -292,23 +293,10 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
           typename Properties, typename KernelType, typename... ReductionsT>
 void nd_launch(queue Q, launch_config<nd_range<Dimensions>, Properties> Config,
                const KernelType &KernelObj, ReductionsT &&...Reductions) {
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  // TODO The handler-less path does not support reductions yet.
-  if constexpr (sizeof...(ReductionsT) == 0) {
-    ext::oneapi::experimental::detail::LaunchConfigAccess<nd_range<Dimensions>,
-                                                          Properties>
-        ConfigAccess(Config);
-    detail::submit_kernel_direct<KernelName>(
-        std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(),
-        KernelObj);
-  } else
-#endif
-  {
-    submit(std::move(Q), [&](handler &CGH) {
-      nd_launch<KernelName>(CGH, Config, KernelObj,
-                            std::forward<ReductionsT>(Reductions)...);
-    });
-  }
+  submit(std::move(Q), [&](handler &CGH) {
+    nd_launch<KernelName>(CGH, Config, KernelObj,
+                          std::forward<ReductionsT>(Reductions)...);
+  });
 }
 
 template <int Dimensions, typename... ArgsT>

@@ -157,45 +157,54 @@ template <typename KernelType, typename = typename std::enable_if_t<
 void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k,
                     const sycl::detail::code_location &codeLoc =
                         sycl::detail::code_location::current()) {
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  detail::submit_kernel_direct(
-      q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size),
-      std::forward<KernelType>(k));
-#else
-  submit(
-      q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
-      codeLoc);
-#endif
+  // TODO The handler-less path does not support kernel function properties yet.
+  if constexpr (!(ext::oneapi::experimental::detail::
+                      HasKernelPropertiesGetMethod<
+                          const KernelType &>::value)) {
+    detail::submit_kernel_direct(
+        q, ext::oneapi::experimental::empty_properties_t{},
+        nd_range<1>(r, size), std::forward<KernelType>(k));
+  } else {
+    submit(
+        q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
+        codeLoc);
+  }
 }
 template <typename KernelType, typename = typename std::enable_if_t<
                                    enable_kernel_function_overload<KernelType>>>
 void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k,
                     const sycl::detail::code_location &codeLoc =
                         sycl::detail::code_location::current()) {
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  detail::submit_kernel_direct(
-      q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size),
-      std::forward<KernelType>(k));
-#else
-  submit(
-      q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
-      codeLoc);
-#endif
+  // TODO The handler-less path does not support kernel function properties yet.
+  if constexpr (!(ext::oneapi::experimental::detail::
+                      HasKernelPropertiesGetMethod<
+                          const KernelType &>::value)) {
+    detail::submit_kernel_direct(
+        q, ext::oneapi::experimental::empty_properties_t{},
+        nd_range<2>(r, size), std::forward<KernelType>(k));
+  } else {
+    submit(
+        q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
+        codeLoc);
+  }
 }
 template <typename KernelType, typename = typename std::enable_if_t<
                                    enable_kernel_function_overload<KernelType>>>
 void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k,
                     const sycl::detail::code_location &codeLoc =
                         sycl::detail::code_location::current()) {
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  detail::submit_kernel_direct(
-      q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size),
-      std::forward<KernelType>(k));
-#else
-  submit(
-      q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
-      codeLoc);
-#endif
+  // TODO The handler-less path does not support kernel function properties yet.
+  if constexpr (!(ext::oneapi::experimental::detail::
+                      HasKernelPropertiesGetMethod<
+                          const KernelType &>::value)) {
+    detail::submit_kernel_direct(
+        q, ext::oneapi::experimental::empty_properties_t{},
+        nd_range<3>(r, size), std::forward<KernelType>(k));
+  } else {
+    submit(
+        q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
+        codeLoc);
+  }
 }
 
 template <typename... Args>

@@ -3275,15 +3275,19 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
   parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
     constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-    // TODO The handler-less path does not support reductions yet.
-    if constexpr (sizeof...(RestT) == 1) {
+
+    using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;
+
+    // TODO The handler-less path does not support reductions and kernel
+    // function properties yet.
+    if constexpr (sizeof...(RestT) == 1 &&
+                  !(ext::oneapi::experimental::detail::
+                        HasKernelPropertiesGetMethod<
+                            const KernelType &>::value)) {
       return detail::submit_kernel_direct<KernelName, true>(
           *this, ext::oneapi::experimental::empty_properties_t{}, Range,
           Rest...);
-    } else
-#endif
-    {
+    } else {
       return submit(
           [&](handler &CGH) {
             CGH.template parallel_for<KernelName>(Range, Rest...);

@@ -420,6 +420,94 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
   return EventImpl;
 }
 
+EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
+    KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
+    bool EventNeeded, detail::kernel_impl *KernelImplPtr,
+    detail::kernel_bundle_impl *KernelBundleImpPtr,
+    const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
+  std::vector<ur_event_handle_t> RawEvents;
+
+  // TODO checking the size of the events vector and avoiding the call is
+  // more efficient here at this point
+  if (DepEvents.size() > 0) {
+    RawEvents = detail::Command::getUrEvents(DepEvents, this, false);
+  }
+
+  bool DiscardEvent = !EventNeeded && supportsDiscardingPiEvents();
+  if (DiscardEvent) {
+    // Kernel only uses assert if it's non interop one
+    bool KernelUsesAssert =
+        !(KernelImplPtr && KernelImplPtr->isInterop()) && KData.usesAssert();
+    DiscardEvent = !KernelUsesAssert;
+  }
+
+  std::shared_ptr<detail::event_impl> ResultEvent =
+      DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this);
+
+  auto EnqueueKernel = [&]() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    xpti_td *CmdTraceEvent = nullptr;
+    uint64_t InstanceID = 0;
+    auto StreamID = detail::getActiveXPTIStreamID();
+    // Only enable instrumentation if there are subscribes to the SYCL
+    // stream
+    const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
+    if (xptiEnabled) {
+      std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
+          StreamID, KernelImplPtr, CodeLoc, IsTopCodeLoc,
+          *KData.getDeviceKernelInfoPtr(), this, KData.getNDRDesc(),
+          KernelBundleImpPtr, KData.getArgs());
+      detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
+                                         xpti::trace_task_begin, nullptr);
+    }
+#endif
+    const detail::RTDeviceBinaryImage *BinImage = nullptr;
+    if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
+      BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName());
+      assert(BinImage && "Failed to obtain a binary image.");
+    }
+    enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(),
+                     KernelBundleImpPtr, KernelImplPtr,
+                     *KData.getDeviceKernelInfoPtr(), RawEvents,
+                     ResultEvent.get(), nullptr, KData.getKernelCacheConfig(),
+                     KData.isCooperative(), KData.usesClusterLaunch(),
+                     KData.getKernelWorkGroupMemorySize(), BinImage,
+                     KData.getKernelFuncPtr());
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    if (xptiEnabled) {
+      // Emit signal only when event is created
+      if (!DiscardEvent) {
+        detail::emitInstrumentationGeneral(
+            StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
+            static_cast<const void *>(ResultEvent->getHandle()));
+      }
+      detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
+                                         xpti::trace_task_end, nullptr);
+    }
+#endif
+  };
+
+  if (DiscardEvent) {
+    EnqueueKernel();
+  } else {
+    ResultEvent->setWorkerQueue(weak_from_this());
+    ResultEvent->setStateIncomplete();
+    ResultEvent->setSubmissionTime();
+
+    EnqueueKernel();
+    ResultEvent->setEnqueued();
+    // connect returned event with dependent events
+    if (!isInOrder()) {
+      // DepEvents is not used anymore, so can move.
+      ResultEvent->getPreparedDepsEvents() = std::move(DepEvents);
+      // ResultEvent is local for current thread, no need to lock.
+      ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
+    }
+  }
+
+  return ResultEvent;
+}
+
 EventImplPtr queue_impl::submit_command_to_graph(
     ext::oneapi::experimental::detail::graph_impl &GraphImpl,
     std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
@@ -475,26 +563,31 @@ EventImplPtr queue_impl::submit_command_to_graph(
   return EventImpl;
 }
 
-detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
+EventImplPtr queue_impl::submit_kernel_direct_impl(
     const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
     detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
     const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
 
   KernelData KData;
 
-  std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
-      HostKernel.takeOrCopyOwnership();
-
   KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
-  KData.setKernelFunc(HostKernelPtr->getPtr());
+  KData.setKernelFunc(HostKernel.getPtr());
   KData.setNDRDesc(NDRDesc);
 
-  auto SubmitKernelFunc =
-      [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr {
+  auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
+                              bool SchedulerBypass) -> EventImplPtr {
+    if (SchedulerBypass) {
+      return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
+                                            CallerNeedsEvent, nullptr, nullptr,
+                                            CodeLoc, IsTopCodeLoc);
+    }
     std::unique_ptr<detail::CG> CommandGroup;
     std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
     std::vector<std::shared_ptr<const void>> AuxiliaryResources;
 
+    std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
+        HostKernel.takeOrCopyOwnership();
+
     KData.extractArgsAndReqsFromLambda();
 
     CommandGroup.reset(new detail::CGExecKernel(
@@ -504,10 +597,8 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
         std::move(CGData), std::move(KData).getArgs(),
         *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
         std::move(AuxiliaryResources), detail::CGType::Kernel,
-        UR_KERNEL_CACHE_CONFIG_DEFAULT,
-        false, // KernelIsCooperative
-        false, // KernelUsesClusterLaunch
-        0,     // KernelWorkGroupMemorySize
+        KData.getKernelCacheConfig(), KData.isCooperative(),
+        KData.usesClusterLaunch(), KData.getKernelWorkGroupMemorySize(),
         CodeLoc));
     CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
 
@@ -530,9 +621,6 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
   detail::CG::StorageInitHelper CGData;
   std::unique_lock<std::mutex> Lock(MMutex);
 
-  // Graphs are not supported yet for the no-handler path
-  assert(!hasCommandGraph());
-
   // Set the No Last Event Mode to false, since the no-handler path
   // does not support it yet.
   MNoLastEventMode.store(false, std::memory_order_relaxed);
@@ -567,11 +655,21 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
     }
   }
 
-  EventImplPtr EventImpl = SubmitCommandFunc(CGData);
+  bool SchedulerBypass =
+      (CGData.MEvents.size() > 0
+           ? detail::Scheduler::areEventsSafeForSchedulerBypass(
+                 CGData.MEvents, getContextImpl())
+           : true) &&
+      !hasCommandGraph();
 
-  // Sync with the last event for in order queue
-  if (isInOrder() && !EventImpl->isDiscarded()) {
-    LastEvent = EventImpl;
+  EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);
+
+  // Sync with the last event for in order queue. For scheduler-bypass flow,
+  // the ordering is done at the layers below the SYCL runtime,
+  // but for the scheduler-based flow, it needs to be done here, as the
+  // scheduler handles host task submissions.
+  if (isInOrder()) {
+    LastEvent = SchedulerBypass ? nullptr : EventImpl;
   }
 
   // Barrier and un-enqueued commands synchronization for out or order queue

@@ -386,6 +386,24 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
     submit_impl(CGF, /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo);
   }
 
+  /// Submits a kernel using the scheduler bypass fast path
+  ///
+  /// \param KData is an object storing data related to the kernel.
+  /// \param DepEvents is a list of event dependencies.
+  /// \param EventNeeded should be true, if the resulting event is needed.
+  /// \param Kernel to be used, if kernel defined as a kernel object.
+  /// \param KernelBundleImpPtr to be used, if kernel bundle defined.
+  /// \param CodeLoc is the code location of the submit call.
+  /// \param IsTopCodeLoc is used to determine if the object is in a local
+  ///        scope or in the top level scope.
+  ///
+  /// \return a SYCL event representing submitted command or nullptr.
+  EventImplPtr submit_kernel_scheduler_bypass(
+      KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
+      bool EventNeeded, detail::kernel_impl *KernelImplPtr,
+      detail::kernel_bundle_impl *KernelBundleImpPtr,
+      const detail::code_location &CodeLoc, bool IsTopCodeLoc);
+
   /// Performs a blocking wait for the completion of all enqueued tasks in the
   /// queue.
   ///
@@ -908,14 +926,14 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
   ///        scope or in the top level scope.
   ///
   /// \return a SYCL event representing submitted command group or nullptr.
-  detail::EventImplPtr submit_kernel_direct_impl(
+  EventImplPtr submit_kernel_direct_impl(
       const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
       detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
       const detail::code_location &CodeLoc, bool IsTopCodeLoc);
 
   template <typename SubmitCommandFuncType>
-  detail::EventImplPtr submit_direct(bool CallerNeedsEvent,
-                                     SubmitCommandFuncType &SubmitCommandFunc);
+  EventImplPtr submit_direct(bool CallerNeedsEvent,
+                             SubmitCommandFuncType &SubmitCommandFunc);
 
   /// Helper function for submitting a memory operation with a handler.
   /// \param DepEvents is a vector of dependencies of the operation.