diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
index 3dc28532b2372..718c5850b3f9a 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
@@ -259,14 +259,15 @@ template <int Dimensions, typename KernelType, typename... ReductionsT>
 void nd_launch(queue Q, nd_range<Dimensions> Range,
                const KernelType &KernelObj, ReductionsT &&...Reductions) {
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  // TODO The handler-less path does not support reductions yet.
-  if constexpr (sizeof...(ReductionsT) == 0) {
+  // TODO The handler-less path does not support reductions and kernel
+  // function properties yet.
+  if constexpr (sizeof...(ReductionsT) == 0 &&
+                !(ext::oneapi::experimental::detail::
+                      HasKernelPropertiesGetMethod<
+                          const KernelType &>::value)) {
     detail::submit_kernel_direct(std::move(Q), empty_properties_t{}, Range,
                                  KernelObj);
-  } else
-#endif
-  {
+  } else {
     submit(std::move(Q), [&](handler &CGH) {
       nd_launch(CGH, Range, KernelObj,
                 std::forward<ReductionsT>(Reductions)...);
@@ -292,23 +293,10 @@ template <int Dimensions, typename Properties, typename KernelType,
           typename... ReductionsT>
 void nd_launch(queue Q,
                launch_config<nd_range<Dimensions>, Properties> Config,
                const KernelType &KernelObj, ReductionsT &&...Reductions) {
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  // TODO The handler-less path does not support reductions yet.
-  if constexpr (sizeof...(ReductionsT) == 0) {
-    ext::oneapi::experimental::detail::LaunchConfigAccess<nd_range<Dimensions>,
-                                                          Properties>
-        ConfigAccess(Config);
-    detail::submit_kernel_direct(
-        std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(),
-        KernelObj);
-  } else
-#endif
-  {
-    submit(std::move(Q), [&](handler &CGH) {
-      nd_launch(CGH, Config, KernelObj,
-                std::forward<ReductionsT>(Reductions)...);
-    });
-  }
+  submit(std::move(Q), [&](handler &CGH) {
+    nd_launch(CGH, Config, KernelObj,
+              std::forward<ReductionsT>(Reductions)...);
+  });
 }
 
 template <int Dimensions, typename... ArgsT>
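The HasKernelPropertiesGetMethod gate above targets kernel functors that report compile-time properties through the get(properties_tag) hook of sycl_ext_oneapi_kernel_properties; such kernels keep taking the handler-based path. A minimal sketch of the two kinds of kernels, assuming that extension's interface (the work_group_size property and the kernel bodies are arbitrary choices):

    #include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
    #include <sycl/sycl.hpp>
    namespace syclex = sycl::ext::oneapi::experimental;

    struct KernelWithProps {
      // The presence of this member is what the trait detects.
      auto get(syclex::properties_tag) const {
        return syclex::properties{syclex::work_group_size<32>};
      }
      void operator()(sycl::nd_item<1>) const {}
    };

    struct PlainKernel {
      void operator()(sycl::nd_item<1>) const {}
    };

    int main() {
      sycl::queue q;
      // Carries kernel properties: routed through the handler-based branch.
      syclex::nd_launch(q, sycl::nd_range<1>{32, 32}, KernelWithProps{});
      // No properties and no reductions: eligible for the handler-less branch.
      syclex::nd_launch(q, sycl::nd_range<1>{32, 32}, PlainKernel{});
      q.wait();
    }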
diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp
index 31464ba588dfc..e1afe00672f21 100644
--- a/sycl/include/sycl/khr/free_function_commands.hpp
+++ b/sycl/include/sycl/khr/free_function_commands.hpp
@@ -157,45 +157,54 @@ template <typename KernelType,
           typename = std::enable_if_t<
               std::is_invocable_v<KernelType, nd_item<1>>>>
 void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k,
                     const sycl::detail::code_location &codeLoc =
                         sycl::detail::code_location::current()) {
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  detail::submit_kernel_direct(
-      q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size),
-      std::forward<KernelType>(k));
-#else
-  submit(
-      q, [&](handler &h) { launch_grouped(h, r, size, k); },
-      codeLoc);
-#endif
+  // TODO The handler-less path does not support kernel function properties
+  // yet.
+  if constexpr (!(ext::oneapi::experimental::detail::
+                      HasKernelPropertiesGetMethod<
+                          const KernelType &>::value)) {
+    detail::submit_kernel_direct(
+        q, ext::oneapi::experimental::empty_properties_t{},
+        nd_range<1>(r, size), std::forward<KernelType>(k));
+  } else {
+    submit(
+        q, [&](handler &h) { launch_grouped(h, r, size, k); },
+        codeLoc);
+  }
 }
 
 template <typename KernelType,
           typename = std::enable_if_t<
              std::is_invocable_v<KernelType, nd_item<2>>>>
 void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k,
                     const sycl::detail::code_location &codeLoc =
                         sycl::detail::code_location::current()) {
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  detail::submit_kernel_direct(
-      q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size),
-      std::forward<KernelType>(k));
-#else
-  submit(
-      q, [&](handler &h) { launch_grouped(h, r, size, k); },
-      codeLoc);
-#endif
+  // TODO The handler-less path does not support kernel function properties
+  // yet.
+  if constexpr (!(ext::oneapi::experimental::detail::
+                      HasKernelPropertiesGetMethod<
+                          const KernelType &>::value)) {
+    detail::submit_kernel_direct(
+        q, ext::oneapi::experimental::empty_properties_t{},
+        nd_range<2>(r, size), std::forward<KernelType>(k));
+  } else {
+    submit(
+        q, [&](handler &h) { launch_grouped(h, r, size, k); },
+        codeLoc);
+  }
 }
 
 template <typename KernelType,
           typename = std::enable_if_t<
              std::is_invocable_v<KernelType, nd_item<3>>>>
 void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k,
                     const sycl::detail::code_location &codeLoc =
                         sycl::detail::code_location::current()) {
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  detail::submit_kernel_direct(
-      q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size),
-      std::forward<KernelType>(k));
-#else
-  submit(
-      q, [&](handler &h) { launch_grouped(h, r, size, k); },
-      codeLoc);
-#endif
+  // TODO The handler-less path does not support kernel function properties
+  // yet.
+  if constexpr (!(ext::oneapi::experimental::detail::
+                      HasKernelPropertiesGetMethod<
+                          const KernelType &>::value)) {
+    detail::submit_kernel_direct(
+        q, ext::oneapi::experimental::empty_properties_t{},
+        nd_range<3>(r, size), std::forward<KernelType>(k));
+  } else {
+    submit(
+        q, [&](handler &h) { launch_grouped(h, r, size, k); },
+        codeLoc);
+  }
 }
 
 template <typename... Args>
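A usage sketch for the launch_grouped shortcut touched above: the global range r is divided into work-groups of size, matching the nd_range<1>(r, size) mapping in the implementation. This assumes <sycl/sycl.hpp> pulls in the khr header (include sycl/khr/free_function_commands.hpp explicitly otherwise) and the nd_item kernel signature implied by that mapping:

    #include <sycl/sycl.hpp>

    int main() {
      sycl::queue q;
      int *data = sycl::malloc_shared<int>(64, q);
      // 64 work-items in groups of 32; a plain lambda like this is eligible
      // for the handler-less path introduced by this patch.
      sycl::khr::launch_grouped(
          q, sycl::range<1>{64}, sycl::range<1>{32},
          [=](sycl::nd_item<1> it) { data[it.get_global_id(0)] = 1; });
      q.wait();
      sycl::free(data, q);
    }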
diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp
index 69911bec229fc..6e3f6b00f67ef 100644
--- a/sycl/include/sycl/queue.hpp
+++ b/sycl/include/sycl/queue.hpp
@@ -3275,15 +3275,19 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
   parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
     constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
     detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-    // TODO The handler-less path does not support reductions yet.
-    if constexpr (sizeof...(RestT) == 1) {
+
+    using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;
+
+    // TODO The handler-less path does not support reductions and kernel
+    // function properties yet.
+    if constexpr (sizeof...(RestT) == 1 &&
+                  !(ext::oneapi::experimental::detail::
+                        HasKernelPropertiesGetMethod<
+                            const KernelType &>::value)) {
       return detail::submit_kernel_direct(
           *this, ext::oneapi::experimental::empty_properties_t{}, Range,
           Rest...);
-    } else
-#endif
-    {
+    } else {
       return submit(
           [&](handler &CGH) {
             CGH.template parallel_for<KernelName>(Range, Rest...);
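The dispatch above hinges on two compile-time steps: recovering the kernel type as the first element of the RestT... pack, then probing it for a properties get() method. A self-contained re-creation of that technique (has_properties_get approximates HasKernelPropertiesGetMethod, whose real definition lives elsewhere in the tree):

    #include <cstdio>
    #include <tuple>
    #include <type_traits>
    #include <utility>

    struct properties_tag {};

    // Detection idiom: true when T has a member get(properties_tag).
    template <typename T, typename = void>
    struct has_properties_get : std::false_type {};
    template <typename T>
    struct has_properties_get<
        T, std::void_t<decltype(std::declval<T>().get(properties_tag{}))>>
        : std::true_type {};

    template <typename... RestT> void dispatch(RestT &&...) {
      using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;
      if constexpr (sizeof...(RestT) == 1 &&
                    !has_properties_get<const KernelType &>::value)
        std::puts("handler-less direct submit");
      else
        std::puts("handler-based submit");
    }

    struct WithProps {
      int get(properties_tag) const { return 0; }
    };

    int main() {
      dispatch([] {});       // direct path
      dispatch(WithProps{}); // handler path: properties detected
      dispatch([] {}, 1, 2); // handler path: trailing args model reductions
    }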
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 8243896cf76d1..b8c2a2866a625 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -420,6 +420,94 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
   return EventImpl;
 }
 
+EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
+    KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
+    bool EventNeeded, detail::kernel_impl *KernelImplPtr,
+    detail::kernel_bundle_impl *KernelBundleImpPtr,
+    const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
+  std::vector<ur_event_handle_t> RawEvents;
+
+  // TODO Checking the size of the events vector and avoiding the call when it
+  // is empty is more efficient here at this point
+  if (DepEvents.size() > 0) {
+    RawEvents = detail::Command::getUrEvents(DepEvents, this, false);
+  }
+
+  bool DiscardEvent = !EventNeeded && supportsDiscardingPiEvents();
+  if (DiscardEvent) {
+    // The kernel only uses assert if it is a non-interop one
+    bool KernelUsesAssert =
+        !(KernelImplPtr && KernelImplPtr->isInterop()) && KData.usesAssert();
+    DiscardEvent = !KernelUsesAssert;
+  }
+
+  std::shared_ptr<detail::event_impl> ResultEvent =
+      DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this);
+
+  auto EnqueueKernel = [&]() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    xpti_td *CmdTraceEvent = nullptr;
+    uint64_t InstanceID = 0;
+    auto StreamID = detail::getActiveXPTIStreamID();
+    // Only enable instrumentation if there are subscribers to the SYCL
+    // stream
+    const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
+    if (xptiEnabled) {
+      std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
+          StreamID, KernelImplPtr, CodeLoc, IsTopCodeLoc,
+          *KData.getDeviceKernelInfoPtr(), this, KData.getNDRDesc(),
+          KernelBundleImpPtr, KData.getArgs());
+      detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
+                                         xpti::trace_task_begin, nullptr);
+    }
+#endif
+    const detail::RTDeviceBinaryImage *BinImage = nullptr;
+    if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
+      BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName());
+      assert(BinImage && "Failed to obtain a binary image.");
+    }
+    enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(),
+                     KernelBundleImpPtr, KernelImplPtr,
+                     *KData.getDeviceKernelInfoPtr(), RawEvents,
+                     ResultEvent.get(), nullptr, KData.getKernelCacheConfig(),
+                     KData.isCooperative(), KData.usesClusterLaunch(),
+                     KData.getKernelWorkGroupMemorySize(), BinImage,
+                     KData.getKernelFuncPtr());
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    if (xptiEnabled) {
+      // Emit the signal only when the event is created
+      if (!DiscardEvent) {
+        detail::emitInstrumentationGeneral(
+            StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
+            static_cast<const void *>(ResultEvent->getHandle()));
+      }
+      detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
+                                         xpti::trace_task_end, nullptr);
+    }
+#endif
+  };
+
+  if (DiscardEvent) {
+    EnqueueKernel();
+  } else {
+    ResultEvent->setWorkerQueue(weak_from_this());
+    ResultEvent->setStateIncomplete();
+    ResultEvent->setSubmissionTime();
+
+    EnqueueKernel();
+    ResultEvent->setEnqueued();
+    // Connect the returned event with its dependent events
+    if (!isInOrder()) {
+      // DepEvents is not used anymore, so it can be moved.
+      ResultEvent->getPreparedDepsEvents() = std::move(DepEvents);
+      // ResultEvent is local to the current thread, so no need to lock.
+      ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
+    }
+  }
+
+  return ResultEvent;
+}
+
 EventImplPtr queue_impl::submit_command_to_graph(
     ext::oneapi::experimental::detail::graph_impl &GraphImpl,
     std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
@@ -475,26 +563,31 @@ EventImplPtr queue_impl::submit_command_to_graph(
   return EventImpl;
 }
 
-detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
+EventImplPtr queue_impl::submit_kernel_direct_impl(
     const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
     detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
     const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
 
   KernelData KData;
 
-  std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
-      HostKernel.takeOrCopyOwnership();
-
   KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
-  KData.setKernelFunc(HostKernelPtr->getPtr());
+  KData.setKernelFunc(HostKernel.getPtr());
   KData.setNDRDesc(NDRDesc);
 
-  auto SubmitKernelFunc =
-      [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr {
+  auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
+                              bool SchedulerBypass) -> EventImplPtr {
+    if (SchedulerBypass) {
+      return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
+                                            CallerNeedsEvent, nullptr, nullptr,
+                                            CodeLoc, IsTopCodeLoc);
+    }
     std::unique_ptr<detail::CG> CommandGroup;
     std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
    std::vector<std::shared_ptr<const void>> AuxiliaryResources;
 
+    std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
+        HostKernel.takeOrCopyOwnership();
+
     KData.extractArgsAndReqsFromLambda();
 
     CommandGroup.reset(new detail::CGExecKernel(
@@ -504,10 +597,8 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
         std::move(CGData), std::move(KData).getArgs(),
         *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
         std::move(AuxiliaryResources), detail::CGType::Kernel,
-        UR_KERNEL_CACHE_CONFIG_DEFAULT,
-        false, // KernelIsCooperative
-        false, // KernelUsesClusterLaunch
-        0,     // KernelWorkGroupMemorySize
+        KData.getKernelCacheConfig(), KData.isCooperative(),
+        KData.usesClusterLaunch(), KData.getKernelWorkGroupMemorySize(),
         CodeLoc));
 
     CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
@@ -530,9 +621,6 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
   detail::CG::StorageInitHelper CGData;
   std::unique_lock<std::mutex> Lock(MMutex);
 
-  // Graphs are not supported yet for the no-handler path
-  assert(!hasCommandGraph());
-
   // Set the No Last Event Mode to false, since the no-handler path
   // does not support it yet.
   MNoLastEventMode.store(false, std::memory_order_relaxed);
@@ -567,11 +655,21 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
     }
   }
 
-  EventImplPtr EventImpl = SubmitCommandFunc(CGData);
+  bool SchedulerBypass =
+      (CGData.MEvents.size() > 0
+           ? detail::Scheduler::areEventsSafeForSchedulerBypass(
+                 CGData.MEvents, getContextImpl())
+           : true) &&
+      !hasCommandGraph();
 
-  // Sync with the last event for in order queue
-  if (isInOrder() && !EventImpl->isDiscarded()) {
-    LastEvent = EventImpl;
+  EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);
+
+  // Sync with the last event for an in-order queue. For the scheduler-bypass
+  // flow, ordering is handled by the layers below the SYCL runtime, but for
+  // the scheduler-based flow it needs to be done here, as the scheduler
+  // handles host task submissions.
+  if (isInOrder()) {
+    LastEvent = SchedulerBypass ? nullptr : EventImpl;
   }
 
   // Barrier and un-enqueued commands synchronization for out or order queue
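The bypass condition computed in submit_direct distills to: all dependencies (if any) must be safe to hand straight to the backend, and the queue must not be recording to a command graph. A sketch with stand-in types (Event and SafeForBypass are illustrative; the real check is Scheduler::areEventsSafeForSchedulerBypass):

    #include <vector>

    struct Event {
      bool SafeForBypass; // e.g. a plain device event from the same context
    };

    bool areEventsSafeForBypass(const std::vector<Event> &Deps) {
      for (const Event &E : Deps)
        if (!E.SafeForBypass)
          return false;
      return true;
    }

    bool shouldBypassScheduler(const std::vector<Event> &Deps,
                               bool QueueRecordsToGraph) {
      bool DepsSafe = Deps.empty() || areEventsSafeForBypass(Deps);
      return DepsSafe && !QueueRecordsToGraph;
    }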
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index b3cfb474e7926..49da7aee8c448 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -386,6 +386,24 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
     submit_impl(CGF, /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo);
   }
 
+  /// Submits a kernel using the scheduler-bypass fast path.
+  ///
+  /// \param KData is an object storing data related to the kernel.
+  /// \param DepEvents is a list of event dependencies.
+  /// \param EventNeeded should be true if the resulting event is needed.
+  /// \param KernelImplPtr is the kernel implementation to be used when the
+  /// kernel is defined as a kernel object, nullptr otherwise.
+  /// \param KernelBundleImpPtr is the kernel bundle to be used when one is
+  /// defined, nullptr otherwise.
+  /// \param CodeLoc is the code location of the submit call.
+  /// \param IsTopCodeLoc is used to determine if the object is in a local
+  /// scope or in the top level scope.
+  ///
+  /// \return a SYCL event representing the submitted command, or nullptr.
+  EventImplPtr submit_kernel_scheduler_bypass(
+      KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
+      bool EventNeeded, detail::kernel_impl *KernelImplPtr,
+      detail::kernel_bundle_impl *KernelBundleImpPtr,
+      const detail::code_location &CodeLoc, bool IsTopCodeLoc);
+
   /// Performs a blocking wait for the completion of all enqueued tasks in the
   /// queue.
   ///
@@ -908,14 +926,14 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
   /// scope or in the top level scope.
   ///
   /// \return a SYCL event representing submitted command group or nullptr.
-  detail::EventImplPtr submit_kernel_direct_impl(
+  EventImplPtr submit_kernel_direct_impl(
       const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
       detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
       const detail::code_location &CodeLoc, bool IsTopCodeLoc);
 
   template <typename SubmitCommandFuncType>
-  detail::EventImplPtr submit_direct(bool CallerNeedsEvent,
-                                     SubmitCommandFuncType &SubmitCommandFunc);
+  EventImplPtr submit_direct(bool CallerNeedsEvent,
+                             SubmitCommandFuncType &SubmitCommandFunc);
 
   /// Helper function for submitting a memory operation with a handler.
   /// \param DepEvents is a vector of dependencies of the operation.
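Why queue_impl derives from std::enable_shared_from_this (visible in the hunk context above): the bypass path hands each event a non-owning back-reference to its worker queue via weak_from_this(), so the event cannot keep the queue alive and no reference cycle forms. The pattern in isolation:

    #include <cassert>
    #include <memory>

    struct QueueImpl : std::enable_shared_from_this<QueueImpl> {
      std::weak_ptr<QueueImpl> workerRef() { return weak_from_this(); }
    };

    int main() {
      auto Q = std::make_shared<QueueImpl>();
      std::weak_ptr<QueueImpl> W = Q->workerRef(); // non-owning
      assert(!W.expired());
      Q.reset(); // the outstanding reference does not pin the queue
      assert(W.expired());
    }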
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 47a97a812eefa..26477c99be62c 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -638,97 +638,11 @@ event handler::finalize() {
       // the graph is not changed, then this faster path is used to submit
      // kernel bypassing scheduler and avoiding CommandGroup, Command objects
      // creation.
-      std::vector<ur_event_handle_t> RawEvents;
-      // TODO checking the size of the events vector and avoiding the call is
-      // more efficient here at this point
-      if (impl->CGData.MEvents.size() > 0) {
-        RawEvents = detail::Command::getUrEvents(
-            impl->CGData.MEvents, impl->get_queue_or_null(), false);
-      }
-
-      bool DiscardEvent =
-          !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents();
-      if (DiscardEvent) {
-        // Kernel only uses assert if it's non interop one
-        bool KernelUsesAssert = !(MKernel && MKernel->isInterop()) &&
-                                impl->MKernelData.usesAssert();
-        DiscardEvent = !KernelUsesAssert;
-      }
-
-      std::shared_ptr<detail::event_impl> ResultEvent =
-          DiscardEvent
-              ? nullptr
-              : detail::event_impl::create_device_event(impl->get_queue());
-
-      auto EnqueueKernel = [&]() {
-#ifdef XPTI_ENABLE_INSTRUMENTATION
-        xpti_td *CmdTraceEvent = nullptr;
-        uint64_t InstanceID = 0;
-        auto StreamID = detail::getActiveXPTIStreamID();
-        // Only enable instrumentation if there are subscribes to the SYCL
-        // stream
-        const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
-        if (xptiEnabled) {
-          std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
-              StreamID, MKernel.get(), MCodeLoc, impl->MIsTopCodeLoc,
-              *impl->MKernelData.getDeviceKernelInfoPtr(),
-              impl->get_queue_or_null(), impl->MKernelData.getNDRDesc(),
-              KernelBundleImpPtr, impl->MKernelData.getArgs());
-          detail::emitInstrumentationGeneral(StreamID, InstanceID,
-                                             CmdTraceEvent,
-                                             xpti::trace_task_begin, nullptr);
-        }
-#endif
-        const detail::RTDeviceBinaryImage *BinImage = nullptr;
-        if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
-          BinImage = detail::retrieveKernelBinary(impl->get_queue(),
-                                                  impl->getKernelName());
-          assert(BinImage && "Failed to obtain a binary image.");
-        }
-        enqueueImpKernel(impl->get_queue(), impl->MKernelData.getNDRDesc(),
-                         impl->MKernelData.getArgs(), KernelBundleImpPtr,
-                         MKernel.get(),
-                         *impl->MKernelData.getDeviceKernelInfoPtr(), RawEvents,
-                         ResultEvent.get(), nullptr,
-                         impl->MKernelData.getKernelCacheConfig(),
-                         impl->MKernelData.isCooperative(),
-                         impl->MKernelData.usesClusterLaunch(),
-                         impl->MKernelData.getKernelWorkGroupMemorySize(),
-                         BinImage, impl->MKernelData.getKernelFuncPtr());
-#ifdef XPTI_ENABLE_INSTRUMENTATION
-        if (xptiEnabled) {
-          // Emit signal only when event is created
-          if (!DiscardEvent) {
-            detail::emitInstrumentationGeneral(
-                StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
-                static_cast<const void *>(ResultEvent->getHandle()));
-          }
-          detail::emitInstrumentationGeneral(StreamID, InstanceID,
-                                             CmdTraceEvent,
-                                             xpti::trace_task_end, nullptr);
-        }
-#endif
-      };
-
-      if (DiscardEvent) {
-        EnqueueKernel();
-      } else {
-        detail::queue_impl &Queue = impl->get_queue();
-        ResultEvent->setWorkerQueue(Queue.weak_from_this());
-        ResultEvent->setStateIncomplete();
-        ResultEvent->setSubmissionTime();
-
-        EnqueueKernel();
-        ResultEvent->setEnqueued();
-        // connect returned event with dependent events
-        if (!Queue.isInOrder()) {
-          // MEvents is not used anymore, so can move.
-          ResultEvent->getPreparedDepsEvents() =
-              std::move(impl->CGData.MEvents);
-          // ResultEvent is local for current thread, no need to lock.
-          ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
-        }
-      }
+      detail::EventImplPtr ResultEvent =
+          impl->get_queue().submit_kernel_scheduler_bypass(
+              impl->MKernelData, impl->CGData.MEvents, impl->MEventNeeded,
+              MKernel.get(), KernelBundleImpPtr, MCodeLoc, impl->MIsTopCodeLoc);
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
       return ResultEvent;
 #else
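The moved block's event-discard rule, distilled into one function (parameter names are illustrative; the logic mirrors the DiscardEvent computation now living in submit_kernel_scheduler_bypass):

    #include <cassert>

    bool canDiscardEvent(bool EventNeeded, bool SupportsDiscard,
                         bool KernelIsInterop, bool KernelUsesAssert) {
      bool Discard = !EventNeeded && SupportsDiscard;
      // Assert reporting needs a real event, unless the kernel is interop.
      return Discard && (KernelIsInterop || !KernelUsesAssert);
    }

    int main() {
      assert(canDiscardEvent(false, true, false, false)); // plain kernel
      assert(!canDiscardEvent(true, true, false, false)); // caller wants event
      assert(!canDiscardEvent(false, true, false, true)); // assert needs event
      assert(canDiscardEvent(false, true, true, true));   // interop: no assert
    }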
diff --git a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp
index 7c1781e873a39..98987c13c196e 100644
--- a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp
+++ b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp
@@ -29,7 +29,7 @@ int main(int argc, char **argv) {
   kernel<1> krn1;
   q.parallel_for(sycl::nd_range<1>{1, 1}, krn1);
 
-  assert(copy_count == 1);
+  assert(copy_count == 0);
   assert(move_count == 0);
 
   copy_count = 0;
diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp
index 360bdca27e73f..ca0753013c03b 100644
--- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp
+++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp
@@ -227,7 +227,6 @@ TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutNoEvent) {
   ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
 }
 
-#if __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
 TEST_F(FreeFunctionCommandsEventsTests,
        LaunchGroupedShortcutMoveKernelNoEvent) {
   mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
@@ -235,16 +234,47 @@ TEST_F(FreeFunctionCommandsEventsTests,
   TestMoveFunctor::MoveCtorCalls = 0;
   TestMoveFunctor MoveOnly;
 
+  std::mutex CvMutex;
+  std::condition_variable Cv;
+  bool ready = false;
+
+  // This kernel submission takes the scheduler-bypass path, so the HostKernel
+  // shouldn't be constructed.
+  sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32},
+                            std::move(MoveOnly));
+
+  ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 0);
+  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+
+  // Another kernel submission is queued behind a blocked host task to force
+  // the scheduler-based submission. In this case, the HostKernel should be
+  // constructed.
+  Queue.submit([&](sycl::handler &CGH) {
+    CGH.host_task([&] {
+      std::unique_lock<std::mutex> lk(CvMutex);
+      Cv.wait(lk, [&ready] { return ready; });
+    });
+  });
+
   sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32},
                             std::move(MoveOnly));
+
+  {
+    std::unique_lock<std::mutex> lk(CvMutex);
+    ready = true;
+  }
+  Cv.notify_one();
+
+  Queue.wait();
+
   // Move ctor for TestMoveFunctor is called during move construction of
   // HostKernel. Copy ctor is called by InstantiateKernelOnHost, can't delete
   // it.
   ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 1);
-
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{2});
 }
-#endif
 
 TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedKernelNoEvent) {
   mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
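The gating trick used by the updated test, as a standalone sketch: an in-order queue makes the next submission depend on a deliberately blocked host task, which disqualifies it from the bypass and forces the scheduler path; the condition variable then releases the host task:

    #include <condition_variable>
    #include <mutex>
    #include <sycl/sycl.hpp>

    int main() {
      sycl::queue Q{sycl::property::queue::in_order{}};
      std::mutex M;
      std::condition_variable Cv;
      bool Ready = false;

      Q.submit([&](sycl::handler &CGH) {
        CGH.host_task([&] {
          std::unique_lock<std::mutex> Lk(M);
          Cv.wait(Lk, [&] { return Ready; });
        });
      });

      // Depends on an unfinished host task, so it takes the scheduler path.
      Q.parallel_for(sycl::nd_range<1>{32, 32}, [](sycl::nd_item<1>) {});

      {
        std::lock_guard<std::mutex> Lk(M);
        Ready = true;
      }
      Cv.notify_one();
      Q.wait();
    }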
diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp
index e0a5c9be50c15..c275f7d03cc9d 100644
--- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp
+++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp
@@ -26,53 +26,53 @@ inline ur_result_t after_urKernelGetInfo(void *pParams) {
 static thread_local size_t counter_urEnqueueKernelLaunch = 0;
 inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) {
   ++counter_urEnqueueKernelLaunch;
-// TODO The no-handler scheduler submission includes a fix for the event return,
-// where the event is returned by the scheduler on every submission. This fix
-// is not yet applied to the handler-based path.
-#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
-  EXPECT_EQ(*params.pphEvent, nullptr);
-#endif
+  // TODO The no-handler scheduler submission includes a fix for the event
+  // return, where the event is returned by the scheduler on every submission.
+  // This fix is not yet applied to the handler-based path.
+  // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
+  //   auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
+  //   EXPECT_EQ(*params.pphEvent, nullptr);
+  // #endif
   return UR_RESULT_SUCCESS;
 }
 
 static thread_local size_t counter_urUSMEnqueueMemcpy = 0;
 inline ur_result_t redefined_urUSMEnqueueMemcpy(void *pParams) {
   ++counter_urUSMEnqueueMemcpy;
-#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  auto params = *static_cast<ur_enqueue_usm_memcpy_params_t *>(pParams);
-  EXPECT_EQ(*params.pphEvent, nullptr);
-#endif
+  // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
+  //   auto params = *static_cast<ur_enqueue_usm_memcpy_params_t *>(pParams);
+  //   EXPECT_EQ(*params.pphEvent, nullptr);
+  // #endif
   return UR_RESULT_SUCCESS;
 }
 
 static thread_local size_t counter_urUSMEnqueueFill = 0;
 inline ur_result_t redefined_urUSMEnqueueFill(void *pParams) {
   ++counter_urUSMEnqueueFill;
-#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  auto params = *static_cast<ur_enqueue_usm_fill_params_t *>(pParams);
-  EXPECT_EQ(*params.pphEvent, nullptr);
-#endif
+  // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
+  //   auto params = *static_cast<ur_enqueue_usm_fill_params_t *>(pParams);
+  //   EXPECT_EQ(*params.pphEvent, nullptr);
+  // #endif
   return UR_RESULT_SUCCESS;
 }
 
 static thread_local size_t counter_urUSMEnqueuePrefetch = 0;
 inline ur_result_t redefined_urUSMEnqueuePrefetch(void *pParams) {
   ++counter_urUSMEnqueuePrefetch;
-#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  auto params = *static_cast<ur_enqueue_usm_prefetch_params_t *>(pParams);
-  EXPECT_EQ(*params.pphEvent, nullptr);
-#endif
+  // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
+  //   auto params = *static_cast<ur_enqueue_usm_prefetch_params_t *>(pParams);
+  //   EXPECT_EQ(*params.pphEvent, nullptr);
+  // #endif
   return UR_RESULT_SUCCESS;
 }
 
 static thread_local size_t counter_urUSMEnqueueMemAdvise = 0;
 inline ur_result_t redefined_urUSMEnqueueMemAdvise(void *pParams) {
   ++counter_urUSMEnqueueMemAdvise;
-#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
-  auto params = *static_cast<ur_enqueue_usm_advise_params_t *>(pParams);
-  EXPECT_EQ(*params.pphEvent, nullptr);
-#endif
+  // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
+  //   auto params = *static_cast<ur_enqueue_usm_advise_params_t *>(pParams);
+  //   EXPECT_EQ(*params.pphEvent, nullptr);
+  // #endif
   return UR_RESULT_SUCCESS;
 }
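How these helpers are consumed, sketched as a test body (the TEST_F name is hypothetical; Queue is the fixture member and the callback and counter names come from this header):

    TEST_F(FreeFunctionCommandsEventsTests, CountsKernelLaunches) {
      mock::getCallbacks().set_replace_callback(
          "urEnqueueKernelLaunch", &redefined_urEnqueueKernelLaunch);
      counter_urEnqueueKernelLaunch = 0;
      sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32},
                                [](sycl::nd_item<1>) {});
      ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
    }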