Skip to content
21 changes: 15 additions & 6 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ event handler::finalize() {

// TODO checking the size of the events vector and avoiding the call is more
// efficient here at this point
const bool KernelFastPath =
const bool KernelSchedulerBypass =
(Queue && !Graph && !impl->MSubgraphNode && !Queue->hasCommandGraph() &&
!impl->CGData.MRequirements.size() && !MStreamStorage.size() &&
(impl->CGData.MEvents.size() == 0 ||
Expand All @@ -521,7 +521,7 @@ event handler::finalize() {
// Extract arguments from the kernel lambda, if required.
// Skipping this is currently limited to simple kernels on the scheduler
// bypass path.
if (type == detail::CGType::Kernel && impl->MKernelData.getKernelFuncPtr() &&
(!KernelFastPath || impl->MKernelData.hasSpecialCaptures())) {
(!KernelSchedulerBypass || impl->MKernelData.hasSpecialCaptures())) {
impl->MKernelData.extractArgsAndReqsFromLambda();
}

Expand Down Expand Up @@ -633,7 +633,7 @@ event handler::finalize() {
}
}

if (KernelFastPath) {
if (KernelSchedulerBypass) {
// if user does not add a new dependency to the dependency graph, i.e.
// the graph is not changed, then this faster path is used to submit
// kernel bypassing scheduler and avoiding CommandGroup, Command objects
Expand Down Expand Up @@ -879,9 +879,18 @@ event handler::finalize() {
#endif
}

bool DiscardEvent = !impl->MEventNeeded && Queue &&
Queue->supportsDiscardingPiEvents() &&
CommandGroup->getRequirements().size() == 0;
// For kernel submission, regardless of whether an event has been requested,
// the scheduler needs to generate an event so the commands are properly
// ordered (for in-order queue) and synchronized with a barrier (for
// out-of-order queue). The event can only be skipped for the scheduler bypass
// path.
//
// For commands other than kernel submission, the event can be skipped when
// an event has not been requested, the queue supports discarding events,
// and the command would otherwise have been eligible for the scheduler
// bypass (bypassing the scheduler for such commands is not supported yet).
bool DiscardEvent =
(type != detail::CGType::Kernel && KernelSchedulerBypass &&
!impl->MEventNeeded && Queue->supportsDiscardingPiEvents());

detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG(
std::move(CommandGroup), *Queue, !DiscardEvent);
Expand Down
103 changes: 84 additions & 19 deletions sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

using namespace sycl;

namespace oneapiext = ext::oneapi::experimental;

size_t GEventsWaitCounter = 0;

inline ur_result_t redefinedEventsWaitWithBarrier(void *pParams) {
Expand Down Expand Up @@ -49,20 +51,20 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) {
}

enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 };
std::vector<std::pair<CommandType, size_t>> ExecutedCommands;
std::vector<std::tuple<CommandType, size_t, size_t>> ExecutedCommands;

inline ur_result_t customEnqueueKernelLaunchWithArgsExp(void *pParams) {
auto params =
*static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
ExecutedCommands.push_back(
{CommandType::KERNEL, *params.pnumEventsInWaitList});
ExecutedCommands.push_back({CommandType::KERNEL, *params.pnumEventsInWaitList,
**params.ppGlobalWorkSize});
return UR_RESULT_SUCCESS;
}

inline ur_result_t customEnqueueUSMFill(void *pParams) {
auto params = *static_cast<ur_enqueue_usm_fill_params_t *>(pParams);
ExecutedCommands.push_back(
{CommandType::MEMSET, *params.pnumEventsInWaitList});
{CommandType::MEMSET, *params.pnumEventsInWaitList, 0});
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -92,6 +94,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

Expand All @@ -113,11 +116,15 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) {

InOrderQueue.wait();

ASSERT_EQ(ExecutedCommands.size(), 2u);
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET);
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
ASSERT_EQ(ExecutedCommands.size(), 3u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::MEMSET);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
}

TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
Expand All @@ -142,6 +149,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

Expand All @@ -159,11 +167,15 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {

InOrderQueue.wait();

ASSERT_EQ(ExecutedCommands.size(), 2u);
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET);
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
ASSERT_EQ(ExecutedCommands.size(), 3u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::MEMSET);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
}

TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
Expand All @@ -186,7 +198,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0});
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

Expand All @@ -202,8 +214,61 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
InOrderQueue.wait();

ASSERT_EQ(ExecutedCommands.size(), 2u);
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::HOST_TASK);
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<1>(ExecutedCommands[0]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
}

// Verifies that kernels submitted through the enqueue-functions extension
// (oneapiext::nd_launch) to an in-order queue behind a blocking host task are
// executed strictly after that host task, in submission order, and with an
// empty event wait list on each kernel launch — presumably because the
// in-order queue serializes them without explicit event dependencies
// (TODO confirm against the scheduler-bypass/event-discard logic in
// handler::finalize). The recorded global work size distinguishes the two
// kernel launches.
TEST_F(SchedulerTest, InOrderQueueCrossDepsEnqueueFunctions) {
// Reset the global log shared with the other tests in this file.
ExecutedCommands.clear();
sycl::unittest::UrMock<> Mock;
// Record every kernel launch (wait-list size and global work size) via the
// customEnqueueKernelLaunchWithArgsExp callback defined above.
mock::getCallbacks().set_before_callback(
"urEnqueueKernelLaunchWithArgsExp",
&customEnqueueKernelLaunchWithArgsExp);

sycl::platform Plt = sycl::platform();

context Ctx{Plt};
queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()};

// Condition variable used to hold the host task open until both kernels
// have been submitted, so the test exercises submission while the queue's
// head-of-line command is still running.
std::mutex CvMutex;
std::condition_variable Cv;
bool ready = false;

// Host task blocks until `ready` is set; it logs itself only after being
// released, so HOST_TASK appearing first in ExecutedCommands proves the
// kernels did not start ahead of it.
InOrderQueue.submit([&](sycl::handler &CGH) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

// Two kernel submissions via the enqueue-functions shortcut; distinct
// global sizes (32 and 64) let the assertions below tell them apart.
oneapiext::nd_launch<TestKernel>(
InOrderQueue, nd_range<1>{range<1>{32}, range<1>{32}}, [](nd_item<1>) {});

oneapiext::nd_launch<TestKernel>(
InOrderQueue, nd_range<1>{range<1>{64}, range<1>{32}}, [](nd_item<1>) {});

// Release the host task only after both kernels are enqueued.
{
std::unique_lock<std::mutex> lk(CvMutex);
ready = true;
}
Cv.notify_one();

InOrderQueue.wait();

// Expected order: host task, then kernel(32), then kernel(64), each kernel
// launched with an empty event wait list.
ASSERT_EQ(ExecutedCommands.size(), 3u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<2>(ExecutedCommands[1]) /*GlobalWorkSize*/, 32u);
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<2>(ExecutedCommands[2]) /*GlobalWorkSize*/, 64u);
}