intel
diff --git a/devops/dependencies-igc-dev.json b/devops/dependencies-igc-dev.json
Lines changed: 4 additions & 4 deletions
diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
Lines changed: 48 additions & 41 deletions
diff --git a/devops/scripts/benchmarks/html/scripts.js b/devops/scripts/benchmarks/html/scripts.js
Lines changed: 1 addition & 1 deletion
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
Lines changed: 60 additions & 0 deletions
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
Lines changed: 6 additions & 0 deletions
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
Lines changed: 16 additions & 8 deletions
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
Lines changed: 2 additions & 48 deletions
diff --git a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp
Lines changed: 0 additions & 3 deletions
@@ -1,10 +1,10 @@
 {
   "linux": {
     "igc_dev": {
-      "github_tag": "igc-dev-e146785",
-      "version": "e146785",
-      "updated_at": "2025-10-02T03:05:40Z",
-      "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/4161218080/zip",
+      "github_tag": "igc-dev-e4b64c1",
+      "version": "e4b64c1",
+      "updated_at": "2025-10-05T10:41:23Z",
+      "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/4185473239/zip",
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     }
   }
 
@@ -3,19 +3,19 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-from itertools import product
+import copy
 import csv
 import io
-import copy
 import math
 from enum import Enum
+from itertools import product
 from pathlib import Path
 
-from .base import Benchmark, Suite, TracingType
-from utils.result import BenchmarkMetadata, Result
-from .base import Benchmark, Suite
-from options import options
 from git_project import GitProject
+from options import options
+from utils.result import BenchmarkMetadata, Result
+
+from .base import Benchmark, Suite, TracingType
 
 
 class RUNTIMES(Enum):
@@ -100,66 +100,57 @@ def setup(self) -> None:
 
     def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata = {
-            "SubmitKernel": BenchmarkMetadata(
-                type="group",
-                description="Measures CPU time overhead of submitting kernels through different APIs.",
-                notes="Each layer builds on top of the previous layer, adding functionality and overhead.\n"
-                "The first layer is the Level Zero API, the second is the Unified Runtime API, and the third is the SYCL API.\n"
-                "The UR v2 adapter noticeably reduces UR layer overhead, also improving SYCL performance.\n"
-                "Work is ongoing to reduce the overhead of the SYCL API\n",
-                tags=["submit", "micro", "SYCL", "UR", "L0"],
-                range_min=0.0,
-            ),
             "SinKernelGraph": BenchmarkMetadata(
                 type="group",
                 unstable="This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
                 tags=["submit", "memory", "proxy", "SYCL", "UR", "L0", "graph"],
             ),
-            "SubmitGraph": BenchmarkMetadata(
-                type="group", tags=["submit", "micro", "SYCL", "UR", "L0", "graph"]
-            ),
             "FinalizeGraph": BenchmarkMetadata(
                 type="group", tags=["finalize", "micro", "SYCL", "graph"]
             ),
         }
 
         # Add metadata for all SubmitKernel group variants
-        base_metadata = metadata["SubmitKernel"]
-
+        submit_kernel_metadata = BenchmarkMetadata(
+            type="group",
+            notes="Each layer builds on top of the previous layer, adding functionality and overhead.\n"
+            "The first layer is the Level Zero API, the second is the Unified Runtime API, and the third is the SYCL API.\n"
+            "The UR v2 adapter noticeably reduces UR layer overhead, also improving SYCL performance.\n"
+            "Work is ongoing to reduce the overhead of the SYCL API\n",
+            tags=["submit", "micro", "SYCL", "UR", "L0"],
+            range_min=0.0,
+        )
         for order in ["in order", "out of order"]:
             for completion in ["", " with completion"]:
                 for events in ["", " using events"]:
                     group_name = f"SubmitKernel {order}{completion}{events} long kernel"
-                    metadata[group_name] = BenchmarkMetadata(
-                        type="group",
-                        description=f"Measures CPU time overhead of submitting {order} kernels with longer execution times through different APIs.",
-                        notes=base_metadata.notes,
-                        tags=base_metadata.tags,
-                        range_min=base_metadata.range_min,
+                    metadata[group_name] = copy.deepcopy(submit_kernel_metadata)
+                    metadata[group_name].description = (
+                        f"Measures CPU time overhead of submitting {order} kernels with longer execution times through different APIs."
                     )
-
                     # CPU count variants
                     cpu_count_group = f"{group_name}, CPU count"
-                    metadata[cpu_count_group] = BenchmarkMetadata(
-                        type="group",
-                        description=f"Measures CPU time overhead of submitting {order} kernels with longer execution times through different APIs.",
-                        notes=base_metadata.notes,
-                        tags=base_metadata.tags,
-                        range_min=base_metadata.range_min,
+                    metadata[cpu_count_group] = copy.deepcopy(submit_kernel_metadata)
+                    metadata[cpu_count_group].description = (
+                        f"Measures CPU instruction count overhead of submitting {order} kernels with longer execution times through different APIs."
                     )
 
         # Add metadata for all SubmitGraph group variants
-        base_metadata = metadata["SubmitGraph"]
+        submit_graph_metadata = BenchmarkMetadata(
+            type="group", tags=["submit", "micro", "SYCL", "UR", "L0", "graph"]
+        )
         for order in ["in order", "out of order"]:
             for completion in ["", " with completion"]:
                 for events in ["", " using events"]:
                     for num_kernels in self.submit_graph_num_kernels:
-                        group_name = f"SubmitGraph {order}{completion}{events}, {num_kernels} kernels"
-                        metadata[group_name] = BenchmarkMetadata(
-                            type="group",
-                            tags=base_metadata.tags,
-                        )
-
+                        for host_tasks in ["", " use host tasks"]:
+                            group_name = f"SubmitGraph {order}{completion}{events}{host_tasks}, {num_kernels} kernels"
+                            metadata[group_name] = copy.deepcopy(submit_graph_metadata)
+                            # CPU count variants
+                            cpu_count_group = f"{group_name}, CPU count"
+                            metadata[cpu_count_group] = copy.deepcopy(
+                                submit_graph_metadata
+                            )
         return metadata
 
     def benchmarks(self) -> list[Benchmark]:
@@ -1088,6 +1079,22 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args
 
+    def get_metadata(self) -> dict[str, BenchmarkMetadata]:
+        metadata_dict = super().get_metadata()
+
+        # Add a "<name> CPU count" entry: a copy with suffixed display name and group
+        cpu_count_name = self.name() + " CPU count"
+        cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
+        cpu_count_display_name = self.display_name() + ", CPU count"
+        cpu_count_explicit_group = (
+            self.explicit_group() + ", CPU count" if self.explicit_group() else ""
+        )
+        cpu_count_metadata.display_name = cpu_count_display_name
+        cpu_count_metadata.explicit_group = cpu_count_explicit_group
+        metadata_dict[cpu_count_name] = cpu_count_metadata
+
+        return metadata_dict
+
 
 class UllsEmptyKernel(ComputeBenchmark):
     def __init__(
 
@@ -345,7 +345,7 @@ function createChart(data, containerId, type) {
             if (elements.length > 0) {
                 const point = elements[0].element.$context.raw;
                 if (point.gitHash && point.gitRepo) {
-                    window.open(`https://github.com/${point.gitRepo}/commit/${point.gitHash}`, '_blank');
+                    window.open(`${point.gitRepo}/commit/${point.gitHash}`, '_blank');
                 }
             }
         };
 
@@ -164,7 +164,7 @@ static bool isDeviceBinaryTypeSupported(context_impl &ContextImpl,
 [[maybe_unused]] auto VecToString = [](auto &Vec) -> std::string {
   std::ostringstream Out;
   Out << "{";
-  for (auto Elem : Vec)
+  for (const auto &Elem : Vec)
     Out << Elem << " ";
   Out << "}";
   return Out.str();
 
@@ -420,6 +420,61 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
   return EventImpl;
 }
 
+EventImplPtr queue_impl::submit_command_to_graph(
+    ext::oneapi::experimental::detail::graph_impl &GraphImpl,
+    std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
+    sycl::ext::oneapi::experimental::node_type UserFacingNodeType) {
+  auto EventImpl = detail::event_impl::create_completed_host_event();
+  EventImpl->setSubmittedQueue(weak_from_this());
+  ext::oneapi::experimental::detail::node_impl *NodeImpl = nullptr;
+
+  // GraphImpl is read and written in this scope, so we lock this graph
+  // with full privileges.
+  ext::oneapi::experimental::detail::graph_impl::WriteLock Lock(
+      GraphImpl.MMutex);
+
+  ext::oneapi::experimental::node_type NodeType =
+      UserFacingNodeType != ext::oneapi::experimental::node_type::empty
+          ? UserFacingNodeType
+          : ext::oneapi::experimental::detail::getNodeTypeFromCG(CGType);
+
+  // Create a new node in the graph representing this command-group.
+  if (isInOrder()) {
+    // In-order queues create implicit linear dependencies between nodes.
+    // Look up the last node this queue added to the graph (if any), so the
+    // new node can take it as its only predecessor.
+    std::vector<ext::oneapi::experimental::detail::node_impl *> Deps;
+    if (ext::oneapi::experimental::detail::node_impl *DependentNode =
+            GraphImpl.getLastInorderNode(this)) {
+      Deps.push_back(DependentNode);
+    }
+    NodeImpl = &GraphImpl.add(NodeType, std::move(CommandGroup), Deps);
+
+    // Remember the new node as the most recent one recorded from this
+    // in-order queue, so it becomes the dependency of the next node
+    // recorded from the same queue.
+    GraphImpl.setLastInorderNode(*this, *NodeImpl);
+  } else {
+    ext::oneapi::experimental::detail::node_impl *LastBarrierRecordedFromQueue =
+        GraphImpl.getBarrierDep(weak_from_this());
+    std::vector<ext::oneapi::experimental::detail::node_impl *> Deps;
+
+    if (LastBarrierRecordedFromQueue) {
+      Deps.push_back(LastBarrierRecordedFromQueue);
+    }
+    NodeImpl = &GraphImpl.add(NodeType, std::move(CommandGroup), Deps);
+
+    if (NodeImpl->MCGType == sycl::detail::CGType::Barrier) {
+      GraphImpl.setBarrierDep(weak_from_this(), *NodeImpl);
+    }
+  }
+
+  // Associate the event created above with the new node and return it.
+  GraphImpl.addEventForNode(EventImpl, *NodeImpl);
+
+  return EventImpl;
+}
+
 detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
     const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
     detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
@@ -456,6 +511,11 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
         CodeLoc));
     CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
 
+    if (auto GraphImpl = getCommandGraph(); GraphImpl) {
+      return submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
+                                     detail::CGType::Kernel);
+    }
+
     return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
                                                   *this, true);
   };
 
@@ -624,6 +624,12 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
 
   bool hasCommandGraph() const { return !MGraph.expired(); }
 
+  EventImplPtr submit_command_to_graph(
+      ext::oneapi::experimental::detail::graph_impl &GraphImpl,
+      std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
+      sycl::ext::oneapi::experimental::node_type UserFacingNodeType =
+          ext::oneapi::experimental::node_type::empty);
+
   unsigned long long getQueueID() { return MQueueID; }
 
   void *getTraceEvent() { return MTraceEvent; }
 
@@ -2468,14 +2468,16 @@ static ur_result_t SetKernelParamsAndLaunch(
         /* pPropSizeRet = */ nullptr);
 
     const bool EnforcedLocalSize =
-        (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
-         RequiredWGSize[2] != 0);
+        (RequiredWGSize[0] != 0 &&
+         (NDRDesc.Dims < 2 || RequiredWGSize[1] != 0) &&
+         (NDRDesc.Dims < 3 || RequiredWGSize[2] != 0));
     if (EnforcedLocalSize)
       LocalSize = RequiredWGSize;
   }
-  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 ||
-                         NDRDesc.GlobalOffset[1] != 0 ||
-                         NDRDesc.GlobalOffset[2] != 0;
+
+  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 &&
+                         (NDRDesc.Dims < 2 || NDRDesc.GlobalOffset[1] != 0) &&
+                         (NDRDesc.Dims < 3 || NDRDesc.GlobalOffset[2] != 0);
 
   std::vector<ur_kernel_launch_property_t> property_list;
 
@@ -2610,6 +2612,10 @@ ur_result_t enqueueImpCommandBufferKernel(
   size_t RequiredWGSize[3] = {0, 0, 0};
   size_t *LocalSize = nullptr;
 
+  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 &&
+                         (NDRDesc.Dims < 2 || NDRDesc.GlobalOffset[1] != 0) &&
+                         (NDRDesc.Dims < 3 || NDRDesc.GlobalOffset[2] != 0);
+
   if (HasLocalSize)
     LocalSize = &NDRDesc.LocalSize[0];
   else {
@@ -2620,8 +2626,9 @@ ur_result_t enqueueImpCommandBufferKernel(
         /* pPropSizeRet = */ nullptr);
 
     const bool EnforcedLocalSize =
-        (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
-         RequiredWGSize[2] != 0);
+        (RequiredWGSize[0] != 0 &&
+         (NDRDesc.Dims < 2 || RequiredWGSize[1] != 0) &&
+         (NDRDesc.Dims < 3 || RequiredWGSize[2] != 0));
     if (EnforcedLocalSize)
       LocalSize = RequiredWGSize;
   }
@@ -2637,7 +2644,8 @@ ur_result_t enqueueImpCommandBufferKernel(
 
   ur_result_t Res =
       Adapter.call_nocheck<UrApiKind::urCommandBufferAppendKernelLaunchExp>(
-          CommandBuffer, UrKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
+          CommandBuffer, UrKernel, NDRDesc.Dims,
+          HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr,
           &NDRDesc.GlobalSize[0], LocalSize, AltUrKernels.size(),
           AltUrKernels.size() ? AltUrKernels.data() : nullptr,
           SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() : nullptr, 0,
 
@@ -955,54 +955,8 @@ event handler::finalize() {
   // If the queue has an associated graph then we need to take the CG and pass
   // it to the graph to create a node, rather than submit it to the scheduler.
   if (auto GraphImpl = Queue->getCommandGraph(); GraphImpl) {
-    auto EventImpl = detail::event_impl::create_completed_host_event();
-    EventImpl->setSubmittedQueue(Queue->weak_from_this());
-    ext::oneapi::experimental::detail::node_impl *NodeImpl = nullptr;
-
-    // GraphImpl is read and written in this scope so we lock this graph
-    // with full priviledges.
-    ext::oneapi::experimental::detail::graph_impl::WriteLock Lock(
-        GraphImpl->MMutex);
-
-    ext::oneapi::experimental::node_type NodeType =
-        impl->MUserFacingNodeType != ext::oneapi::experimental::node_type::empty
-            ? impl->MUserFacingNodeType
-            : ext::oneapi::experimental::detail::getNodeTypeFromCG(getType());
-
-    // Create a new node in the graph representing this command-group
-    if (Queue->isInOrder()) {
-      // In-order queues create implicit linear dependencies between nodes.
-      // Find the last node added to the graph from this queue, so our new
-      // node can set it as a predecessor.
-      std::vector<ext::oneapi::experimental::detail::node_impl *> Deps;
-      if (ext::oneapi::experimental::detail::node_impl *DependentNode =
-              GraphImpl->getLastInorderNode(Queue)) {
-        Deps.push_back(DependentNode);
-      }
-      NodeImpl = &GraphImpl->add(NodeType, std::move(CommandGroup), Deps);
-
-      // If we are recording an in-order queue remember the new node, so it
-      // can be used as a dependency for any more nodes recorded from this
-      // queue.
-      GraphImpl->setLastInorderNode(*Queue, *NodeImpl);
-    } else {
-      ext::oneapi::experimental::detail::node_impl
-          *LastBarrierRecordedFromQueue =
-              GraphImpl->getBarrierDep(Queue->weak_from_this());
-      std::vector<ext::oneapi::experimental::detail::node_impl *> Deps;
-
-      if (LastBarrierRecordedFromQueue) {
-        Deps.push_back(LastBarrierRecordedFromQueue);
-      }
-      NodeImpl = &GraphImpl->add(NodeType, std::move(CommandGroup), Deps);
-
-      if (NodeImpl->MCGType == sycl::detail::CGType::Barrier) {
-        GraphImpl->setBarrierDep(Queue->weak_from_this(), *NodeImpl);
-      }
-    }
-
-    // Associate an event with this new node and return the event.
-    GraphImpl->addEventForNode(EventImpl, *NodeImpl);
+    auto EventImpl = Queue->submit_command_to_graph(
+        *GraphImpl, std::move(CommandGroup), type, impl->MUserFacingNodeType);
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
     return EventImpl;
 
@@ -626,8 +626,6 @@ TEST_F(CommandGraphTest, AccessorModeEdges) {
 
 // Tests the transitive queue recording behaviour with queue shortcuts.
 TEST_F(CommandGraphTest, TransitiveRecordingShortcuts) {
-// Graphs not supported yet for the no-handler submit path
-#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
   device Dev;
   context Ctx{{Dev}};
   queue Q1{Ctx, Dev};
@@ -671,7 +669,6 @@ TEST_F(CommandGraphTest, TransitiveRecordingShortcuts) {
             ext::oneapi::experimental::queue_state::executing);
   ASSERT_EQ(Q3.ext_oneapi_get_state(),
             ext::oneapi::experimental::queue_state::executing);
-#endif
 }
 
 // Tests that dynamic_work_group_memory.get() will throw on the host side.
Original file line number	Diff line number	Diff line change
`@@ -1,10 +1,10 @@`
`1`	`1`	`{`
`2`	`2`	`"linux": {`
`3`	`3`	`"igc_dev": {`
`4`		`- "github_tag": "igc-dev-e146785",`
`5`		`- "version": "e146785",`
`6`		`- "updated_at": "2025-10-02T03:05:40Z",`
`7`		`- "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/4161218080/zip",`
	`4`	`+ "github_tag": "igc-dev-e4b64c1",`
	`5`	`+ "version": "e4b64c1",`
	`6`	`+ "updated_at": "2025-10-05T10:41:23Z",`
	`7`	`+ "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/4185473239/zip",`
`8`	`8`	`"root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"`
`9`	`9`	`}`
`10`	`10`	`}`
Original file line number	Diff line number	Diff line change
`@@ -345,7 +345,7 @@ function createChart(data, containerId, type) {`
`345`	`345`	`if (elements.length > 0) {`
`346`	`346`	`const point = elements[0].element.$context.raw;`
`347`	`347`	`if (point.gitHash && point.gitRepo) {`
`348`		- window.open(`https://github.com/${point.gitRepo}/commit/${point.gitHash}`, '_blank');
	`348`	+ window.open(`${point.gitRepo}/commit/${point.gitHash}`, '_blank');
`349`	`349`	`}`
`350`	`350`	`}`
`351`	`351`	`};`