intel
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
Lines changed: 23 additions & 1 deletion
diff --git a/devops/dependencies-igc-dev.json b/devops/dependencies-igc-dev.json
Lines changed: 4 additions & 4 deletions
diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
Lines changed: 48 additions & 41 deletions
diff --git a/devops/scripts/benchmarks/benches/syclbench.py b/devops/scripts/benchmarks/benches/syclbench.py
Lines changed: 1 addition & 1 deletion
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
Lines changed: 60 additions & 0 deletions
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
Lines changed: 6 additions & 0 deletions
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
Lines changed: 16 additions & 8 deletions
@@ -942,7 +942,29 @@ static void addBackendOptions(const ArgList &Args,
                               SmallVector<StringRef, 8> &CmdArgs, bool IsCPU) {
   StringRef OptC =
       Args.getLastArgValue(OPT_sycl_backend_compile_options_from_image_EQ);
-  OptC.split(CmdArgs, " ", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+  if (IsCPU) {
+    OptC.split(CmdArgs, " ", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+  } else {
+    // ocloc -options args need to be comma separated, e.g. `-options
+    // "-g,-cl-opt-disable"`. Otherwise ocloc takes only the first one as the
+    // -options value and treats the rest as standalone flags, possibly
+    // failing. StringRef::split returns the text before and after the first
+    // "-options " separator (trailing space included); the separator is in
+    // neither half, and a string without it lands entirely in the first.
+    // NOTE(review): BeforeOptions is pushed as one argument, not split on
+    // spaces like the CPU path -- confirm it never holds several options.
+    auto [BeforeOptions, AfterOptions] = OptC.split("-options ");
+    // Only add if not empty, an empty arg can lead to ocloc errors.
+    if (!BeforeOptions.empty())
+      CmdArgs.push_back(BeforeOptions);
+    if (!AfterOptions.empty()) {
+      // Separator not included by the split function, so explicitly added here.
+      CmdArgs.push_back("-options");
+      std::string Replace = AfterOptions.str();
+      std::replace(Replace.begin(), Replace.end(), ' ', ',');
+      CmdArgs.push_back(Args.MakeArgString(Replace));
+    }
+  }
   StringRef OptL =
       Args.getLastArgValue(OPT_sycl_backend_link_options_from_image_EQ);
   OptL.split(CmdArgs, " ", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
 
@@ -1,10 +1,10 @@
 {
   "linux": {
     "igc_dev": {
-      "github_tag": "igc-dev-e146785",
-      "version": "e146785",
-      "updated_at": "2025-10-02T03:05:40Z",
-      "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/4161218080/zip",
+      "github_tag": "igc-dev-e4b64c1",
+      "version": "e4b64c1",
+      "updated_at": "2025-10-05T10:41:23Z",
+      "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/4185473239/zip",
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     }
   }
 
@@ -3,19 +3,19 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-from itertools import product
+import copy
 import csv
 import io
-import copy
 import math
 from enum import Enum
+from itertools import product
 from pathlib import Path
 
-from .base import Benchmark, Suite, TracingType
-from utils.result import BenchmarkMetadata, Result
-from .base import Benchmark, Suite
-from options import options
 from git_project import GitProject
+from options import options
+from utils.result import BenchmarkMetadata, Result
+
+from .base import Benchmark, Suite, TracingType
 
 
 class RUNTIMES(Enum):
@@ -100,66 +100,57 @@ def setup(self) -> None:
 
     def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata = {
-            "SubmitKernel": BenchmarkMetadata(
-                type="group",
-                description="Measures CPU time overhead of submitting kernels through different APIs.",
-                notes="Each layer builds on top of the previous layer, adding functionality and overhead.\n"
-                "The first layer is the Level Zero API, the second is the Unified Runtime API, and the third is the SYCL API.\n"
-                "The UR v2 adapter noticeably reduces UR layer overhead, also improving SYCL performance.\n"
-                "Work is ongoing to reduce the overhead of the SYCL API\n",
-                tags=["submit", "micro", "SYCL", "UR", "L0"],
-                range_min=0.0,
-            ),
             "SinKernelGraph": BenchmarkMetadata(
                 type="group",
                 unstable="This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
                 tags=["submit", "memory", "proxy", "SYCL", "UR", "L0", "graph"],
             ),
-            "SubmitGraph": BenchmarkMetadata(
-                type="group", tags=["submit", "micro", "SYCL", "UR", "L0", "graph"]
-            ),
             "FinalizeGraph": BenchmarkMetadata(
                 type="group", tags=["finalize", "micro", "SYCL", "graph"]
             ),
         }
 
         # Add metadata for all SubmitKernel group variants
-        base_metadata = metadata["SubmitKernel"]
-
+        submit_kernel_metadata = BenchmarkMetadata(
+            type="group",
+            notes="Each layer builds on top of the previous layer, adding functionality and overhead.\n"
+            "The first layer is the Level Zero API, the second is the Unified Runtime API, and the third is the SYCL API.\n"
+            "The UR v2 adapter noticeably reduces UR layer overhead, also improving SYCL performance.\n"
+            "Work is ongoing to reduce the overhead of the SYCL API\n",
+            tags=["submit", "micro", "SYCL", "UR", "L0"],
+            range_min=0.0,
+        )
         for order in ["in order", "out of order"]:
             for completion in ["", " with completion"]:
                 for events in ["", " using events"]:
                     group_name = f"SubmitKernel {order}{completion}{events} long kernel"
-                    metadata[group_name] = BenchmarkMetadata(
-                        type="group",
-                        description=f"Measures CPU time overhead of submitting {order} kernels with longer execution times through different APIs.",
-                        notes=base_metadata.notes,
-                        tags=base_metadata.tags,
-                        range_min=base_metadata.range_min,
+                    metadata[group_name] = copy.deepcopy(submit_kernel_metadata)
+                    metadata[group_name].description = (
+                        f"Measures CPU time overhead of submitting {order} kernels with longer execution times through different APIs."
                     )
-
                     # CPU count variants
                     cpu_count_group = f"{group_name}, CPU count"
-                    metadata[cpu_count_group] = BenchmarkMetadata(
-                        type="group",
-                        description=f"Measures CPU time overhead of submitting {order} kernels with longer execution times through different APIs.",
-                        notes=base_metadata.notes,
-                        tags=base_metadata.tags,
-                        range_min=base_metadata.range_min,
+                    metadata[cpu_count_group] = copy.deepcopy(submit_kernel_metadata)
+                    metadata[cpu_count_group].description = (
+                        f"Measures CPU instruction count overhead of submitting {order} kernels with longer execution times through different APIs."
                     )
 
         # Add metadata for all SubmitGraph group variants
-        base_metadata = metadata["SubmitGraph"]
+        submit_graph_metadata = BenchmarkMetadata(
+            type="group", tags=["submit", "micro", "SYCL", "UR", "L0", "graph"]
+        )
         for order in ["in order", "out of order"]:
             for completion in ["", " with completion"]:
                 for events in ["", " using events"]:
                     for num_kernels in self.submit_graph_num_kernels:
-                        group_name = f"SubmitGraph {order}{completion}{events}, {num_kernels} kernels"
-                        metadata[group_name] = BenchmarkMetadata(
-                            type="group",
-                            tags=base_metadata.tags,
-                        )
-
+                        for host_tasks in ["", " use host tasks"]:
+                            group_name = f"SubmitGraph {order}{completion}{events}{host_tasks}, {num_kernels} kernels"
+                            metadata[group_name] = copy.deepcopy(submit_graph_metadata)
+                            # CPU count variants
+                            cpu_count_group = f"{group_name}, CPU count"
+                            metadata[cpu_count_group] = copy.deepcopy(
+                                submit_graph_metadata
+                            )
         return metadata
 
     def benchmarks(self) -> list[Benchmark]:
@@ -1088,6 +1079,22 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args
 
+    def get_metadata(self) -> dict[str, BenchmarkMetadata]:
+        metadata_dict = super().get_metadata()
+
+        # Add a deep-copied "CPU count" entry; its group stays "" when none is set.
+        cpu_count_name = self.name() + " CPU count"
+        cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
+        cpu_count_display_name = self.display_name() + ", CPU count"
+        cpu_count_explicit_group = (
+            self.explicit_group() + ", CPU count" if self.explicit_group() else ""
+        )
+        cpu_count_metadata.display_name = cpu_count_display_name
+        cpu_count_metadata.explicit_group = cpu_count_explicit_group
+        metadata_dict[cpu_count_name] = cpu_count_metadata
+
+        return metadata_dict
+
 
 class UllsEmptyKernel(ComputeBenchmark):
     def __init__(
 
@@ -86,7 +86,7 @@ def benchmarks(self) -> list[Benchmark]:
             # Gesumv(self), # validation failure
             # Gramschmidt(self), # validation failure
             KMeans(self),
-            LinRegCoeff(self),
+            # LinRegCoeff(self), # FIXME: causes serious GPU hangs on 25.31.34666.3
             # LinRegError(self), # run time < 1ms
             # MatmulChain(self), # validation failure
             MolDyn(self),
 
@@ -164,7 +164,7 @@ static bool isDeviceBinaryTypeSupported(context_impl &ContextImpl,
 [[maybe_unused]] auto VecToString = [](auto &Vec) -> std::string {
   std::ostringstream Out;
   Out << "{";
-  for (auto Elem : Vec)
+  for (const auto &Elem : Vec)
     Out << Elem << " ";
   Out << "}";
   return Out.str();
 
@@ -508,6 +508,61 @@ EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
   return ResultEvent;
 }
 
+EventImplPtr queue_impl::submit_command_to_graph(
+    ext::oneapi::experimental::detail::graph_impl &GraphImpl,
+    std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
+    sycl::ext::oneapi::experimental::node_type UserFacingNodeType) {
+  auto EventImpl = detail::event_impl::create_completed_host_event();
+  EventImpl->setSubmittedQueue(weak_from_this());
+  ext::oneapi::experimental::detail::node_impl *NodeImpl = nullptr;
+
+  // GraphImpl is both read and modified below, so take its write lock for
+  // the remainder of this function.
+  ext::oneapi::experimental::detail::graph_impl::WriteLock Lock(
+      GraphImpl.MMutex);
+
+  ext::oneapi::experimental::node_type NodeType =
+      UserFacingNodeType != ext::oneapi::experimental::node_type::empty
+          ? UserFacingNodeType
+          : ext::oneapi::experimental::detail::getNodeTypeFromCG(CGType);
+
+  // Create the graph node for this command-group, with its implicit deps.
+  if (isInOrder()) {
+    // In-order queues create implicit linear dependencies between nodes.
+    // Look up the last node this queue added to the graph (if any) so the
+    // new node can list it as a predecessor.
+    std::vector<ext::oneapi::experimental::detail::node_impl *> Deps;
+    if (ext::oneapi::experimental::detail::node_impl *DependentNode =
+            GraphImpl.getLastInorderNode(this)) {
+      Deps.push_back(DependentNode);
+    }
+    NodeImpl = &GraphImpl.add(NodeType, std::move(CommandGroup), Deps);
+
+    // Remember the new node as this queue's latest in-order node, so it can
+    // serve as the dependency for any further nodes recorded from this
+    // queue.
+    GraphImpl.setLastInorderNode(*this, *NodeImpl);
+  } else {
+    ext::oneapi::experimental::detail::node_impl *LastBarrierRecordedFromQueue =
+        GraphImpl.getBarrierDep(weak_from_this());
+    std::vector<ext::oneapi::experimental::detail::node_impl *> Deps;
+
+    if (LastBarrierRecordedFromQueue) {
+      Deps.push_back(LastBarrierRecordedFromQueue);
+    }
+    NodeImpl = &GraphImpl.add(NodeType, std::move(CommandGroup), Deps);
+
+    if (NodeImpl->MCGType == sycl::detail::CGType::Barrier) {
+      GraphImpl.setBarrierDep(weak_from_this(), *NodeImpl);
+    }
+  }
+
+  // Tie the host event created above to the new node and hand it back.
+  GraphImpl.addEventForNode(EventImpl, *NodeImpl);
+
+  return EventImpl;
+}
+
 EventImplPtr queue_impl::submit_kernel_direct_impl(
     const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
     detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
@@ -547,6 +602,11 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
         CodeLoc));
     CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
 
+    if (auto GraphImpl = getCommandGraph(); GraphImpl) {
+      return submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
+                                     detail::CGType::Kernel);
+    }
+
     return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
                                                   *this, true);
   };
 
@@ -642,6 +642,12 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
 
   bool hasCommandGraph() const { return !MGraph.expired(); }
 
+  EventImplPtr submit_command_to_graph(
+      ext::oneapi::experimental::detail::graph_impl &GraphImpl,
+      std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
+      sycl::ext::oneapi::experimental::node_type UserFacingNodeType =
+          ext::oneapi::experimental::node_type::empty);
+
   unsigned long long getQueueID() { return MQueueID; }
 
   void *getTraceEvent() { return MTraceEvent; }
 
@@ -2468,14 +2468,16 @@ static ur_result_t SetKernelParamsAndLaunch(
         /* pPropSizeRet = */ nullptr);
 
     const bool EnforcedLocalSize =
-        (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
-         RequiredWGSize[2] != 0);
+        (RequiredWGSize[0] != 0 &&
+         (NDRDesc.Dims < 2 || RequiredWGSize[1] != 0) &&
+         (NDRDesc.Dims < 3 || RequiredWGSize[2] != 0));
     if (EnforcedLocalSize)
       LocalSize = RequiredWGSize;
   }
-  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 ||
-                         NDRDesc.GlobalOffset[1] != 0 ||
-                         NDRDesc.GlobalOffset[2] != 0;
+
+  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 &&
+                         (NDRDesc.Dims < 2 || NDRDesc.GlobalOffset[1] != 0) &&
+                         (NDRDesc.Dims < 3 || NDRDesc.GlobalOffset[2] != 0);
 
   std::vector<ur_kernel_launch_property_t> property_list;
 
@@ -2610,6 +2612,10 @@ ur_result_t enqueueImpCommandBufferKernel(
   size_t RequiredWGSize[3] = {0, 0, 0};
   size_t *LocalSize = nullptr;
 
+  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 &&
+                         (NDRDesc.Dims < 2 || NDRDesc.GlobalOffset[1] != 0) &&
+                         (NDRDesc.Dims < 3 || NDRDesc.GlobalOffset[2] != 0);
+
   if (HasLocalSize)
     LocalSize = &NDRDesc.LocalSize[0];
   else {
@@ -2620,8 +2626,9 @@ ur_result_t enqueueImpCommandBufferKernel(
         /* pPropSizeRet = */ nullptr);
 
     const bool EnforcedLocalSize =
-        (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
-         RequiredWGSize[2] != 0);
+        (RequiredWGSize[0] != 0 &&
+         (NDRDesc.Dims < 2 || RequiredWGSize[1] != 0) &&
+         (NDRDesc.Dims < 3 || RequiredWGSize[2] != 0));
     if (EnforcedLocalSize)
       LocalSize = RequiredWGSize;
   }
@@ -2637,7 +2644,8 @@ ur_result_t enqueueImpCommandBufferKernel(
 
   ur_result_t Res =
       Adapter.call_nocheck<UrApiKind::urCommandBufferAppendKernelLaunchExp>(
-          CommandBuffer, UrKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
+          CommandBuffer, UrKernel, NDRDesc.Dims,
+          HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr,
           &NDRDesc.GlobalSize[0], LocalSize, AltUrKernels.size(),
           AltUrKernels.size() ? AltUrKernels.data() : nullptr,
           SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() : nullptr, 0,
Original file line number	Diff line number	Diff line change
`@@ -1,10 +1,10 @@`
`1`	`1`	`{`
`2`	`2`	`"linux": {`
`3`	`3`	`"igc_dev": {`
`4`		`- "github_tag": "igc-dev-e146785",`
`5`		`- "version": "e146785",`
`6`		`- "updated_at": "2025-10-02T03:05:40Z",`
`7`		`- "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/4161218080/zip",`
	`4`	`+ "github_tag": "igc-dev-e4b64c1",`
	`5`	`+ "version": "e4b64c1",`
	`6`	`+ "updated_at": "2025-10-05T10:41:23Z",`
	`7`	`+ "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/4185473239/zip",`
`8`	`8`	`"root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"`
`9`	`9`	`}`
`10`	`10`	`}`