Skip to content

Commit 798c1ca

Browse files
committed
Merge branch 'sycl' into no_handler_scheduler_bypass
2 parents 43b4b3a + 30215eb commit 798c1ca

File tree

23 files changed

+189
-129
lines changed

23 files changed

+189
-129
lines changed

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -942,7 +942,29 @@ static void addBackendOptions(const ArgList &Args,
942942
SmallVector<StringRef, 8> &CmdArgs, bool IsCPU) {
943943
StringRef OptC =
944944
Args.getLastArgValue(OPT_sycl_backend_compile_options_from_image_EQ);
945-
OptC.split(CmdArgs, " ", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
945+
if (IsCPU) {
946+
OptC.split(CmdArgs, " ", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
947+
} else {
948+
// ocloc -options args need to be comma separated, e.g. `-options
949+
// "-g,-cl-opt-disable"`. Otherwise, only the first arg is processed by
950+
// ocloc as an arg for -options, and the rest are processed as standalone
951+
// flags, possibly leading to errors.
952+
// split function here returns a pair with everything before the separator
953+
// ("-options") in the first member of the pair, and everything after the
954+
// separator in the second part of the pair. The separator is not included
955+
// in any of them.
956+
auto [BeforeOptions, AfterOptions] = OptC.split("-options ");
957+
// Only add if not empty, an empty arg can lead to ocloc errors.
958+
if (!BeforeOptions.empty())
959+
CmdArgs.push_back(BeforeOptions);
960+
if (!AfterOptions.empty()) {
961+
// Separator not included by the split function, so explicitly added here.
962+
CmdArgs.push_back("-options");
963+
std::string Replace = AfterOptions.str();
964+
std::replace(Replace.begin(), Replace.end(), ' ', ',');
965+
CmdArgs.push_back(Args.MakeArgString(Replace));
966+
}
967+
}
946968
StringRef OptL =
947969
Args.getLastArgValue(OPT_sycl_backend_link_options_from_image_EQ);
948970
OptL.split(CmdArgs, " ", /*MaxSplit=*/-1, /*KeepEmpty=*/false);

devops/dependencies-igc-dev.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"linux": {
33
"igc_dev": {
4-
"github_tag": "igc-dev-e146785",
5-
"version": "e146785",
6-
"updated_at": "2025-10-02T03:05:40Z",
7-
"url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/4161218080/zip",
4+
"github_tag": "igc-dev-e4b64c1",
5+
"version": "e4b64c1",
6+
"updated_at": "2025-10-05T10:41:23Z",
7+
"url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/4185473239/zip",
88
"root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
99
}
1010
}

devops/scripts/benchmarks/benches/compute.py

Lines changed: 48 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,19 @@
33
# See LICENSE.TXT
44
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
55

6-
from itertools import product
6+
import copy
77
import csv
88
import io
9-
import copy
109
import math
1110
from enum import Enum
11+
from itertools import product
1212
from pathlib import Path
1313

14-
from .base import Benchmark, Suite, TracingType
15-
from utils.result import BenchmarkMetadata, Result
16-
from .base import Benchmark, Suite
17-
from options import options
1814
from git_project import GitProject
15+
from options import options
16+
from utils.result import BenchmarkMetadata, Result
17+
18+
from .base import Benchmark, Suite, TracingType
1919

2020

2121
class RUNTIMES(Enum):
@@ -100,66 +100,57 @@ def setup(self) -> None:
100100

101101
def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
102102
metadata = {
103-
"SubmitKernel": BenchmarkMetadata(
104-
type="group",
105-
description="Measures CPU time overhead of submitting kernels through different APIs.",
106-
notes="Each layer builds on top of the previous layer, adding functionality and overhead.\n"
107-
"The first layer is the Level Zero API, the second is the Unified Runtime API, and the third is the SYCL API.\n"
108-
"The UR v2 adapter noticeably reduces UR layer overhead, also improving SYCL performance.\n"
109-
"Work is ongoing to reduce the overhead of the SYCL API\n",
110-
tags=["submit", "micro", "SYCL", "UR", "L0"],
111-
range_min=0.0,
112-
),
113103
"SinKernelGraph": BenchmarkMetadata(
114104
type="group",
115105
unstable="This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
116106
tags=["submit", "memory", "proxy", "SYCL", "UR", "L0", "graph"],
117107
),
118-
"SubmitGraph": BenchmarkMetadata(
119-
type="group", tags=["submit", "micro", "SYCL", "UR", "L0", "graph"]
120-
),
121108
"FinalizeGraph": BenchmarkMetadata(
122109
type="group", tags=["finalize", "micro", "SYCL", "graph"]
123110
),
124111
}
125112

126113
# Add metadata for all SubmitKernel group variants
127-
base_metadata = metadata["SubmitKernel"]
128-
114+
submit_kernel_metadata = BenchmarkMetadata(
115+
type="group",
116+
notes="Each layer builds on top of the previous layer, adding functionality and overhead.\n"
117+
"The first layer is the Level Zero API, the second is the Unified Runtime API, and the third is the SYCL API.\n"
118+
"The UR v2 adapter noticeably reduces UR layer overhead, also improving SYCL performance.\n"
119+
"Work is ongoing to reduce the overhead of the SYCL API\n",
120+
tags=["submit", "micro", "SYCL", "UR", "L0"],
121+
range_min=0.0,
122+
)
129123
for order in ["in order", "out of order"]:
130124
for completion in ["", " with completion"]:
131125
for events in ["", " using events"]:
132126
group_name = f"SubmitKernel {order}{completion}{events} long kernel"
133-
metadata[group_name] = BenchmarkMetadata(
134-
type="group",
135-
description=f"Measures CPU time overhead of submitting {order} kernels with longer execution times through different APIs.",
136-
notes=base_metadata.notes,
137-
tags=base_metadata.tags,
138-
range_min=base_metadata.range_min,
127+
metadata[group_name] = copy.deepcopy(submit_kernel_metadata)
128+
metadata[group_name].description = (
129+
f"Measures CPU time overhead of submitting {order} kernels with longer execution times through different APIs."
139130
)
140-
141131
# CPU count variants
142132
cpu_count_group = f"{group_name}, CPU count"
143-
metadata[cpu_count_group] = BenchmarkMetadata(
144-
type="group",
145-
description=f"Measures CPU time overhead of submitting {order} kernels with longer execution times through different APIs.",
146-
notes=base_metadata.notes,
147-
tags=base_metadata.tags,
148-
range_min=base_metadata.range_min,
133+
metadata[cpu_count_group] = copy.deepcopy(submit_kernel_metadata)
134+
metadata[cpu_count_group].description = (
135+
f"Measures CPU instruction count overhead of submitting {order} kernels with longer execution times through different APIs."
149136
)
150137

151138
# Add metadata for all SubmitGraph group variants
152-
base_metadata = metadata["SubmitGraph"]
139+
submit_graph_metadata = BenchmarkMetadata(
140+
type="group", tags=["submit", "micro", "SYCL", "UR", "L0", "graph"]
141+
)
153142
for order in ["in order", "out of order"]:
154143
for completion in ["", " with completion"]:
155144
for events in ["", " using events"]:
156145
for num_kernels in self.submit_graph_num_kernels:
157-
group_name = f"SubmitGraph {order}{completion}{events}, {num_kernels} kernels"
158-
metadata[group_name] = BenchmarkMetadata(
159-
type="group",
160-
tags=base_metadata.tags,
161-
)
162-
146+
for host_tasks in ["", " use host tasks"]:
147+
group_name = f"SubmitGraph {order}{completion}{events}{host_tasks}, {num_kernels} kernels"
148+
metadata[group_name] = copy.deepcopy(submit_graph_metadata)
149+
# CPU count variants
150+
cpu_count_group = f"{group_name}, CPU count"
151+
metadata[cpu_count_group] = copy.deepcopy(
152+
submit_graph_metadata
153+
)
163154
return metadata
164155

165156
def benchmarks(self) -> list[Benchmark]:
@@ -1088,6 +1079,22 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
10881079
bin_args.append(f"--profilerType={self.profiler_type.value}")
10891080
return bin_args
10901081

1082+
def get_metadata(self) -> dict[str, BenchmarkMetadata]:
1083+
metadata_dict = super().get_metadata()
1084+
1085+
# Create CPU count variant with modified display name and explicit_group
1086+
cpu_count_name = self.name() + " CPU count"
1087+
cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
1088+
cpu_count_display_name = self.display_name() + ", CPU count"
1089+
cpu_count_explicit_group = (
1090+
self.explicit_group() + ", CPU count" if self.explicit_group() else ""
1091+
)
1092+
cpu_count_metadata.display_name = cpu_count_display_name
1093+
cpu_count_metadata.explicit_group = cpu_count_explicit_group
1094+
metadata_dict[cpu_count_name] = cpu_count_metadata
1095+
1096+
return metadata_dict
1097+
10911098

10921099
class UllsEmptyKernel(ComputeBenchmark):
10931100
def __init__(

devops/scripts/benchmarks/benches/syclbench.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def benchmarks(self) -> list[Benchmark]:
8686
# Gesumv(self), # validation failure
8787
# Gramschmidt(self), # validation failure
8888
KMeans(self),
89-
LinRegCoeff(self),
89+
# LinRegCoeff(self), # FIXME: causes serious GPU hangs on 25.31.34666.3
9090
# LinRegError(self), # run time < 1ms
9191
# MatmulChain(self), # validation failure
9292
MolDyn(self),

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ static bool isDeviceBinaryTypeSupported(context_impl &ContextImpl,
164164
[[maybe_unused]] auto VecToString = [](auto &Vec) -> std::string {
165165
std::ostringstream Out;
166166
Out << "{";
167-
for (auto Elem : Vec)
167+
for (const auto &Elem : Vec)
168168
Out << Elem << " ";
169169
Out << "}";
170170
return Out.str();

sycl/source/detail/queue_impl.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,61 @@ EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
508508
return ResultEvent;
509509
}
510510

511+
EventImplPtr queue_impl::submit_command_to_graph(
512+
ext::oneapi::experimental::detail::graph_impl &GraphImpl,
513+
std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
514+
sycl::ext::oneapi::experimental::node_type UserFacingNodeType) {
515+
auto EventImpl = detail::event_impl::create_completed_host_event();
516+
EventImpl->setSubmittedQueue(weak_from_this());
517+
ext::oneapi::experimental::detail::node_impl *NodeImpl = nullptr;
518+
519+
// GraphImpl is read and written in this scope so we lock this graph
520+
// with full priviledges.
521+
ext::oneapi::experimental::detail::graph_impl::WriteLock Lock(
522+
GraphImpl.MMutex);
523+
524+
ext::oneapi::experimental::node_type NodeType =
525+
UserFacingNodeType != ext::oneapi::experimental::node_type::empty
526+
? UserFacingNodeType
527+
: ext::oneapi::experimental::detail::getNodeTypeFromCG(CGType);
528+
529+
// Create a new node in the graph representing this command-group
530+
if (isInOrder()) {
531+
// In-order queues create implicit linear dependencies between nodes.
532+
// Find the last node added to the graph from this queue, so our new
533+
// node can set it as a predecessor.
534+
std::vector<ext::oneapi::experimental::detail::node_impl *> Deps;
535+
if (ext::oneapi::experimental::detail::node_impl *DependentNode =
536+
GraphImpl.getLastInorderNode(this)) {
537+
Deps.push_back(DependentNode);
538+
}
539+
NodeImpl = &GraphImpl.add(NodeType, std::move(CommandGroup), Deps);
540+
541+
// If we are recording an in-order queue remember the new node, so it
542+
// can be used as a dependency for any more nodes recorded from this
543+
// queue.
544+
GraphImpl.setLastInorderNode(*this, *NodeImpl);
545+
} else {
546+
ext::oneapi::experimental::detail::node_impl *LastBarrierRecordedFromQueue =
547+
GraphImpl.getBarrierDep(weak_from_this());
548+
std::vector<ext::oneapi::experimental::detail::node_impl *> Deps;
549+
550+
if (LastBarrierRecordedFromQueue) {
551+
Deps.push_back(LastBarrierRecordedFromQueue);
552+
}
553+
NodeImpl = &GraphImpl.add(NodeType, std::move(CommandGroup), Deps);
554+
555+
if (NodeImpl->MCGType == sycl::detail::CGType::Barrier) {
556+
GraphImpl.setBarrierDep(weak_from_this(), *NodeImpl);
557+
}
558+
}
559+
560+
// Associate an event with this new node and return the event.
561+
GraphImpl.addEventForNode(EventImpl, *NodeImpl);
562+
563+
return EventImpl;
564+
}
565+
511566
EventImplPtr queue_impl::submit_kernel_direct_impl(
512567
const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
513568
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
@@ -547,6 +602,11 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
547602
CodeLoc));
548603
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
549604

605+
if (auto GraphImpl = getCommandGraph(); GraphImpl) {
606+
return submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
607+
detail::CGType::Kernel);
608+
}
609+
550610
return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
551611
*this, true);
552612
};

sycl/source/detail/queue_impl.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,12 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
642642

643643
bool hasCommandGraph() const { return !MGraph.expired(); }
644644

645+
EventImplPtr submit_command_to_graph(
646+
ext::oneapi::experimental::detail::graph_impl &GraphImpl,
647+
std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
648+
sycl::ext::oneapi::experimental::node_type UserFacingNodeType =
649+
ext::oneapi::experimental::node_type::empty);
650+
645651
unsigned long long getQueueID() { return MQueueID; }
646652

647653
void *getTraceEvent() { return MTraceEvent; }

sycl/source/detail/scheduler/commands.cpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2468,14 +2468,16 @@ static ur_result_t SetKernelParamsAndLaunch(
24682468
/* pPropSizeRet = */ nullptr);
24692469

24702470
const bool EnforcedLocalSize =
2471-
(RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2472-
RequiredWGSize[2] != 0);
2471+
(RequiredWGSize[0] != 0 &&
2472+
(NDRDesc.Dims < 2 || RequiredWGSize[1] != 0) &&
2473+
(NDRDesc.Dims < 3 || RequiredWGSize[2] != 0));
24732474
if (EnforcedLocalSize)
24742475
LocalSize = RequiredWGSize;
24752476
}
2476-
const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 ||
2477-
NDRDesc.GlobalOffset[1] != 0 ||
2478-
NDRDesc.GlobalOffset[2] != 0;
2477+
2478+
const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 &&
2479+
(NDRDesc.Dims < 2 || NDRDesc.GlobalOffset[1] != 0) &&
2480+
(NDRDesc.Dims < 3 || NDRDesc.GlobalOffset[2] != 0);
24792481

24802482
std::vector<ur_kernel_launch_property_t> property_list;
24812483

@@ -2610,6 +2612,10 @@ ur_result_t enqueueImpCommandBufferKernel(
26102612
size_t RequiredWGSize[3] = {0, 0, 0};
26112613
size_t *LocalSize = nullptr;
26122614

2615+
const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 &&
2616+
(NDRDesc.Dims < 2 || NDRDesc.GlobalOffset[1] != 0) &&
2617+
(NDRDesc.Dims < 3 || NDRDesc.GlobalOffset[2] != 0);
2618+
26132619
if (HasLocalSize)
26142620
LocalSize = &NDRDesc.LocalSize[0];
26152621
else {
@@ -2620,8 +2626,9 @@ ur_result_t enqueueImpCommandBufferKernel(
26202626
/* pPropSizeRet = */ nullptr);
26212627

26222628
const bool EnforcedLocalSize =
2623-
(RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
2624-
RequiredWGSize[2] != 0);
2629+
(RequiredWGSize[0] != 0 &&
2630+
(NDRDesc.Dims < 2 || RequiredWGSize[1] != 0) &&
2631+
(NDRDesc.Dims < 3 || RequiredWGSize[2] != 0));
26252632
if (EnforcedLocalSize)
26262633
LocalSize = RequiredWGSize;
26272634
}
@@ -2637,7 +2644,8 @@ ur_result_t enqueueImpCommandBufferKernel(
26372644

26382645
ur_result_t Res =
26392646
Adapter.call_nocheck<UrApiKind::urCommandBufferAppendKernelLaunchExp>(
2640-
CommandBuffer, UrKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
2647+
CommandBuffer, UrKernel, NDRDesc.Dims,
2648+
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr,
26412649
&NDRDesc.GlobalSize[0], LocalSize, AltUrKernels.size(),
26422650
AltUrKernels.size() ? AltUrKernels.data() : nullptr,
26432651
SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() : nullptr, 0,

0 commit comments

Comments
 (0)