
Commit bd6e8c9

Merge branch 'sycl' into do-alloc-use-pool
2 parents 59b3c3e + e37f75f commit bd6e8c9


46 files changed, +668 -528 lines changed

include/ur_api.h

Lines changed: 13 additions & 3 deletions
@@ -3666,6 +3666,11 @@ urUSMSharedAlloc(
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Free the USM memory object
 ///
+/// @details
+///     - Note that implementations are required to wait for previously enqueued
+///       commands that may be accessing `pMem` to finish before freeing the
+///       memory.
+///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
 ///     - ::UR_RESULT_ERROR_UNINITIALIZED
@@ -9486,13 +9491,17 @@ urEnqueueCooperativeKernelLaunchExp(
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hKernel`
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pLocalWorkSize`
 ///         + `NULL == pGroupCountRet`
 ///     - ::UR_RESULT_ERROR_INVALID_KERNEL
 UR_APIEXPORT ur_result_t UR_APICALL
 urKernelSuggestMaxCooperativeGroupCountExp(
     ur_kernel_handle_t hKernel,     ///< [in] handle of the kernel object
-    size_t localWorkSize,           ///< [in] number of local work-items that will form a work-group when the
-                                    ///< kernel is launched
+    uint32_t workDim,               ///< [in] number of dimensions, from 1 to 3, to specify the work-group
+                                    ///< work-items
+    const size_t *pLocalWorkSize,   ///< [in] pointer to an array of workDim unsigned values that specify the
+                                    ///< number of local work-items forming a work-group that will execute the
+                                    ///< kernel function.
     size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes,
                                     ///< that will be used when the kernel is launched
     uint32_t *pGroupCountRet        ///< [out] pointer to maximum number of groups
@@ -11028,7 +11037,8 @@ typedef struct ur_kernel_set_specialization_constants_params_t {
 /// allowing the callback the ability to modify the parameter's value
 typedef struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t {
     ur_kernel_handle_t *phKernel;
-    size_t *plocalWorkSize;
+    uint32_t *pworkDim;
+    const size_t **ppLocalWorkSize;
     size_t *pdynamicSharedMemorySize;
     uint32_t **ppGroupCountRet;
 } ur_kernel_suggest_max_cooperative_group_count_exp_params_t;

include/ur_ddi.h

Lines changed: 2 additions & 1 deletion
@@ -651,7 +651,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetKernelProcAddrTable_t)(
 /// @brief Function-pointer for urKernelSuggestMaxCooperativeGroupCountExp
 typedef ur_result_t(UR_APICALL *ur_pfnKernelSuggestMaxCooperativeGroupCountExp_t)(
     ur_kernel_handle_t,
-    size_t,
+    uint32_t,
+    const size_t *,
     size_t,
     uint32_t *);

include/ur_print.hpp

Lines changed: 8 additions & 2 deletions
@@ -13074,9 +13074,15 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
         *(params->phKernel));

     os << ", ";
-    os << ".localWorkSize = ";
+    os << ".workDim = ";
+
+    os << *(params->pworkDim);
+
+    os << ", ";
+    os << ".pLocalWorkSize = ";

-    os << *(params->plocalWorkSize);
+    ur::details::printPtr(os,
+                          *(params->ppLocalWorkSize));

     os << ", ";
     os << ".dynamicSharedMemorySize = ";

scripts/benchmarks/benches/base.py

Lines changed: 0 additions & 3 deletions
@@ -71,9 +71,6 @@ def run(self, env_vars) -> list[Result]:
     def teardown(self):
         raise NotImplementedError()

-    def ignore_iterations(self):
-        return False
-
 class Suite:
     def benchmarks(self) -> list[Benchmark]:
         raise NotImplementedError()

scripts/benchmarks/benches/compute.py

Lines changed: 33 additions & 4 deletions
@@ -50,6 +50,8 @@ def benchmarks(self) -> list[Benchmark]:
             return []

         benches = [
+            SubmitKernelL0(self, 0),
+            SubmitKernelL0(self, 1),
             SubmitKernelSYCL(self, 0),
             SubmitKernelSYCL(self, 1),
             QueueInOrderMemcpy(self, 0, 'Device', 'Device', 1024),
@@ -84,7 +86,7 @@ def parse_unit_type(compute_unit):
         return "instr"
     elif "[us]" in compute_unit:
         return "μs"
-    return "unknown"
+    return compute_unit.replace("[", "").replace("]", "")

 class ComputeBenchmark(Benchmark):
     def __init__(self, bench, name, test):
@@ -116,9 +118,9 @@ def run(self, env_vars) -> list[Result]:
         result = self.run_bench(command, env_vars)
         parsed_results = self.parse_output(result)
         ret = []
-        for label, mean, unit in parsed_results:
+        for label, median, stddev, unit in parsed_results:
             extra_label = " CPU count" if parse_unit_type(unit) == "instr" else ""
-            ret.append(Result(label=self.name() + extra_label, value=mean, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit)))
+            ret.append(Result(label=self.name() + extra_label, value=median, stddev=stddev, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit)))
         return ret

     def parse_output(self, output):
@@ -133,8 +135,11 @@ def parse_output(self, output):
             try:
                 label = data_row[0]
                 mean = float(data_row[1])
+                median = float(data_row[2])
+                # compute benchmarks report stddev as %
+                stddev = mean * (float(data_row[3].strip('%')) / 100.0)
                 unit = data_row[7]
-                results.append((label, mean, unit))
+                results.append((label, median, stddev, unit))
             except (ValueError, IndexError) as e:
                 raise ValueError(f"Error parsing output: {e}")
         if len(results) == 0:
@@ -184,6 +189,26 @@ def bin_args(self) -> list[str]:
             "--KernelExecTime=1"
         ]

+class SubmitKernelL0(ComputeBenchmark):
+    def __init__(self, bench, ioq):
+        self.ioq = ioq
+        super().__init__(bench, "api_overhead_benchmark_l0", "SubmitKernel")
+
+    def name(self):
+        order = "in order" if self.ioq else "out of order"
+        return f"api_overhead_benchmark_l0 SubmitKernel {order}"
+
+    def bin_args(self) -> list[str]:
+        return [
+            f"--Ioq={self.ioq}",
+            "--DiscardEvents=0",
+            "--MeasureCompletion=0",
+            "--iterations=100000",
+            "--Profiling=0",
+            "--NumKernels=10",
+            "--KernelExecTime=1"
+        ]
+
 class ExecImmediateCopyQueue(ComputeBenchmark):
     def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
         self.ioq = ioq
@@ -257,6 +282,10 @@ def __init__(self, bench, type, size, placement):
     def name(self):
         return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}"

+    # measurement is in GB/s
+    def lower_is_better(self):
+        return False
+
     def bin_args(self) -> list[str]:
         return [
             "--iterations=10000",

scripts/benchmarks/benches/llamacpp.py

Lines changed: 0 additions & 3 deletions
@@ -76,9 +76,6 @@ def name(self):
     def lower_is_better(self):
         return False

-    def ignore_iterations(self):
-        return True
-
     def run(self, env_vars) -> list[Result]:
         command = [
             f"{self.benchmark_bin}",

scripts/benchmarks/benches/options.py

Lines changed: 4 additions & 1 deletion
@@ -15,13 +15,16 @@ class Options:
     rebuild: bool = True
     benchmark_cwd: str = "INVALID"
     timeout: float = 600
-    iterations: int = 5
+    iterations: int = 3
     verbose: bool = False
     compare: Compare = Compare.LATEST
     compare_max: int = 10 # average/median over how many results
     output_html: bool = False
     output_markdown: bool = True
     dry_run: bool = False
+    # these two should probably be merged into one setting
+    stddev_threshold: float = 0.02
+    epsilon: float = 0.02

 options = Options()

scripts/benchmarks/benches/result.py

Lines changed: 4 additions & 1 deletion
@@ -18,7 +18,10 @@ class Result:
     stdout: str
     passed: bool = True
     unit: str = ""
-    # values should not be set by the benchmark
+    # stddev can be optionally set by the benchmark,
+    # if not set, it will be calculated automatically.
+    stddev: float = 0.0
+    # values below should not be set by the benchmark
     name: str = ""
     lower_is_better: bool = True
     git_hash: str = ''
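
With the new field, a benchmark can either report its own spread (as compute.py now does via the stddev keyword) or leave the 0.0 default and let process_results() in main.py fill it in from the per-iteration values. A hedged sketch with placeholder values, using only the field names visible in this diff:

    # Sketch only: placeholder values. A benchmark that measures its own spread
    # sets stddev explicitly when constructing the Result ...
    r1 = Result(label="api_overhead_benchmark_l0 SubmitKernel in order", value=9.8, stddev=0.25,
                command=["api_overhead_benchmark_l0"], env={}, stdout="...", unit="μs")

    # ... while one that does not simply omits it; the runner later overrides
    # the 0.0 default with the stddev it computes across iterations.
    r2 = Result(label="some other benchmark", value=123.4,
                command=["bench"], env={}, stdout="...", unit="token/s")
    assert r2.stddev == 0.0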

scripts/benchmarks/main.py

Lines changed: 102 additions & 33 deletions
@@ -18,10 +18,100 @@
 
 import argparse
 import re
+import statistics

 # Update this if you are changing the layout of the results files
 INTERNAL_WORKDIR_VERSION = '2.0'

+def run_iterations(benchmark: Benchmark, env_vars, iters: int, results: dict[str, list[Result]]):
+    for iter in range(iters):
+        print(f"running {benchmark.name()}, iteration {iter}... ", end='', flush=True)
+        bench_results = benchmark.run(env_vars)
+        if bench_results is None:
+            print(f"did not finish (OK for sycl-bench).")
+            break
+
+        for bench_result in bench_results:
+            # TODO: report failures in markdown/html ?
+            if not bench_result.passed:
+                print(f"complete ({bench_result.label}: verification FAILED)")
+                continue
+
+            print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).")
+
+            bench_result.name = bench_result.label
+            bench_result.lower_is_better = benchmark.lower_is_better()
+
+            if bench_result.label not in results:
+                results[bench_result.label] = []
+
+            results[bench_result.label].append(bench_result)
+
+# https://www.statology.org/modified-z-score/
+def modified_z_score(values: list[float]) -> list[float]:
+    median = statistics.median(values)
+    mad = statistics.median([abs(v - median) for v in values])
+    if mad == 0:
+        return [0] * len(values)
+    return [(0.6745 * (v - median)) / mad for v in values]
+
+def remove_outliers(results: dict[str, list[Result]], threshold: float = 3.5) -> dict[str, list[Result]]:
+    new_results = {}
+    for key, rlist in results.items():
+        # don't eliminate outliers on first pass
+        if len(rlist) <= options.iterations:
+            new_results[key] = rlist
+            continue
+
+        values = [r.value for r in rlist]
+        z_scores = modified_z_score(values)
+        filtered_rlist = [r for r, z in zip(rlist, z_scores) if abs(z) <= threshold]
+
+        if not filtered_rlist:
+            new_results[key] = rlist
+        else:
+            new_results[key] = filtered_rlist
+
+    return new_results
+
+def process_results(results: dict[str, list[Result]]) -> tuple[bool, list[Result]]:
+    processed: list[Result] = []
+    # technically, we can detect whether result is below or above threshold per
+    # individual result. However, we can't repeat benchmark runs with that
+    # granularity. So we just reject all results and try again.
+    valid_results = True # above stddev threshold
+
+    for label, rlist in remove_outliers(results).items():
+        if (len(rlist) == 0):
+            continue
+
+        if len(rlist) == 1:
+            processed.append(rlist[0])
+            continue
+
+        values = [r.value for r in rlist]
+
+        mean_value = statistics.mean(values)
+        stddev = statistics.stdev(values)
+
+        threshold = options.stddev_threshold * mean_value
+
+        if stddev > threshold:
+            print(f"stddev {stddev} above the threshold {threshold} for {label}")
+            valid_results = False
+
+        rlist.sort(key=lambda res: res.value)
+        median_index = len(rlist) // 2
+        median_result = rlist[median_index]
+
+        # only override the stddev if not already set
+        if median_result.stddev == 0.0:
+            median_result.stddev = stddev
+
+        processed.append(median_result)
+
+    return valid_results, processed
+
 def main(directory, additional_env_vars, save_name, compare_names, filter):
     prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)

@@ -65,36 +155,14 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
     for benchmark in benchmarks:
         try:
             merged_env_vars = {**additional_env_vars}
-            iteration_results = []
-            iterations = options.iterations if not benchmark.ignore_iterations() else 1
-            for iter in range(iterations):
-                print(f"running {benchmark.name()}, iteration {iter}... ", end='', flush=True)
-                bench_results = benchmark.run(merged_env_vars)
-                if bench_results is not None:
-                    for bench_result in bench_results:
-                        if bench_result.passed:
-                            print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).")
-                        else:
-                            print(f"complete ({bench_result.label}: verification FAILED)")
-                        iteration_results.append(bench_result)
-                else:
-                    print(f"did not finish (OK for sycl-bench).")
+            intermediate_results: dict[str, list[Result]] = {}
+            processed: list[Result] = []
+            for _ in range(5):
+                run_iterations(benchmark, merged_env_vars, options.iterations, intermediate_results)
+                valid, processed = process_results(intermediate_results)
+                if valid:
                     break
-
-            if len(iteration_results) == 0:
-                continue
-
-            for label in set([result.label for result in iteration_results]):
-                label_results = [result for result in iteration_results if result.label == label and result.passed == True]
-                if len(label_results) > 0:
-                    label_results.sort(key=lambda res: res.value)
-                    median_index = len(label_results) // 2
-                    median_result = label_results[median_index]
-
-                    median_result.name = label
-                    median_result.lower_is_better = benchmark.lower_is_better()
-
-                    results.append(median_result)
+            results += processed
         except Exception as e:
             if options.exit_on_failure:
                 raise e
@@ -164,14 +232,15 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
     parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
     parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
-    parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=5)
-    parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=600)
+    parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations)
+    parser.add_argument("--stddev-threshold", type=float, help='If stddev % is above this threshold, rerun all iterations', default=options.stddev_threshold)
+    parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout)
     parser.add_argument("--filter", type=str, help='Regex pattern to filter benchmarks by name.', default=None)
-    parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=0.005)
+    parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=options.epsilon)
     parser.add_argument("--verbose", help='Print output of all the commands.', action="store_true")
     parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true")
    parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value)
-    parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=10)
+    parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max)
     parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False)
     parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True)
     parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False)
