Skip to content

Commit cd920b6

Browse files
authored
Merge pull request swiftlang#19910 from palimondo/fluctuation-of-the-pupil
[benchmark] More Robust Benchmark_Driver
2 parents 9f7ee56 + 897b9ef commit cd920b6

File tree

7 files changed

+412
-119
lines changed

7 files changed

+412
-119
lines changed

benchmark/cmake/modules/AddSwiftBenchmarkSuite.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -665,11 +665,11 @@ function(swift_benchmark_compile)
665665
COMMAND "${swift-bin-dir}/Benchmark_Driver" "run"
666666
"-o" "O" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
667667
"--swift-repo" "${SWIFT_SOURCE_DIR}"
668-
"--iterations" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}"
668+
"--independent-samples" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}"
669669
COMMAND "${swift-bin-dir}/Benchmark_Driver" "run"
670670
"-o" "Onone" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
671671
"--swift-repo" "${SWIFT_SOURCE_DIR}"
672-
"--iterations" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}"
672+
"--independent-samples" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}"
673673
COMMAND "${swift-bin-dir}/Benchmark_Driver" "compare"
674674
"--log-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
675675
"--swift-repo" "${SWIFT_SOURCE_DIR}"

benchmark/scripts/Benchmark_Driver

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -133,27 +133,31 @@ class BenchmarkDriver(object):
133133
benchmarks.intersection(set(self.all_tests)).union(indexed_names)))
134134

135135
def run(self, test, num_samples=None, num_iters=None,
136-
verbose=None, measure_memory=False):
136+
verbose=None, measure_memory=False, quantile=None):
137137
"""Execute benchmark and gather results."""
138-
num_samples = num_samples or 1
138+
num_samples = num_samples or 0
139139
num_iters = num_iters or 0 # automatically determine N to run for 1s
140140

141141
cmd = self._cmd_run(
142-
test, num_samples, num_iters, verbose, measure_memory)
142+
test, num_samples, num_iters, verbose, measure_memory, quantile)
143143
output = self._invoke(cmd)
144144
result = self.parser.results_from_string(output).items()[0][1]
145145
return result
146146

147-
def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory):
147+
def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory,
148+
quantile):
148149
cmd = [self.test_harness, test]
149-
if num_samples > 1:
150+
if num_samples > 0:
150151
cmd.append('--num-samples={0}'.format(num_samples))
151152
if num_iters > 0:
152153
cmd.append('--num-iters={0}'.format(num_iters))
153154
if verbose:
154155
cmd.append('--verbose')
155156
if measure_memory:
156157
cmd.append('--memory')
158+
if quantile:
159+
cmd.append('--quantile={0}'.format(quantile))
160+
cmd.append('--delta')
157161
return cmd
158162

159163
def run_independent_samples(self, test):
@@ -166,8 +170,9 @@ class BenchmarkDriver(object):
166170
return a
167171

168172
return reduce(merge_results,
169-
[self.run(test, measure_memory=True)
170-
for _ in range(self.args.iterations)])
173+
[self.run(test, measure_memory=True,
174+
num_iters=1, quantile=20)
175+
for _ in range(self.args.independent_samples)])
171176

172177
def log_results(self, output, log_file=None):
173178
"""Log output to `log_file`.
@@ -182,7 +187,7 @@ class BenchmarkDriver(object):
182187
with open(log_file, 'w') as f:
183188
f.write(output)
184189

185-
RESULT = '{:>3} {:<25} {:>7} {:>7} {:>7} {:>8} {:>6} {:>10} {:>10}'
190+
RESULT = '{:>3} {:<25} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}'
186191

187192
def run_and_log(self, csv_console=True):
188193
"""Run benchmarks and continuously log results to the console.
@@ -201,14 +206,15 @@ class BenchmarkDriver(object):
201206
def console_log(values):
202207
print(format(values))
203208

204-
console_log(['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'MAX(μs)', # header
205-
'MEAN(μs)', 'SD(μs)', 'MEDIAN(μs)', 'MAX_RSS(B)'])
206-
207209
def result_values(r):
208-
return map(str, [r.test_num, r.name, r.num_samples, r.min, r.max,
209-
int(r.mean), int(r.sd), r.median, r.max_rss])
210-
211-
results = []
210+
return map(str, [r.test_num, r.name, r.num_samples, r.min,
211+
r.samples.q1, r.median, r.samples.q3, r.max,
212+
r.max_rss])
213+
214+
header = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'Q1(μs)', 'MEDIAN(μs)',
215+
'Q3(μs)', 'MAX(μs)', 'MAX_RSS(B)']
216+
console_log(header)
217+
results = [header]
212218
for test in self.tests:
213219
result = result_values(self.run_independent_samples(test))
214220
console_log(result)
@@ -290,6 +296,7 @@ class BenchmarkDoctor(object):
290296
self._name_matches_capital_words_convention,
291297
self._name_is_at_most_40_chars_long,
292298
self._no_setup_overhead,
299+
self._reasonable_setup_time,
293300
self._optimized_runtime_in_range,
294301
self._constant_memory_use
295302
]
@@ -341,15 +348,20 @@ class BenchmarkDoctor(object):
341348
for correction in [(setup / i) for i in [1, 2]]
342349
] for result in i_series])
343350

344-
if 2500 < runtime:
345-
log = (BenchmarkDoctor.log_runtime.warning if runtime < 500000 else
351+
threshold = 1000
352+
if threshold < runtime:
353+
log = (BenchmarkDoctor.log_runtime.warning if runtime < 10000 else
346354
BenchmarkDoctor.log_runtime.error)
347355
caveat = '' if setup == 0 else ' (excluding the setup overhead)'
348356
log("'%s' execution took at least %d μs%s.", name, runtime, caveat)
349-
factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2))))
357+
358+
        def factor(base):  # suitable divisor that's integer power of base
359+
return int(pow(base, math.ceil(
360+
math.log(runtime / float(threshold), base))))
361+
350362
BenchmarkDoctor.log_runtime.info(
351-
"Decrease the workload of '%s' by a factor of %d, "
352-
"to be less than 2500 μs.", name, factor)
363+
"Decrease the workload of '%s' by a factor of %d (%d), to be "
364+
"less than %d μs.", name, factor(2), factor(10), threshold)
353365

354366
@staticmethod
355367
def _setup_overhead(measurements):
@@ -372,6 +384,17 @@ class BenchmarkDoctor(object):
372384
'Move initialization of benchmark data to the `setUpFunction` '
373385
'registered in `BenchmarkInfo`.')
374386

387+
@staticmethod
388+
def _reasonable_setup_time(measurements):
389+
setup = min([result.setup
390+
for result in BenchmarkDoctor._select(measurements)])
391+
if 200000 < setup: # 200 ms
392+
BenchmarkDoctor.log_runtime.error(
393+
"'%s' setup took at least %d μs.",
394+
measurements['name'], setup)
395+
BenchmarkDoctor.log_runtime.info(
396+
'The `setUpFunction` should take no more than 200 ms.')
397+
375398
@staticmethod
376399
def _constant_memory_use(measurements):
377400
select = BenchmarkDoctor._select
@@ -603,7 +626,7 @@ def parse_args(args):
603626
help='Run benchmarks and output results to stdout',
604627
parents=[shared_benchmarks_parser])
605628
run_parser.add_argument(
606-
'-i', '--iterations',
629+
'-i', '--independent-samples',
607630
help='number of times to run each test (default: 1)',
608631
type=positive_int, default=1)
609632
run_parser.add_argument(

benchmark/scripts/compare_perf_tests.py

Lines changed: 100 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ def __repr__(self):
5151
return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self)
5252

5353

54+
class Yield(namedtuple('Yield', 'before_sample after')):
55+
u"""Meta-measurement of when the Benchmark_X voluntarily yielded process.
56+
57+
`before_sample`: index of measurement taken just after returning from yield
58+
`after`: time elapsed since the previous yield in microseconds (μs)
59+
"""
60+
61+
5462
class PerformanceTestSamples(object):
5563
"""Collection of runtime samples from the benchmark execution.
5664
@@ -69,7 +77,7 @@ def __init__(self, name, samples=None):
6977
self.add(sample)
7078

7179
def __str__(self):
72-
"""Text summary of benchmark statisctics."""
80+
"""Text summary of benchmark statistics."""
7381
return (
7482
'{0.name!s} n={0.count!r} '
7583
'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} '
@@ -211,31 +219,60 @@ class PerformanceTestResult(object):
211219
Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize
212220
or Benchmark_Driver).
213221
214-
It depends on the log format emitted by the test driver in the form:
215-
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)
216-
217-
The last column, MAX_RSS, is emitted only for runs instrumented by the
218-
Benchmark_Driver to measure rough memory use during the execution of the
219-
benchmark.
222+
It supports 2 log formats emitted by the test driver. Legacy format with
223+
statistics for normal distribution (MEAN, SD):
224+
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)
225+
And new quantiles format with variable number of columns:
226+
#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
227+
#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
228+
The number of columns between MIN and MAX depends on the test driver's
229+
`--quantile` parameter. In both cases, the last column, MAX_RSS is optional.
220230
"""
221231

222-
def __init__(self, csv_row):
223-
"""Initialize from a row with 8 or 9 columns with benchmark summary.
232+
def __init__(self, csv_row, quantiles=False, memory=False, delta=False):
233+
"""Initialize from a row of multiple columns with benchmark summary.
224234
225235
The row is an iterable, such as a row provided by the CSV parser.
226236
"""
227-
self.test_num = csv_row[0] # Ordinal number of the test
228-
self.name = csv_row[1] # Name of the performance test
229-
self.num_samples = ( # Number of measurement samples taken
230-
int(csv_row[2]))
231-
self.min = int(csv_row[3]) # Minimum runtime (μs)
232-
self.max = int(csv_row[4]) # Maximum runtime (μs)
233-
self.mean = float(csv_row[5]) # Mean (average) runtime (μs)
234-
self.sd = float(csv_row[6]) # Standard Deviation (μs)
235-
self.median = int(csv_row[7]) # Median runtime (μs)
236-
self.max_rss = ( # Maximum Resident Set Size (B)
237-
int(csv_row[8]) if len(csv_row) > 8 else None)
238-
self.samples = None
237+
self.test_num = csv_row[0] # Ordinal number of the test
238+
self.name = csv_row[1] # Name of the performance test
239+
self.num_samples = int(csv_row[2]) # Number of measurements taken
240+
241+
if quantiles: # Variable number of columns representing quantiles
242+
runtimes = csv_row[3:-1] if memory else csv_row[3:]
243+
if delta:
244+
runtimes = [int(x) if x else 0 for x in runtimes]
245+
runtimes = reduce(lambda l, x: l.append(l[-1] + x) or  # running
246+
l if l else [x], runtimes, None) # total
247+
num_values = len(runtimes)
248+
if self.num_samples < num_values: # remove repeated samples
249+
quantile = num_values - 1
250+
qs = [float(i) / float(quantile) for i in range(0, num_values)]
251+
indices = [max(0, int(ceil(self.num_samples * float(q))) - 1)
252+
for q in qs]
253+
runtimes = [runtimes[indices.index(i)]
254+
for i in range(0, self.num_samples)]
255+
256+
self.samples = PerformanceTestSamples(
257+
self.name,
258+
[Sample(None, None, int(runtime)) for runtime in runtimes])
259+
self.samples.exclude_outliers(top_only=True)
260+
sams = self.samples
261+
self.min, self.max, self.median, self.mean, self.sd = \
262+
sams.min, sams.max, sams.median, sams.mean, sams.sd
263+
self.max_rss = ( # Maximum Resident Set Size (B)
264+
int(csv_row[-1]) if memory else None)
265+
else: # Legacy format with statistics for normal distribution.
266+
self.min = int(csv_row[3]) # Minimum runtime (μs)
267+
self.max = int(csv_row[4]) # Maximum runtime (μs)
268+
self.mean = float(csv_row[5]) # Mean (average) runtime (μs)
269+
self.sd = float(csv_row[6]) # Standard Deviation (μs)
270+
self.median = int(csv_row[7]) # Median runtime (μs)
271+
self.max_rss = ( # Maximum Resident Set Size (B)
272+
int(csv_row[8]) if len(csv_row) > 8 else None)
273+
self.samples = None
274+
self.yields = None
275+
self.setup = None
239276

240277
def __repr__(self):
241278
"""Short summary for debugging purposes."""
@@ -253,6 +290,7 @@ def merge(self, r):
253290
The use case here is comparing test results parsed from concatenated
254291
log files from multiple runs of benchmark driver.
255292
"""
293+
# Statistics
256294
if self.samples and r.samples:
257295
map(self.samples.add, r.samples.samples)
258296
sams = self.samples
@@ -266,8 +304,14 @@ def merge(self, r):
266304
(self.mean * self.num_samples) + (r.mean * r.num_samples)
267305
) / float(self.num_samples + r.num_samples)
268306
self.num_samples += r.num_samples
269-
self.max_rss = min(self.max_rss, r.max_rss)
270-
self.median, self.sd = 0, 0
307+
self.median, self.sd = None, None
308+
309+
# Metadata
310+
def minimum(a, b): # work around None being less than everything
311+
return (min(filter(lambda x: x is not None, [a, b])) if any([a, b])
312+
else None)
313+
self.max_rss = minimum(self.max_rss, r.max_rss)
314+
self.setup = minimum(self.setup, r.setup)
271315

272316

273317
class ResultComparison(object):
@@ -307,40 +351,48 @@ class LogParser(object):
307351
def __init__(self):
308352
"""Create instance of `LogParser`."""
309353
self.results = []
354+
self.quantiles, self.delta, self.memory = False, False, False
310355
self._reset()
311356

312357
def _reset(self):
313358
"""Reset parser to the default state for reading a new result."""
314-
self.samples, self.num_iters = [], 1
315-
self.max_rss, self.mem_pages = None, None
359+
self.samples, self.yields, self.num_iters = [], [], 1
360+
self.setup, self.max_rss, self.mem_pages = None, None, None
316361
self.voluntary_cs, self.involuntary_cs = None, None
317362

318363
# Parse lines like this
319364
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)
320-
results_re = re.compile(r'( *\d+[, \t]*[\w.]+[, \t]*' +
321-
r'[, \t]*'.join([r'[\d.]+'] * 6) +
322-
r'[, \t]*[\d.]*)') # optional MAX_RSS(B)
365+
results_re = re.compile(
366+
r'( *\d+[, \t]+[\w.]+[, \t]+' + # #,TEST
367+
r'[, \t]+'.join([r'\d+'] * 2) + # at least 2...
368+
r'(?:[, \t]+\d*)*)') # ...or more numeric columns
323369

324370
def _append_result(self, result):
325-
columns = result.split(',')
326-
if len(columns) < 8:
327-
columns = result.split()
328-
r = PerformanceTestResult(columns)
329-
if self.max_rss:
330-
r.max_rss = self.max_rss
331-
r.mem_pages = self.mem_pages
332-
r.voluntary_cs = self.voluntary_cs
333-
r.involuntary_cs = self.involuntary_cs
371+
columns = result.split(',') if ',' in result else result.split()
372+
r = PerformanceTestResult(
373+
columns, quantiles=self.quantiles, memory=self.memory,
374+
delta=self.delta)
375+
r.setup = self.setup
376+
r.max_rss = r.max_rss or self.max_rss
377+
r.mem_pages = self.mem_pages
378+
r.voluntary_cs = self.voluntary_cs
379+
r.involuntary_cs = self.involuntary_cs
334380
if self.samples:
335381
r.samples = PerformanceTestSamples(r.name, self.samples)
336382
r.samples.exclude_outliers()
337383
self.results.append(r)
384+
r.yields = self.yields or None
338385
self._reset()
339386

340387
def _store_memory_stats(self, max_rss, mem_pages):
341388
self.max_rss = int(max_rss)
342389
self.mem_pages = int(mem_pages)
343390

391+
def _configure_format(self, header):
392+
self.quantiles = 'MEAN' not in header
393+
self.memory = 'MAX_RSS' in header
394+
self.delta = '𝚫' in header
395+
344396
# Regular expression and action to take when it matches the parsed line
345397
state_actions = {
346398
results_re: _append_result,
@@ -355,6 +407,17 @@ def _store_memory_stats(self, max_rss, mem_pages):
355407
self.samples.append(
356408
Sample(int(i), int(self.num_iters), int(runtime)))),
357409

410+
re.compile(r'\s+SetUp (\d+)'):
411+
(lambda self, setup: setattr(self, 'setup', int(setup))),
412+
413+
re.compile(r'\s+Yielding after ~(\d+) μs'):
414+
(lambda self, since_last_yield:
415+
self.yields.append(
416+
Yield(len(self.samples), int(since_last_yield)))),
417+
418+
re.compile(r'( *#[, \t]+TEST[, \t]+SAMPLES[, \t]+MIN.*)'):
419+
_configure_format,
420+
358421
# Environmental statistics: memory usage and context switches
359422
re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'):
360423
_store_memory_stats,

0 commit comments

Comments
 (0)