Skip to content

Commit ac4675b

Browse files
committed
Implement running 'benchmark thresholds update' multiple times
1 parent ff9629b commit ac4675b

11 files changed

+405
-98
lines changed

Plugins/BenchmarkCommandPlugin/BenchmarkCommandPlugin.swift

Lines changed: 116 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ import Glibc
4444
let skipLoadingBenchmarks = argumentExtractor.extractFlag(named: "skip-loading-benchmark-targets")
4545
let checkAbsoluteThresholds =
4646
checkAbsoluteThresholdsPath.count > 0 ? 1 : argumentExtractor.extractFlag(named: "check-absolute")
47+
let runCount = argumentExtractor.extractOption(named: "run-count")
48+
let relative = argumentExtractor.extractFlag(named: "relative")
49+
let range = argumentExtractor.extractFlag(named: "range")
4750
let groupingToUse = argumentExtractor.extractOption(named: "grouping")
4851
let metricsToUse = argumentExtractor.extractOption(named: "metric")
4952
let timeUnits = argumentExtractor.extractOption(named: "time-units")
@@ -234,6 +237,7 @@ import Glibc
234237
throw MyError.invalidArgument
235238
}
236239

240+
var totalRunCount = 1
237241
var skipLoadingBenchmarksFlagIsValid = skipLoadingBenchmarks == 0
238242
if commandToPerform == .thresholds {
239243
guard positionalArguments.count > 0,
@@ -267,6 +271,19 @@ import Glibc
267271
if positionalArguments.count > 0 {
268272
shouldBuildTargets = false
269273
}
274+
totalRunCount = runCount.first.flatMap { Int($0) } ?? 1
275+
/// These update the run count to 5 by default if it's set to 1.
276+
/// Using relative/range flags doesn't mean anything if we're not running multiple times.
277+
/// The benchmarks will need to be run multiple times in order to be able to calculate a
278+
/// relative/range of thresholds which satisfy all benchmark runs.
279+
if relative > 0 {
280+
totalRunCount = totalRunCount < 2 ? 5 : totalRunCount
281+
args.append("--wants-relative-thresholds")
282+
}
283+
if range > 0 {
284+
totalRunCount = totalRunCount < 2 ? 5 : totalRunCount
285+
args.append("--wants-range-thresholds")
286+
}
270287
break
271288
case .check:
272289
skipLoadingBenchmarksFlagIsValid = true
@@ -477,53 +494,108 @@ import Glibc
477494

478495
var failedBenchmarkCount = 0
479496

480-
try withCStrings(args) { cArgs in
481-
if debug > 0 {
482-
print("To debug, start \(benchmarkToolName) in LLDB using:")
483-
print("lldb \(benchmarkTool.string)")
484-
print("")
485-
print("Then launch \(benchmarkToolName) with:")
486-
print("run \(args.dropFirst().joined(separator: " "))")
487-
print("")
488-
return
489-
}
497+
var allFailureCount = 0
498+
let results: [Result<Void, Error>] = (0..<max(totalRunCount, 1))
499+
.map { runIdx in
500+
// If we're running multiple times, we need to add the run count to the arguments
501+
var args = args
502+
if totalRunCount > 1 {
503+
args += ["--run-number", "\(runIdx + 1)"]
504+
if quietRunning == 0 {
505+
print(
506+
"""
490507
491-
var pid: pid_t = 0
492-
var status = posix_spawn(&pid, benchmarkTool.string, nil, nil, cArgs, environ)
493-
494-
if status == 0 {
495-
if waitpid(pid, &status, 0) != -1 {
496-
// Ok, this sucks, but there is no way to get a C support target for plugins and
497-
// the way the status is extracted portably is with macros - so we just need to
498-
// reimplement the logic here in Swift according to the waitpid man page to
499-
// get some nicer feedback on failure reason.
500-
guard let waitStatus = ExitCode(rawValue: (status & 0xFF00) >> 8) else {
501-
print("One or more benchmarks returned an unexpected return code \(status)")
502-
throw MyError.benchmarkUnexpectedReturnCode
508+
Running the command multiple times, round \(runIdx + 1) of \(totalRunCount)...
509+
"""
510+
)
503511
}
504-
switch waitStatus {
505-
case .success:
506-
break
507-
case .baselineNotFound:
508-
throw MyError.baselineNotFound
509-
case .genericFailure:
510-
print("One or more benchmark suites crashed during runtime.")
511-
throw MyError.benchmarkCrashed
512-
case .thresholdRegression:
513-
throw MyError.benchmarkThresholdRegression
514-
case .thresholdImprovement:
515-
throw MyError.benchmarkThresholdImprovement
516-
case .benchmarkJobFailed:
517-
failedBenchmarkCount += 1
518-
case .noPermissions:
519-
throw MyError.noPermissions
512+
}
513+
514+
return Result<Void, Error> {
515+
try withCStrings(args) { cArgs in
516+
/// We'll decrement this in the success path
517+
allFailureCount += 1
518+
519+
if debug > 0 {
520+
print("To debug, start \(benchmarkToolName) in LLDB using:")
521+
print("lldb \(benchmarkTool.string)")
522+
print("")
523+
print("Then launch \(benchmarkToolName) with:")
524+
print("run \(args.dropFirst().joined(separator: " "))")
525+
print("")
526+
return
527+
}
528+
529+
var pid: pid_t = 0
530+
var status = posix_spawn(&pid, benchmarkTool.string, nil, nil, cArgs, environ)
531+
532+
if status == 0 {
533+
if waitpid(pid, &status, 0) != -1 {
534+
// Ok, this sucks, but there is no way to get a C support target for plugins and
535+
// the way the status is extracted portably is with macros - so we just need to
536+
// reimplement the logic here in Swift according to the waitpid man page to
537+
// get some nicer feedback on failure reason.
538+
guard let waitStatus = ExitCode(rawValue: (status & 0xFF00) >> 8) else {
539+
print("One or more benchmarks returned an unexpected return code \(status)")
540+
throw MyError.benchmarkUnexpectedReturnCode
541+
}
542+
switch waitStatus {
543+
case .success:
544+
allFailureCount -= 1
545+
case .baselineNotFound:
546+
throw MyError.baselineNotFound
547+
case .genericFailure:
548+
print("One or more benchmark suites crashed during runtime.")
549+
throw MyError.benchmarkCrashed
550+
case .thresholdRegression:
551+
throw MyError.benchmarkThresholdRegression
552+
case .thresholdImprovement:
553+
throw MyError.benchmarkThresholdImprovement
554+
case .benchmarkJobFailed:
555+
failedBenchmarkCount += 1
556+
case .noPermissions:
557+
throw MyError.noPermissions
558+
}
559+
} else {
560+
print(
561+
"waitpid() for pid \(pid) returned a non-zero exit code \(status), errno = \(errno)"
562+
)
563+
exit(errno)
564+
}
565+
} else {
566+
print("Failed to run BenchmarkTool, posix_spawn() returned [\(status)]")
567+
}
520568
}
521-
} else {
522-
print("waitpid() for pid \(pid) returned a non-zero exit code \(status), errno = \(errno)")
523-
exit(errno)
524569
}
525-
} else {
526-
print("Failed to run BenchmarkTool, posix_spawn() returned [\(status)]")
570+
}
571+
572+
switch results.count {
573+
case ...0:
574+
throw MyError.unknownFailure
575+
case 1:
576+
try results[0].get()
577+
default:
578+
if allFailureCount > 0 {
579+
print(
580+
"""
581+
Ran BenchmarkTool \(results.count) times, but it failed \(allFailureCount) times.
582+
Will exit with the first failure.
583+
584+
"""
585+
)
586+
guard
587+
let failure = results.first(where: { result in
588+
switch result {
589+
case .failure:
590+
return true
591+
case .success:
592+
return false
593+
}
594+
})
595+
else {
596+
throw MyError.unknownFailure
597+
}
598+
try failure.get()
527599
}
528600
}
529601

@@ -543,5 +615,6 @@ import Glibc
543615
case noPermissions = 6
544616
case invalidArgument = 101
545617
case buildFailed = 102
618+
case unknownFailure = 103
546619
}
547620
}

Plugins/BenchmarkHelpGenerator/BenchmarkHelpGenerator.swift

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,42 @@ struct Benchmark: AsyncParsableCommand {
153153
)
154154
var checkAbsolute = false
155155

156+
@Flag(
157+
name: .long,
158+
help: """
159+
Specifies that thresholds check command should skip loading benchmark targets.
160+
Use this flag to skip unnecessary building of benchmark targets and loading of benchmark results, to save time.
161+
This flag is specially useful when combined with static threshold files that contain the newly supported relative or range thresholds.
162+
With such a set up, you'll save the time needed to build the benchmark targets and the thresholds check operation
163+
will only read the threshold tolerance values from the static files.
164+
"""
165+
)
166+
var skipLoadingBenchmarks = false
167+
168+
@Option(
169+
name: .long,
170+
help: """
171+
The number of times to run each benchmark in thresholds update operation.
172+
This is only valid when --relative or --range are also specified.
173+
When combined with --relative or --range flags, this option will run the benchmarks multiple times to calculate
174+
relative or range thresholds, and each time it'll widen the threshold tolerances according to the new result.
175+
Defaults to 1.
176+
"""
177+
)
178+
var runCount: Int?
179+
180+
@Flag(
181+
name: .long,
182+
help: "Specifies that thresholds update command should output relative thresholds to the static files."
183+
)
184+
var relative = false
185+
186+
@Flag(
187+
name: .long,
188+
help: "Specifies that thresholds update command should output min-max range thresholds to the static files."
189+
)
190+
var range = false
191+
156192
@Option(
157193
name: .long,
158194
help:

0 commit comments

Comments
 (0)