Skip to content

Commit 58b1962

Browse files
committed
Implement running 'benchmark thresholds update' multiple times
1 parent ff9629b commit 58b1962

11 files changed

+517
-200
lines changed

Plugins/BenchmarkCommandPlugin/BenchmarkCommandPlugin.swift

Lines changed: 122 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ import Glibc
4444
let skipLoadingBenchmarks = argumentExtractor.extractFlag(named: "skip-loading-benchmark-targets")
4545
let checkAbsoluteThresholds =
4646
checkAbsoluteThresholdsPath.count > 0 ? 1 : argumentExtractor.extractFlag(named: "check-absolute")
47+
let runCount = argumentExtractor.extractOption(named: "run-count")
48+
let relative = argumentExtractor.extractFlag(named: "relative")
49+
let range = argumentExtractor.extractFlag(named: "range")
4750
let groupingToUse = argumentExtractor.extractOption(named: "grouping")
4851
let metricsToUse = argumentExtractor.extractOption(named: "metric")
4952
let timeUnits = argumentExtractor.extractOption(named: "time-units")
@@ -234,6 +237,7 @@ import Glibc
234237
throw MyError.invalidArgument
235238
}
236239

240+
var totalRunCount = 1
237241
var skipLoadingBenchmarksFlagIsValid = skipLoadingBenchmarks == 0
238242
if commandToPerform == .thresholds {
239243
guard positionalArguments.count > 0,
@@ -264,10 +268,27 @@ import Glibc
264268
)
265269
throw MyError.invalidArgument
266270
}
267-
if positionalArguments.count > 0 {
271+
let usesExistingBaseline = positionalArguments.count > 0
272+
if usesExistingBaseline {
268273
shouldBuildTargets = false
269274
}
270-
break
275+
let requestedRunCount = runCount.first.flatMap { Int($0) } ?? 1
276+
/// These update the run count to 5 by default if it's set to 1.
277+
/// Using relative/range flags doesn't mean anything if we're not running multiple times.
278+
/// The benchmarks will need to be run multiple times in order to be able to calculate a
279+
/// relative/range of thresholds which satisfy all benchmark runs.
280+
if relative > 0 {
281+
args.append("--wants-relative-thresholds")
282+
if !usesExistingBaseline {
283+
totalRunCount = requestedRunCount < 2 ? 5 : requestedRunCount
284+
}
285+
}
286+
if range > 0 {
287+
args.append("--wants-range-thresholds")
288+
if !usesExistingBaseline {
289+
totalRunCount = requestedRunCount < 2 ? 5 : requestedRunCount
290+
}
291+
}
271292
case .check:
272293
skipLoadingBenchmarksFlagIsValid = true
273294
shouldBuildTargets = skipLoadingBenchmarks == 0
@@ -477,53 +498,108 @@ import Glibc
477498

478499
var failedBenchmarkCount = 0
479500

480-
try withCStrings(args) { cArgs in
481-
if debug > 0 {
482-
print("To debug, start \(benchmarkToolName) in LLDB using:")
483-
print("lldb \(benchmarkTool.string)")
484-
print("")
485-
print("Then launch \(benchmarkToolName) with:")
486-
print("run \(args.dropFirst().joined(separator: " "))")
487-
print("")
488-
return
489-
}
501+
var allFailureCount = 0
502+
let results: [Result<Void, Error>] = (0..<max(totalRunCount, 1))
503+
.map { runIdx in
504+
// If we're running multiple times, we need to add the run count to the arguments
505+
var args = args
506+
if totalRunCount > 1 {
507+
args += ["--run-number", "\(runIdx + 1)"]
508+
if quietRunning == 0 {
509+
print(
510+
"""
490511
491-
var pid: pid_t = 0
492-
var status = posix_spawn(&pid, benchmarkTool.string, nil, nil, cArgs, environ)
493-
494-
if status == 0 {
495-
if waitpid(pid, &status, 0) != -1 {
496-
// Ok, this sucks, but there is no way to get a C support target for plugins and
497-
// the way the status is extracted portably is with macros - so we just need to
498-
// reimplement the logic here in Swift according to the waitpid man page to
499-
// get some nicer feedback on failure reason.
500-
guard let waitStatus = ExitCode(rawValue: (status & 0xFF00) >> 8) else {
501-
print("One or more benchmarks returned an unexpected return code \(status)")
502-
throw MyError.benchmarkUnexpectedReturnCode
512+
Running the command multiple times, round \(runIdx + 1) of \(totalRunCount)...
513+
"""
514+
)
503515
}
504-
switch waitStatus {
505-
case .success:
506-
break
507-
case .baselineNotFound:
508-
throw MyError.baselineNotFound
509-
case .genericFailure:
510-
print("One or more benchmark suites crashed during runtime.")
511-
throw MyError.benchmarkCrashed
512-
case .thresholdRegression:
513-
throw MyError.benchmarkThresholdRegression
514-
case .thresholdImprovement:
515-
throw MyError.benchmarkThresholdImprovement
516-
case .benchmarkJobFailed:
517-
failedBenchmarkCount += 1
518-
case .noPermissions:
519-
throw MyError.noPermissions
516+
}
517+
518+
return Result<Void, Error> {
519+
try withCStrings(args) { cArgs in
520+
/// We'll decrement this in the success path
521+
allFailureCount += 1
522+
523+
if debug > 0 {
524+
print("To debug, start \(benchmarkToolName) in LLDB using:")
525+
print("lldb \(benchmarkTool.string)")
526+
print("")
527+
print("Then launch \(benchmarkToolName) with:")
528+
print("run \(args.dropFirst().joined(separator: " "))")
529+
print("")
530+
return
531+
}
532+
533+
var pid: pid_t = 0
534+
var status = posix_spawn(&pid, benchmarkTool.string, nil, nil, cArgs, environ)
535+
536+
if status == 0 {
537+
if waitpid(pid, &status, 0) != -1 {
538+
// Ok, this sucks, but there is no way to get a C support target for plugins and
539+
// the way the status is extracted portably is with macros - so we just need to
540+
// reimplement the logic here in Swift according to the waitpid man page to
541+
// get some nicer feedback on failure reason.
542+
guard let waitStatus = ExitCode(rawValue: (status & 0xFF00) >> 8) else {
543+
print("One or more benchmarks returned an unexpected return code \(status)")
544+
throw MyError.benchmarkUnexpectedReturnCode
545+
}
546+
switch waitStatus {
547+
case .success:
548+
allFailureCount -= 1
549+
case .baselineNotFound:
550+
throw MyError.baselineNotFound
551+
case .genericFailure:
552+
print("One or more benchmark suites crashed during runtime.")
553+
throw MyError.benchmarkCrashed
554+
case .thresholdRegression:
555+
throw MyError.benchmarkThresholdRegression
556+
case .thresholdImprovement:
557+
throw MyError.benchmarkThresholdImprovement
558+
case .benchmarkJobFailed:
559+
failedBenchmarkCount += 1
560+
case .noPermissions:
561+
throw MyError.noPermissions
562+
}
563+
} else {
564+
print(
565+
"waitpid() for pid \(pid) returned a non-zero exit code \(status), errno = \(errno)"
566+
)
567+
exit(errno)
568+
}
569+
} else {
570+
print("Failed to run BenchmarkTool, posix_spawn() returned [\(status)]")
571+
}
520572
}
521-
} else {
522-
print("waitpid() for pid \(pid) returned a non-zero exit code \(status), errno = \(errno)")
523-
exit(errno)
524573
}
525-
} else {
526-
print("Failed to run BenchmarkTool, posix_spawn() returned [\(status)]")
574+
}
575+
576+
switch results.count {
577+
case ...0:
578+
throw MyError.unknownFailure
579+
case 1:
580+
try results[0].get()
581+
default:
582+
if allFailureCount > 0 {
583+
print(
584+
"""
585+
Ran BenchmarkTool \(results.count) times, but it failed \(allFailureCount) times.
586+
Will exit with the first failure.
587+
588+
"""
589+
)
590+
guard
591+
let failure = results.first(where: { result in
592+
switch result {
593+
case .failure:
594+
return true
595+
case .success:
596+
return false
597+
}
598+
})
599+
else {
600+
throw MyError.unknownFailure
601+
}
602+
try failure.get()
527603
}
528604
}
529605

@@ -543,5 +619,6 @@ import Glibc
543619
case noPermissions = 6
544620
case invalidArgument = 101
545621
case buildFailed = 102
622+
case unknownFailure = 103
546623
}
547624
}

Plugins/BenchmarkHelpGenerator/BenchmarkHelpGenerator.swift

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,42 @@ struct Benchmark: AsyncParsableCommand {
153153
)
154154
var checkAbsolute = false
155155

156+
@Flag(
157+
name: .long,
158+
help: """
159+
Specifies that thresholds check command should skip loading benchmark targets.
160+
Use this flag to skip unnecessary building of benchmark targets and loading of benchmark results, to save time.
161+
This flag is specially useful when combined with static threshold files that contain the newly supported relative or range thresholds.
162+
With such a set up, you'll save the time needed to build the benchmark targets and the thresholds check operation
163+
will only read the threshold tolerance values from the static files.
164+
"""
165+
)
166+
var skipLoadingBenchmarks = false
167+
168+
@Option(
169+
name: .long,
170+
help: """
171+
The number of times to run each benchmark in thresholds update operation.
172+
This is only valid when --relative or --range are also specified.
173+
When combined with --relative or --range flags, this option will run the benchmarks multiple times to calculate
174+
relative or range thresholds, and each time it'll widen the threshold tolerances according to the new result.
175+
Defaults to 1.
176+
"""
177+
)
178+
var runCount: Int?
179+
180+
@Flag(
181+
name: .long,
182+
help: "Specifies that thresholds update command should output relative thresholds to the static files."
183+
)
184+
var relative = false
185+
186+
@Flag(
187+
name: .long,
188+
help: "Specifies that thresholds update command should output min-max range thresholds to the static files."
189+
)
190+
var range = false
191+
156192
@Option(
157193
name: .long,
158194
help:

0 commit comments

Comments
 (0)