Skip to content

Commit f9f4600

Browse files
committed
add single threaded optimization
1 parent e85fdc3 commit f9f4600

File tree

6 files changed

+120
-6
lines changed

6 files changed

+120
-6
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ quanta = "0.12"
4040

4141
[target.'cfg(target_os = "linux")'.dependencies]
4242
perf-event = { version = "0.4.8" }
43+
libc = "0.2"
4344

4445
[features]
4546
real_blackbox = []

src/bench.rs

Lines changed: 97 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,21 @@ pub(crate) struct NamedBench<'a, I, O> {
2828
pub fun: CallBench<'a, I, O>,
2929
pub num_group_iter: usize,
3030
clock: Clock,
31+
adjust_for_single_threaded_cpu_scheduling: bool,
3132
}
3233
impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
33-
pub fn new(bench_id: BenchId, fun: CallBench<'a, I, O>, num_group_iter: usize) -> Self {
34+
pub fn new(
35+
bench_id: BenchId,
36+
fun: CallBench<'a, I, O>,
37+
num_group_iter: usize,
38+
adjust_for_single_threaded_cpu_scheduling: bool,
39+
) -> Self {
3440
Self {
3541
bench_id,
3642
fun,
3743
num_group_iter,
3844
clock: Clock::new(),
45+
adjust_for_single_threaded_cpu_scheduling,
3946
}
4047
}
4148
}
@@ -237,6 +244,11 @@ impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
237244
// Accumulate raw deltas and scale once at the end.
238245
// Scaling is linear, so `scale(sum(delta)) == sum(scale(delta))`.
239246
let mut sum_raw = 0u64;
247+
let mut adjuster = if self.adjust_for_single_threaded_cpu_scheduling {
248+
SingleThreadedCpuSchedulingAdjuster::start(&self.clock)
249+
} else {
250+
None
251+
};
240252
let mut res: Option<O> = None;
241253
// In this mode, we measure each iteration separately to avoid destructor cost.
242254
// There may be some overhead, but it should be outweighed by benchmarks that allocate
@@ -255,16 +267,30 @@ impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
255267
res = Some(val);
256268
}
257269
let sum_ns = self.clock.delta_as_nanos(0, sum_raw);
258-
let duration_ns = sum_ns / num_iter as u64;
270+
let adjusted_ns = adjuster
271+
.as_mut()
272+
.and_then(|adjuster| adjuster.finish(sum_ns, &self.clock))
273+
.unwrap_or(sum_ns);
274+
let duration_ns = adjusted_ns / num_iter as u64;
259275
RunResult::new(duration_ns, res.unwrap())
260276
} else {
261277
let start = self.clock.raw();
278+
let mut adjuster = if self.adjust_for_single_threaded_cpu_scheduling {
279+
SingleThreadedCpuSchedulingAdjuster::start_with_wall(start)
280+
} else {
281+
None
282+
};
262283
let mut res: Option<O> = None;
263284
for _ in 0..num_iter {
264285
res = Some(black_box((self.fun)(input)));
265286
}
266-
let elapsed_ns = self.clock.delta_as_nanos(start, self.clock.raw());
267-
let duration_ns = elapsed_ns / num_iter as u64;
287+
let end = self.clock.raw();
288+
let elapsed_ns = self.clock.delta_as_nanos(start, end);
289+
let adjusted_ns = adjuster
290+
.as_mut()
291+
.and_then(|adjuster| adjuster.finish_with_wall(elapsed_ns, end, &self.clock))
292+
.unwrap_or(elapsed_ns);
293+
let duration_ns = adjusted_ns / num_iter as u64;
268294
RunResult::new(duration_ns, res.unwrap())
269295
};
270296

@@ -275,3 +301,70 @@ impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
275301
run_result
276302
}
277303
}
304+
305+
/// Corrects a wall-clock measurement for periods in which the measuring
/// thread was not running on a CPU.
///
/// Wall time comes from `quanta::Clock`; thread CPU time comes from
/// `clock_gettime(CLOCK_THREAD_CPUTIME_ID)` on Linux. That clock counts only
/// CPU time consumed by the calling thread (it does not advance while the
/// thread is off-CPU or blocked), so `wall - cpu` approximates the time spent
/// descheduled. That difference is removed from the measured duration, which
/// assumes the benchmark is single-threaded.
struct SingleThreadedCpuSchedulingAdjuster {
    /// Raw wall-clock reading taken when the measurement window opened.
    wall_start_raw: u64,
    /// Thread CPU time, in nanoseconds, taken when the measurement window opened.
    cpu_start_ns: u64,
}
317+
318+
impl SingleThreadedCpuSchedulingAdjuster {
319+
fn start(clock: &Clock) -> Option<Self> {
320+
Self::start_with_wall(clock.raw())
321+
}
322+
323+
fn start_with_wall(wall_start_raw: u64) -> Option<Self> {
324+
let cpu_start_ns = thread_cpu_time_ns()?;
325+
Some(Self {
326+
wall_start_raw,
327+
cpu_start_ns,
328+
})
329+
}
330+
331+
fn finish(&mut self, elapsed_ns: u64, clock: &Clock) -> Option<u64> {
332+
self.finish_with_wall(elapsed_ns, clock.raw(), clock)
333+
}
334+
335+
fn finish_with_wall(
336+
&mut self,
337+
elapsed_ns: u64,
338+
wall_end_raw: u64,
339+
clock: &Clock,
340+
) -> Option<u64> {
341+
let cpu_end_ns = thread_cpu_time_ns()?;
342+
let wall_ns = clock.delta_as_nanos(self.wall_start_raw, wall_end_raw);
343+
let cpu_ns = cpu_end_ns.saturating_sub(self.cpu_start_ns);
344+
// The difference between wall time and thread CPU time is time not scheduled.
345+
let unscheduled_ns = wall_ns.saturating_sub(cpu_ns);
346+
// Subtract unscheduled time from the measured duration.
347+
Some(elapsed_ns.saturating_sub(unscheduled_ns))
348+
}
349+
}
350+
351+
#[cfg(target_os = "linux")]
352+
fn thread_cpu_time_ns() -> Option<u64> {
353+
let mut ts = libc::timespec {
354+
tv_sec: 0,
355+
tv_nsec: 0,
356+
};
357+
let res = unsafe { libc::clock_gettime(libc::CLOCK_THREAD_CPUTIME_ID, &mut ts) };
358+
if res == 0 {
359+
let secs = ts.tv_sec as u64;
360+
let nanos = ts.tv_nsec as u64;
361+
Some(secs.saturating_mul(1_000_000_000).saturating_add(nanos))
362+
} else {
363+
None
364+
}
365+
}
366+
367+
/// Fallback for targets other than Linux: no thread CPU clock is queried,
/// so the time is always reported as unavailable.
#[cfg(not(target_os = "linux"))]
fn thread_cpu_time_ns() -> Option<u64> {
    None
}

src/bench_group.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ impl<'a, 'runner> BenchGroup<'a, 'runner> {
6363
self.get_bench_id(bench_name.into()),
6464
Box::new(fun),
6565
self.runner.config().get_num_iter_for_group(),
66+
self.runner.config.adjust_for_single_threaded_cpu_scheduling,
6667
);
6768
self.register_named_with_input(bench, input);
6869
}
@@ -79,6 +80,7 @@ impl<'a, 'runner> BenchGroup<'a, 'runner> {
7980
self.get_bench_id(bench_name),
8081
Box::new(fun),
8182
self.runner.config().get_num_iter_for_group(),
83+
self.runner.config.adjust_for_single_threaded_cpu_scheduling,
8284
);
8385

8486
self.register_named_with_input(bench, &());

src/bench_input_group.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,12 @@ impl<I: 'static, O: OutputValue + 'static> InputGroup<I, O> {
100100
let bench_id = BenchId::from_bench_name(name.clone())
101101
.runner_name(self.runner.name.as_deref())
102102
.group_name(Some(input.name.clone()));
103-
let named_bench: NamedBench<'static, I, O> =
104-
NamedBench::new(bench_id, Box::new(fun.clone()), num_iter_for_group);
103+
let named_bench: NamedBench<'static, I, O> = NamedBench::new(
104+
bench_id,
105+
Box::new(fun.clone()),
106+
num_iter_for_group,
107+
self.runner.config.adjust_for_single_threaded_cpu_scheduling,
108+
);
105109

106110
self.benches_per_input[ord].push(named_bench);
107111
}

src/bench_runner.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ impl BenchRunner {
106106
bench_id,
107107
Box::new(f),
108108
self.config().get_num_iter_for_group(),
109+
self.config.adjust_for_single_threaded_cpu_scheduling,
109110
);
110111
let bundle = InputWithBenchmark::new(
111112
EMPTY_INPUT,

src/config.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ pub struct Config {
1717
/// Manually set the number of iterations the benchmark group is run.
1818
///
1919
pub num_iter_group: Option<usize>,
20+
/// Adjust duration by subtracting time the thread was not scheduled (Linux only).
21+
/// Intended for single-threaded, single-benchmark runs.
22+
/// Assumes a single thread is doing work during the measurement.
23+
pub adjust_for_single_threaded_cpu_scheduling: bool,
2024
}
2125

2226
impl Default for Config {
@@ -29,6 +33,7 @@ impl Default for Config {
2933
verbose,
3034
num_iter_bench: None,
3135
num_iter_group: None,
36+
adjust_for_single_threaded_cpu_scheduling: false,
3237
}
3338
}
3439
}
@@ -84,6 +89,14 @@ impl Config {
8489
self.interleave = interleave;
8590
self
8691
}
92+
93+
/// Adjust duration by subtracting time the thread was not scheduled (Linux only).
94+
/// Intended for single-threaded, single-benchmark runs.
95+
/// Assumes a single thread is doing work during the measurement.
96+
pub fn set_adjust_for_single_threaded_cpu_scheduling(&mut self, enabled: bool) -> &mut Self {
97+
self.adjust_for_single_threaded_cpu_scheduling = enabled;
98+
self
99+
}
87100
}
88101

89102
pub(crate) fn parse_args() -> Config {

0 commit comments

Comments
 (0)