add single threaded optimization

PSeitz · PSeitz · commit f9f4600eb6ab · 2025-12-21T19:29:31.000+08:00
diff --git a/Cargo.toml b/Cargo.toml
@@ -40,6 +40,7 @@ quanta = "0.12"
 
 [target.'cfg(target_os = "linux")'.dependencies]
 perf-event = { version = "0.4.8" }
+libc = "0.2"
 
 [features]
 real_blackbox = []
diff --git a/src/bench.rs b/src/bench.rs
@@ -28,14 +28,21 @@ pub(crate) struct NamedBench<'a, I, O> {
     pub fun: CallBench<'a, I, O>,
     pub num_group_iter: usize,
     clock: Clock,
+    adjust_for_single_threaded_cpu_scheduling: bool,
 }
 impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
-    pub fn new(bench_id: BenchId, fun: CallBench<'a, I, O>, num_group_iter: usize) -> Self {
+    pub fn new(
+        bench_id: BenchId,
+        fun: CallBench<'a, I, O>,
+        num_group_iter: usize,
+        adjust_for_single_threaded_cpu_scheduling: bool,
+    ) -> Self {
         Self {
             bench_id,
             fun,
             num_group_iter,
             clock: Clock::new(),
+            adjust_for_single_threaded_cpu_scheduling,
         }
     }
 }
@@ -237,6 +244,11 @@ impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
             // Accumulate raw deltas and scale once at the end.
             // Scaling is linear, so `scale(sum(delta)) == sum(scale(delta))`.
             let mut sum_raw = 0u64;
+            let mut adjuster = if self.adjust_for_single_threaded_cpu_scheduling {
+                SingleThreadedCpuSchedulingAdjuster::start(&self.clock)
+            } else {
+                None
+            };
             let mut res: Option<O> = None;
             // In this mode, we measure each iteration separately to avoid destructor cost.
             // There may be some overhead, but it should be outweighed by benchmarks that allocate
@@ -255,16 +267,30 @@ impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
                 res = Some(val);
             }
             let sum_ns = self.clock.delta_as_nanos(0, sum_raw);
-            let duration_ns = sum_ns / num_iter as u64;
+            let adjusted_ns = adjuster
+                .as_mut()
+                .and_then(|adjuster| adjuster.finish(sum_ns, &self.clock))
+                .unwrap_or(sum_ns);
+            let duration_ns = adjusted_ns / num_iter as u64;
             RunResult::new(duration_ns, res.unwrap())
         } else {
             let start = self.clock.raw();
+            let mut adjuster = if self.adjust_for_single_threaded_cpu_scheduling {
+                SingleThreadedCpuSchedulingAdjuster::start_with_wall(start)
+            } else {
+                None
+            };
             let mut res: Option<O> = None;
             for _ in 0..num_iter {
                 res = Some(black_box((self.fun)(input)));
             }
-            let elapsed_ns = self.clock.delta_as_nanos(start, self.clock.raw());
-            let duration_ns = elapsed_ns / num_iter as u64;
+            let end = self.clock.raw();
+            let elapsed_ns = self.clock.delta_as_nanos(start, end);
+            let adjusted_ns = adjuster
+                .as_mut()
+                .and_then(|adjuster| adjuster.finish_with_wall(elapsed_ns, end, &self.clock))
+                .unwrap_or(elapsed_ns);
+            let duration_ns = adjusted_ns / num_iter as u64;
             RunResult::new(duration_ns, res.unwrap())
         };
 
@@ -275,3 +301,70 @@ impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
         run_result
     }
 }
+
+/// Adjusts a measured duration by subtracting time the benchmark thread was not scheduled.
+///
+/// Wall time comes from raw `quanta::Clock` readings; CPU time comes from
+/// `thread_cpu_time_ns`, which on Linux reads `clock_gettime(CLOCK_THREAD_CPUTIME_ID)`.
+/// That clock counts CPU time consumed by the calling thread only, so `wall - cpu`
+/// approximates time spent descheduled or blocked. On other targets
+/// `thread_cpu_time_ns` returns `None`, so no adjuster is created. The correction
+/// assumes the benchmark does all of its work on the measuring thread.
+struct SingleThreadedCpuSchedulingAdjuster {
+    wall_start_raw: u64,
+    cpu_start_ns: u64,
+}
+
+impl SingleThreadedCpuSchedulingAdjuster {
+    /// Starts tracking, sampling the wall clock from `clock` at the time of the call.
+    ///
+    /// Returns `None` when the thread CPU time cannot be read.
+    fn start(clock: &Clock) -> Option<Self> {
+        let wall_now_raw = clock.raw();
+        Self::start_with_wall(wall_now_raw)
+    }
+
+    /// Starts tracking from an already captured raw wall-clock reading.
+    ///
+    /// Returns `None` when the thread CPU time cannot be read.
+    fn start_with_wall(wall_start_raw: u64) -> Option<Self> {
+        thread_cpu_time_ns().map(|cpu_start_ns| Self {
+            wall_start_raw,
+            cpu_start_ns,
+        })
+    }
+
+    /// Finishes tracking, sampling the wall clock from `clock` at the time of the call.
+    ///
+    /// Delegates to `finish_with_wall` with that reading.
+    fn finish(&mut self, elapsed_ns: u64, clock: &Clock) -> Option<u64> {
+        let wall_now_raw = clock.raw();
+        self.finish_with_wall(elapsed_ns, wall_now_raw, clock)
+    }
+
+    /// Returns `elapsed_ns` minus the estimated time this thread was not scheduled.
+    ///
+    /// The estimate is the wall time between the start reading and `wall_end_raw`,
+    /// minus the thread CPU time consumed since the start. Every subtraction
+    /// saturates at zero. Returns `None` when the thread CPU time cannot be read.
+    fn finish_with_wall(
+        &mut self,
+        elapsed_ns: u64,
+        wall_end_raw: u64,
+        clock: &Clock,
+    ) -> Option<u64> {
+        let cpu_now_ns = thread_cpu_time_ns()?;
+        let wall_spent_ns = clock.delta_as_nanos(self.wall_start_raw, wall_end_raw);
+        let cpu_spent_ns = cpu_now_ns.saturating_sub(self.cpu_start_ns);
+        // Wall time that was not spent on a CPU is treated as time descheduled.
+        let off_cpu_ns = wall_spent_ns.saturating_sub(cpu_spent_ns);
+        Some(elapsed_ns.saturating_sub(off_cpu_ns))
+    }
+}
+
+/// Returns the CPU time consumed so far by the calling thread, in nanoseconds.
+///
+/// Reads `CLOCK_THREAD_CPUTIME_ID` via `clock_gettime`. Returns `None` if the call
+/// fails or reports a negative field; the total saturates instead of overflowing.
+#[cfg(target_os = "linux")]
+fn thread_cpu_time_ns() -> Option<u64> {
+    let mut ts = libc::timespec {
+        tv_sec: 0,
+        tv_nsec: 0,
+    };
+    // SAFETY: `ts` is a live, exclusively borrowed `timespec`, and `clock_gettime`
+    // only writes through the pointer for the duration of the call.
+    let res = unsafe { libc::clock_gettime(libc::CLOCK_THREAD_CPUTIME_ID, &mut ts) };
+    if res != 0 {
+        return None;
+    }
+    // Checked conversions: a negative field yields `None` instead of wrapping to a
+    // huge value the way an `as u64` cast would.
+    let secs = u64::try_from(ts.tv_sec).ok()?;
+    let nanos = u64::try_from(ts.tv_nsec).ok()?;
+    Some(secs.saturating_mul(1_000_000_000).saturating_add(nanos))
+}
+
+#[cfg(not(target_os = "linux"))]
+fn thread_cpu_time_ns() -> Option<u64> {
+    None // Thread CPU time is only read on Linux; other targets report it as unavailable.
+}
diff --git a/src/bench_group.rs b/src/bench_group.rs
@@ -63,6 +63,7 @@ impl<'a, 'runner> BenchGroup<'a, 'runner> {
             self.get_bench_id(bench_name.into()),
             Box::new(fun),
             self.runner.config().get_num_iter_for_group(),
+            self.runner.config.adjust_for_single_threaded_cpu_scheduling,
         );
         self.register_named_with_input(bench, input);
     }
@@ -79,6 +80,7 @@ impl<'a, 'runner> BenchGroup<'a, 'runner> {
             self.get_bench_id(bench_name),
             Box::new(fun),
             self.runner.config().get_num_iter_for_group(),
+            self.runner.config.adjust_for_single_threaded_cpu_scheduling,
         );
 
         self.register_named_with_input(bench, &());
diff --git a/src/bench_input_group.rs b/src/bench_input_group.rs
@@ -100,8 +100,12 @@ impl<I: 'static, O: OutputValue + 'static> InputGroup<I, O> {
             let bench_id = BenchId::from_bench_name(name.clone())
                 .runner_name(self.runner.name.as_deref())
                 .group_name(Some(input.name.clone()));
-            let named_bench: NamedBench<'static, I, O> =
-                NamedBench::new(bench_id, Box::new(fun.clone()), num_iter_for_group);
+            let named_bench: NamedBench<'static, I, O> = NamedBench::new(
+                bench_id,
+                Box::new(fun.clone()),
+                num_iter_for_group,
+                self.runner.config.adjust_for_single_threaded_cpu_scheduling,
+            );
 
             self.benches_per_input[ord].push(named_bench);
         }
diff --git a/src/bench_runner.rs b/src/bench_runner.rs
@@ -106,6 +106,7 @@ impl BenchRunner {
             bench_id,
             Box::new(f),
             self.config().get_num_iter_for_group(),
+            self.config.adjust_for_single_threaded_cpu_scheduling,
         );
         let bundle = InputWithBenchmark::new(
             EMPTY_INPUT,
diff --git a/src/config.rs b/src/config.rs
@@ -17,6 +17,10 @@ pub struct Config {
     /// Manually set the number of iterations the benchmark group is run.
     ///
     pub num_iter_group: Option<usize>,
+    /// Adjust duration by subtracting time the thread was not scheduled (Linux only).
+    /// Intended for single-threaded, single-benchmark runs.
+    /// Assumes a single thread is doing work during the measurement.
+    pub adjust_for_single_threaded_cpu_scheduling: bool,
 }
 
 impl Default for Config {
@@ -29,6 +33,7 @@ impl Default for Config {
             verbose,
             num_iter_bench: None,
             num_iter_group: None,
+            adjust_for_single_threaded_cpu_scheduling: false,
         }
     }
 }
@@ -84,6 +89,14 @@ impl Config {
         self.interleave = interleave;
         self
     }
+
+    /// Adjust duration by subtracting time the thread was not scheduled (Linux only).
+    /// Intended for single-threaded, single-benchmark runs.
+    /// Assumes a single thread is doing work during the measurement.
+    pub fn set_adjust_for_single_threaded_cpu_scheduling(&mut self, enabled: bool) -> &mut Self {
+        self.adjust_for_single_threaded_cpu_scheduling = enabled;
+        self
+    }
 }
 
 pub(crate) fn parse_args() -> Config {

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,10 @@ pub struct Config {`
`17`	`17`	`/// Manually set the number of iterations the benchmark group is run.`
`18`	`18`	`///`
`19`	`19`	`pub num_iter_group: Option<usize>,`
	`20`	`+ /// Adjust duration by subtracting time the thread was not scheduled (Linux only).`
	`21`	`+ /// Intended for single-threaded, single-benchmark runs.`
	`22`	`+ /// Assumes a single thread is doing work during the measurement.`
	`23`	`+ pub adjust_for_single_threaded_cpu_scheduling: bool,`
`20`	`24`	`}`
`21`	`25`
`22`	`26`	`impl Default for Config {`
`@@ -29,6 +33,7 @@ impl Default for Config {`
`29`	`33`	`verbose,`
`30`	`34`	`num_iter_bench: None,`
`31`	`35`	`num_iter_group: None,`
	`36`	`+ adjust_for_single_threaded_cpu_scheduling: false,`
`32`	`37`	`}`
`33`	`38`	`}`
`34`	`39`	`}`
`@@ -84,6 +89,14 @@ impl Config {`
`84`	`89`	`self.interleave = interleave;`
`85`	`90`	`self`
`86`	`91`	`}`
	`92`	`+`
	`93`	`+ /// Adjust duration by subtracting time the thread was not scheduled (Linux only).`
	`94`	`+ /// Intended for single-threaded, single-benchmark runs.`
	`95`	`+ /// Assumes a single thread is doing work during the measurement.`
	`96`	`+ pub fn set_adjust_for_single_threaded_cpu_scheduling(&mut self, enabled: bool) -> &mut Self {`
	`97`	`+ self.adjust_for_single_threaded_cpu_scheduling = enabled;`
	`98`	`+ self`
	`99`	`+ }`
`87`	`100`	`}`
`88`	`101`
`89`	`102`	`pub(crate) fn parse_args() -> Config {`