@@ -28,14 +28,21 @@ pub(crate) struct NamedBench<'a, I, O> {
2828 pub fun : CallBench < ' a , I , O > ,
2929 pub num_group_iter : usize ,
3030 clock : Clock ,
31+ adjust_for_single_threaded_cpu_scheduling : bool ,
3132}
3233impl < ' a , I , O : OutputValue > NamedBench < ' a , I , O > {
33- pub fn new ( bench_id : BenchId , fun : CallBench < ' a , I , O > , num_group_iter : usize ) -> Self {
34+ pub fn new (
35+ bench_id : BenchId ,
36+ fun : CallBench < ' a , I , O > ,
37+ num_group_iter : usize ,
38+ adjust_for_single_threaded_cpu_scheduling : bool ,
39+ ) -> Self {
3440 Self {
3541 bench_id,
3642 fun,
3743 num_group_iter,
3844 clock : Clock :: new ( ) ,
45+ adjust_for_single_threaded_cpu_scheduling,
3946 }
4047 }
4148}
@@ -237,6 +244,11 @@ impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
237244 // Accumulate raw deltas and scale once at the end.
238245 // Scaling is linear, so `scale(sum(delta)) == sum(scale(delta))`.
239246 let mut sum_raw = 0u64 ;
247+ let mut adjuster = if self . adjust_for_single_threaded_cpu_scheduling {
248+ SingleThreadedCpuSchedulingAdjuster :: start ( & self . clock )
249+ } else {
250+ None
251+ } ;
240252 let mut res: Option < O > = None ;
241253 // In this mode, we measure each iteration separately to avoid destructor cost.
242254 // There may be some overhead, but it should be outweighed by benchmarks that allocate
@@ -255,16 +267,30 @@ impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
255267 res = Some ( val) ;
256268 }
257269 let sum_ns = self . clock . delta_as_nanos ( 0 , sum_raw) ;
258- let duration_ns = sum_ns / num_iter as u64 ;
270+ let adjusted_ns = adjuster
271+ . as_mut ( )
272+ . and_then ( |adjuster| adjuster. finish ( sum_ns, & self . clock ) )
273+ . unwrap_or ( sum_ns) ;
274+ let duration_ns = adjusted_ns / num_iter as u64 ;
259275 RunResult :: new ( duration_ns, res. unwrap ( ) )
260276 } else {
261277 let start = self . clock . raw ( ) ;
278+ let mut adjuster = if self . adjust_for_single_threaded_cpu_scheduling {
279+ SingleThreadedCpuSchedulingAdjuster :: start_with_wall ( start)
280+ } else {
281+ None
282+ } ;
262283 let mut res: Option < O > = None ;
263284 for _ in 0 ..num_iter {
264285 res = Some ( black_box ( ( self . fun ) ( input) ) ) ;
265286 }
266- let elapsed_ns = self . clock . delta_as_nanos ( start, self . clock . raw ( ) ) ;
267- let duration_ns = elapsed_ns / num_iter as u64 ;
287+ let end = self . clock . raw ( ) ;
288+ let elapsed_ns = self . clock . delta_as_nanos ( start, end) ;
289+ let adjusted_ns = adjuster
290+ . as_mut ( )
291+ . and_then ( |adjuster| adjuster. finish_with_wall ( elapsed_ns, end, & self . clock ) )
292+ . unwrap_or ( elapsed_ns) ;
293+ let duration_ns = adjusted_ns / num_iter as u64 ;
268294 RunResult :: new ( duration_ns, res. unwrap ( ) )
269295 } ;
270296
@@ -275,3 +301,70 @@ impl<'a, I, O: OutputValue> NamedBench<'a, I, O> {
275301 run_result
276302 }
277303}
304+
/// Adjusts measured wall time by subtracting time the single thread was not scheduled.
///
/// Uses wall time from `quanta::Clock` and per-thread CPU time from
/// `clock_gettime(CLOCK_THREAD_CPUTIME_ID)` on Linux. That clock reports
/// CPU time consumed by the calling thread only (does not advance while
/// the thread is off-CPU or blocked), so `wall - cpu` approximates time
/// spent descheduled. This is subtracted from the measured duration and
/// assumes a single-threaded benchmark.
struct SingleThreadedCpuSchedulingAdjuster {
    // Raw `quanta` wall-clock reading captured at measurement start.
    wall_start_raw: u64,
    // Thread CPU time in nanoseconds captured at measurement start.
    cpu_start_ns: u64,
}
317+
318+ impl SingleThreadedCpuSchedulingAdjuster {
319+ fn start ( clock : & Clock ) -> Option < Self > {
320+ Self :: start_with_wall ( clock. raw ( ) )
321+ }
322+
323+ fn start_with_wall ( wall_start_raw : u64 ) -> Option < Self > {
324+ let cpu_start_ns = thread_cpu_time_ns ( ) ?;
325+ Some ( Self {
326+ wall_start_raw,
327+ cpu_start_ns,
328+ } )
329+ }
330+
331+ fn finish ( & mut self , elapsed_ns : u64 , clock : & Clock ) -> Option < u64 > {
332+ self . finish_with_wall ( elapsed_ns, clock. raw ( ) , clock)
333+ }
334+
335+ fn finish_with_wall (
336+ & mut self ,
337+ elapsed_ns : u64 ,
338+ wall_end_raw : u64 ,
339+ clock : & Clock ,
340+ ) -> Option < u64 > {
341+ let cpu_end_ns = thread_cpu_time_ns ( ) ?;
342+ let wall_ns = clock. delta_as_nanos ( self . wall_start_raw , wall_end_raw) ;
343+ let cpu_ns = cpu_end_ns. saturating_sub ( self . cpu_start_ns ) ;
344+ // The difference between wall time and thread CPU time is time not scheduled.
345+ let unscheduled_ns = wall_ns. saturating_sub ( cpu_ns) ;
346+ // Subtract unscheduled time from the measured duration.
347+ Some ( elapsed_ns. saturating_sub ( unscheduled_ns) )
348+ }
349+ }
350+
351+ #[ cfg( target_os = "linux" ) ]
352+ fn thread_cpu_time_ns ( ) -> Option < u64 > {
353+ let mut ts = libc:: timespec {
354+ tv_sec : 0 ,
355+ tv_nsec : 0 ,
356+ } ;
357+ let res = unsafe { libc:: clock_gettime ( libc:: CLOCK_THREAD_CPUTIME_ID , & mut ts) } ;
358+ if res == 0 {
359+ let secs = ts. tv_sec as u64 ;
360+ let nanos = ts. tv_nsec as u64 ;
361+ Some ( secs. saturating_mul ( 1_000_000_000 ) . saturating_add ( nanos) )
362+ } else {
363+ None
364+ }
365+ }
366+
367+ #[ cfg( not( target_os = "linux" ) ) ]
368+ fn thread_cpu_time_ns ( ) -> Option < u64 > {
369+ None
370+ }
0 commit comments