Update summarize/effect-size to use engine flags

posborne · posborne · commit 989927e6470a · 2025-12-03T21:03:08.000Z
Previously, recordings were updated to capture
engine flag information. This change updates the code
used for summaries and effect-size analysis to take those
flags into account, so that engine/flag combinations can be
compared against each other. This is useful for a class of
comparisons, such as the same engine run with different flags.
diff --git a/crates/analysis/src/effect_size.rs b/crates/analysis/src/effect_size.rs
@@ -1,7 +1,7 @@
 use crate::keys::KeyBuilder;
 use anyhow::Result;
 use sightglass_data::{EffectSize, Measurement, Phase, Summary};
-use std::{collections::BTreeSet, io::Write};
+use std::{borrow::Cow, collections::BTreeSet, io::Write};
 
 /// Find the effect size (and confidence interval) of between two different
 /// engines (i.e. two different commits of Wasmtime).
@@ -25,14 +25,20 @@ pub fn calculate<'a>(
         significance_level,
     );
 
-    let keys = KeyBuilder::all().engine(false).keys(measurements);
+    let keys = KeyBuilder::all()
+        .engine(false)
+        .engine_flags(false)
+        .keys(measurements);
     let mut results = Vec::with_capacity(keys.len());
 
     for key in keys {
         let key_measurements: Vec<_> = measurements.iter().filter(|m| key.matches(m)).collect();
 
         // NB: `BTreeSet` so they're always sorted.
-        let engines: BTreeSet<_> = key_measurements.iter().map(|m| &m.engine).collect();
+        let engines: BTreeSet<_> = key_measurements
+            .iter()
+            .map(|m| (&m.engine, &m.engine_flags))
+            .collect();
         anyhow::ensure!(
             engines.len() == 2,
             "Can only test significance between exactly two different engines. Found {} \
@@ -41,17 +47,17 @@ pub fn calculate<'a>(
         );
 
         let mut engines = engines.into_iter();
-        let engine_a = engines.next().unwrap();
-        let engine_b = engines.next().unwrap();
+        let (engine_a, engine_a_flags) = engines.next().unwrap();
+        let (engine_b, engine_b_flags) = engines.next().unwrap();
 
         let a: behrens_fisher::Stats = key_measurements
             .iter()
-            .filter(|m| m.engine.as_ref() == engine_a)
+            .filter(|m| m.engine.as_ref() == engine_a && &m.engine_flags == engine_a_flags)
             .map(|m| m.count as f64)
             .collect();
         let b: behrens_fisher::Stats = key_measurements
             .iter()
-            .filter(|m| m.engine.as_ref() == engine_b)
+            .filter(|m| m.engine.as_ref() == engine_b && &m.engine_flags == engine_b_flags)
             .map(|m| m.count as f64)
             .collect();
 
@@ -62,8 +68,10 @@ pub fn calculate<'a>(
             phase: key.phase.unwrap(),
             event: key.event.unwrap(),
             a_engine: engine_a.clone(),
+            a_engine_flags: engine_a_flags.clone(),
             a_mean: a.mean,
             b_engine: engine_b.clone(),
+            b_engine_flags: engine_b_flags.clone(),
             b_mean: b.mean,
             significance_level,
             half_width_confidence_interval: ci,
@@ -73,6 +81,18 @@ pub fn calculate<'a>(
     Ok(results)
 }
 
+fn engine_label(engine: &str, engine_flags: &Option<Cow<str>>) -> String {
+    format!(
+        "{}{}",
+        engine,
+        if let Some(ef) = engine_flags {
+            format!(" ({ef})")
+        } else {
+            "".into()
+        }
+    )
+}
+
 /// Write a vector of [EffectSize] structures to the passed `output_file` in human-readable form.
 /// The `summaries` are needed
 pub fn write(
@@ -100,22 +120,50 @@ pub fn write(
         )?;
         writeln!(output_file)?;
 
+        let end_of_shared_prefix = |astr: &str, bstr: &str| {
+            astr.char_indices()
+                .zip(bstr.char_indices())
+                .find_map(|((i, a), (j, b))| {
+                    if a == b {
+                        None
+                    } else {
+                        debug_assert_eq!(i, j);
+                        Some(i)
+                    }
+                })
+                .unwrap_or(0)
+        };
+
         // For readability, trim the shared prefix from our two engine names.
-        let end_of_shared_prefix = effect_size
-            .a_engine
-            .char_indices()
-            .zip(effect_size.b_engine.char_indices())
-            .find_map(|((i, a), (j, b))| {
-                if a == b {
-                    None
-                } else {
-                    debug_assert_eq!(i, j);
-                    Some(i)
-                }
-            })
-            .unwrap_or(0);
-        let a_engine = &effect_size.a_engine[end_of_shared_prefix..];
-        let b_engine = &effect_size.b_engine[end_of_shared_prefix..];
+        //
+        // Furthermore, there are a few special cases:
+        // 1. If the engines are the same, show just the flags.
+        // 2. If not, show the computed full label with common prefix removed.
+        let (a_eng_label, b_eng_label) = if effect_size.a_engine == effect_size.b_engine {
+            (
+                effect_size
+                    .a_engine_flags
+                    .as_ref()
+                    .map(|ref ef| ef.to_string())
+                    .unwrap_or_else(|| "(no flags)".into())
+                    .to_string(),
+                effect_size
+                    .b_engine_flags
+                    .as_ref()
+                    .map(|ref ef| ef.to_string())
+                    .unwrap_or_else(|| "(no flags)".into())
+                    .to_string(),
+            )
+        } else {
+            let a_label = engine_label(&effect_size.a_engine, &effect_size.a_engine_flags);
+            let b_label = engine_label(&effect_size.b_engine, &effect_size.b_engine_flags);
+            let idx_end_of_shared = end_of_shared_prefix(&a_label, &b_label);
+
+            (
+                a_label[idx_end_of_shared..].into(),
+                b_label[idx_end_of_shared..].into(),
+            )
+        };
 
         if effect_size.is_significant() {
             writeln!(
@@ -132,9 +180,7 @@ pub fn write(
                 let ratio_ci = effect_size.half_width_confidence_interval / effect_size.a_mean;
                 writeln!(
                     output_file,
-                    "  {a_engine} is {ratio_min:.2}x to {ratio_max:.2}x faster than {b_engine}!",
-                    a_engine = a_engine,
-                    b_engine = b_engine,
+                    "  {a_eng_label} is {ratio_min:.2}x to {ratio_max:.2}x faster than {b_eng_label}!",
                     ratio_min = ratio - ratio_ci,
                     ratio_max = ratio + ratio_ci,
                 )?;
@@ -143,9 +189,7 @@ pub fn write(
                 let ratio_ci = effect_size.half_width_confidence_interval / effect_size.b_mean;
                 writeln!(
                     output_file,
-                    "  {b_engine} is {ratio_min:.2}x to {ratio_max:.2}x faster than {a_engine}!",
-                    a_engine = a_engine,
-                    b_engine = b_engine,
+                    "  {b_eng_label} is {ratio_min:.2}x to {ratio_max:.2}x faster than {a_eng_label}!",
                     ratio_min = ratio - ratio_ci,
                     ratio_max = ratio + ratio_ci,
                 )?;
@@ -155,39 +199,49 @@ pub fn write(
         }
         writeln!(output_file)?;
 
-        let get_summary = |engine: &str, wasm: &str, phase: Phase, event: &str| {
+        let get_summary = |engine: &str,
+                           engine_flags: Option<Cow<str>>,
+                           wasm: &str,
+                           phase: Phase,
+                           event: &str| {
             // TODO this sorting is not using `arch` which is not guaranteed to be the same in
             // result sets; potentially this could re-use `Key` functionality.
             summaries
                 .iter()
                 .find(|s| {
-                    s.engine == engine && s.wasm == wasm && s.phase == phase && s.event == event
+                    s.engine == engine
+                        && s.engine_flags == engine_flags
+                        && s.wasm == wasm
+                        && s.phase == phase
+                        && s.event == event
                 })
                 .unwrap()
         };
 
         let a_summary = get_summary(
             &effect_size.a_engine,
+            effect_size.a_engine_flags,
             &effect_size.wasm,
             effect_size.phase,
             &effect_size.event,
         );
         writeln!(
             output_file,
             "  [{} {:.2} {}] {}",
-            a_summary.min, a_summary.mean, a_summary.max, a_engine,
+            a_summary.min, a_summary.mean, a_summary.max, a_eng_label,
         )?;
 
         let b_summary = get_summary(
             &effect_size.b_engine,
+            effect_size.b_engine_flags,
             &effect_size.wasm,
             effect_size.phase,
             &effect_size.event,
         );
         writeln!(
             output_file,
             "  [{} {:.2} {}] {}",
-            b_summary.min, b_summary.mean, b_summary.max, b_engine,
+            b_summary.min, b_summary.mean, b_summary.max, b_eng_label,
         )?;
     }
 
diff --git a/crates/analysis/src/keys.rs b/crates/analysis/src/keys.rs
@@ -6,6 +6,7 @@ use std::{borrow::Cow, collections::BTreeSet};
 pub struct KeyBuilder {
     arch: bool,
     engine: bool,
+    engine_flags: bool,
     wasm: bool,
     phase: bool,
     event: bool,
@@ -20,6 +21,7 @@ impl KeyBuilder {
             wasm: true,
             phase: true,
             event: true,
+            engine_flags: true,
         }
     }
 
@@ -31,6 +33,7 @@ impl KeyBuilder {
             wasm: false,
             phase: false,
             event: false,
+            engine_flags: false,
         }
     }
 
@@ -52,6 +55,12 @@ impl KeyBuilder {
         self
     }
 
+    /// Whether to group keys by engine flags or not.
+    pub fn engine_flags(mut self, engine_flags: bool) -> Self {
+        self.engine_flags = engine_flags;
+        self
+    }
+
     /// Whether to group keys by phase or not.
     pub fn phase(mut self, phase: bool) -> Self {
         self.phase = phase;
@@ -72,6 +81,11 @@ impl KeyBuilder {
             .map(|m| Key {
                 arch: if self.arch { Some(m.arch) } else { None },
                 engine: if self.engine { Some(m.engine) } else { None },
+                engine_flags: if self.engine_flags {
+                    m.engine_flags
+                } else {
+                    None
+                },
                 wasm: if self.wasm { Some(m.wasm) } else { None },
                 phase: if self.phase { Some(m.phase) } else { None },
                 event: if self.event { Some(m.event) } else { None },
@@ -89,6 +103,7 @@ pub struct Key<'a> {
     pub wasm: Option<Cow<'a, str>>,
     pub phase: Option<Phase>,
     pub event: Option<Cow<'a, str>>,
+    pub engine_flags: Option<Cow<'a, str>>,
 }
 
 impl Key<'_> {
@@ -99,6 +114,10 @@ impl Key<'_> {
             && self.wasm.as_ref().is_none_or(|x| *x == m.wasm)
             && self.phase.as_ref().is_none_or(|x| *x == m.phase)
             && self.event.as_ref().is_none_or(|x| *x == m.event)
+            && self
+                .engine_flags
+                .as_ref()
+                .is_none_or(|x| Some(x) == m.engine_flags.as_ref())
     }
 }
 
@@ -115,6 +134,7 @@ mod tests {
             wasm: Some("bench.wasm".into()),
             phase: Some(Phase::Compilation),
             event: Some("cycles".into()),
+            engine_flags: Some("-Wfoo=bar".into()),
         };
 
         // More test cases are needed, but this provides a sanity check for the matched key and
@@ -128,7 +148,7 @@ mod tests {
             phase: Phase::Compilation,
             event: "cycles".into(),
             count: 42,
-            engine_flags: None,
+            engine_flags: Some("-Wfoo=bar".into()),
         }));
     }
 }
diff --git a/crates/analysis/src/summarize.rs b/crates/analysis/src/summarize.rs
@@ -3,7 +3,7 @@ use anyhow::Result;
 use sightglass_data::{Measurement, Summary};
 use std::io::Write;
 
-/// Summarize measurements grouped by: architecture, engine, benchmark file, phase and event.
+/// Summarize measurements grouped by: architecture, engine, flags, benchmark file, phase and event.
 pub fn calculate<'a>(measurements: &[Measurement<'a>]) -> Vec<Summary<'a>> {
     let mut summaries = Vec::new();
     for k in KeyBuilder::all().keys(measurements) {
@@ -15,6 +15,7 @@ pub fn calculate<'a>(measurements: &[Measurement<'a>]) -> Vec<Summary<'a>> {
         summaries.push(Summary {
             arch: k.arch.unwrap(),
             engine: k.engine.unwrap(),
+            engine_flags: k.engine_flags,
             wasm: k.wasm.unwrap(),
             phase: k.phase.unwrap(),
             event: k.event.unwrap(),
@@ -69,6 +70,7 @@ pub fn write(mut summaries: Vec<Summary<'_>>, output_file: &mut dyn Write) -> Re
             .then_with(|| x.wasm.cmp(&y.wasm))
             .then_with(|| x.event.cmp(&y.event))
             .then_with(|| x.engine.cmp(&y.engine))
+            .then_with(|| x.engine_flags.cmp(&y.engine_flags))
     });
 
     let mut last_phase = None;
@@ -93,9 +95,16 @@ pub fn write(mut summaries: Vec<Summary<'_>>, output_file: &mut dyn Write) -> Re
             writeln!(output_file, "    {}", summary.event)?;
         }
 
+        let engine_flags = match summary.engine_flags {
+            None => "".into(),
+            Some(ef) => {
+                format!(" ({ef})")
+            }
+        };
+
         writeln!(
             output_file,
-            "      [{} {:.2} {}] {}",
+            "      [{} {:.2} {}] {}{engine_flags}",
             summary.min, summary.mean, summary.max, summary.engine,
         )?;
     }
@@ -130,6 +139,7 @@ mod tests {
             vec![Summary {
                 arch: "x86".into(),
                 engine: "wasmtime".into(),
+                engine_flags: None,
                 wasm: "bench.wasm".into(),
                 phase: Phase::Compilation,
                 event: "cycles".into(),
diff --git a/crates/data/src/lib.rs b/crates/data/src/lib.rs
@@ -108,6 +108,9 @@ pub struct Summary<'a> {
     /// record this measurement.
     pub engine: Cow<'a, str>,
 
+    /// The flags, if any, used to record this measurement.
+    pub engine_flags: Option<Cow<'a, str>>,
+
     /// The file path of the Wasm benchmark program.
     pub wasm: Cow<'a, str>,
 
@@ -164,6 +167,13 @@ pub struct EffectSize<'a> {
     /// to record this measurement.
     pub a_engine: Cow<'a, str>,
 
+    /// The first engine flags being compared.
+    ///
+    /// When provided, this is a string capturing the engine flags passed
+    /// to the benchmark invocation that are in turn used to configure
+    /// wasmtime/cranelift.
+    pub a_engine_flags: Option<Cow<'a, str>>,
+
     /// The first engine's result's arithmetic mean of the `count` field.
     pub a_mean: f64,
 
@@ -173,6 +183,13 @@ pub struct EffectSize<'a> {
     /// to record this measurement.
     pub b_engine: Cow<'a, str>,
 
+    /// The second engine flags being compared.
+    ///
+    /// When provided, this is a string capturing the engine flags passed
+    /// to the benchmark invocation that are in turn used to configure
+    /// wasmtime/cranelift.
+    pub b_engine_flags: Option<Cow<'a, str>>,
+
     /// The second engine's result's arithmetic mean of the `count` field.
     pub b_mean: f64,