From bad8ca1c1f3d8a834cc63e901995e50879d1c84b Mon Sep 17 00:00:00 2001
From: Nathaniel Tucker <me@ntucker.me>
Date: Sun, 22 Mar 2026 08:38:28 -0400
Subject: [PATCH 1/6] =?UTF-8?q?demo(benchmark-react):=20reduce=20CI=20benc?=
 =?UTF-8?q?hmark=20variance=20below=20=C2=B110?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Increase warmup and measurement iterations for CI (small: 5+25, large: 3+20)
- Tighten convergence targets in CI (small: 15% → 5%, large: 20% → 8%)
- Switch from stddev to a MAD-based confidence-interval margin for robustness against outliers
- Increase inter-scenario GC settle time from 50ms to 200ms

Made-with: Cursor
---
 examples/benchmark-react/bench/runner.ts    |  2 +-
 examples/benchmark-react/bench/scenarios.ts | 16 +++---
 examples/benchmark-react/bench/stats.ts     | 58 +++++++++++++++------
 3 files changed, 52 insertions(+), 24 deletions(-)

diff --git a/examples/benchmark-react/bench/runner.ts b/examples/benchmark-react/bench/runner.ts
index e23ff88b3ca3..f6ed9b691653 100644
--- a/examples/benchmark-react/bench/runner.ts
+++ b/examples/benchmark-react/bench/runner.ts
@@ -443,7 +443,7 @@ async function runRound(
       try {
         await cdp.send('HeapProfiler.collectGarbage');
       } catch {}
-      await page.waitForTimeout(50);
+      await page.waitForTimeout(200);
 
       done++;
       const prefix = opts.showProgress ? `[${done}/${total}] ` : '';
diff --git a/examples/benchmark-react/bench/scenarios.ts b/examples/benchmark-react/bench/scenarios.ts
index ce8c756b572f..63099b594267 100644
--- a/examples/benchmark-react/bench/scenarios.ts
+++ b/examples/benchmark-react/bench/scenarios.ts
@@ -17,16 +17,16 @@ export interface RunProfile {
 
 export const RUN_CONFIG: Record<ScenarioSize, RunProfile> = {
   small: {
-    warmup: 3,
-    minMeasurement: 5,
-    maxMeasurement: process.env.CI ? 10 : 20,
-    targetMarginPct: process.env.CI ? 15 : 10,
+    warmup: 5,
+    minMeasurement: 8,
+    maxMeasurement: process.env.CI ? 25 : 20,
+    targetMarginPct: process.env.CI ? 5 : 10,
   },
   large: {
-    warmup: 1,
-    minMeasurement: 3,
-    maxMeasurement: process.env.CI ? 6 : 10,
-    targetMarginPct: process.env.CI ? 20 : 15,
+    warmup: 3,
+    minMeasurement: 5,
+    maxMeasurement: process.env.CI ? 20 : 10,
+    targetMarginPct: process.env.CI ? 8 : 15,
   },
 };
 
diff --git a/examples/benchmark-react/bench/stats.ts b/examples/benchmark-react/bench/stats.ts
index a0aad7ce618c..84751c00126e 100644
--- a/examples/benchmark-react/bench/stats.ts
+++ b/examples/benchmark-react/bench/stats.ts
@@ -15,6 +15,40 @@ function trimOutliers(sorted: number[]): number[] {
   return result.length >= 2 ? result : sorted;
 }
 
+/**
+ * Scaled Median Absolute Deviation of an ascending-sorted sample (the middle
+ * element is taken as the median) — a robust dispersion estimator with 50%
+ * breakdown point. The 1.4826 factor matches stddev for normal distributions.
+ */
+function scaledMAD(sorted: number[]): number {
+  const median = sorted[Math.floor(sorted.length / 2)]; // upper-middle element when length is even
+  const deviations = sorted
+    .map(x => Math.abs(x - median))
+    .sort((a, b) => a - b); // re-sort: absolute deviations are not in input order
+  const mad = deviations[Math.floor(deviations.length / 2)];
+  return 1.4826 * mad;
+}
+
+/**
+ * 95% CI margin (1.96 × dispersion / √n) of sorted `clean`, with scaled MAD
+ * as the dispersion. Falls back to sample stddev when MAD is zero (more than
+ * half the values equal the median) to avoid reporting ± 0 misleadingly.
+ */
+function ciMargin(clean: number[]): number {
+  const mad = scaledMAD(clean);
+  if (mad > 0) {
+    return 1.96 * (mad / Math.sqrt(clean.length));
+  }
+  const mean = clean.reduce((sum, x) => sum + x, 0) / clean.length;
+  const stdDev =
+    clean.length > 1 ?
+      Math.sqrt(
+        clean.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (clean.length - 1), // n − 1: sample variance
+      )
+    : 0; // a single sample has no spread to estimate
+  return 1.96 * (stdDev / Math.sqrt(clean.length));
+}
+
 /**
  * Check whether a scenario's samples have converged: 95% CI margin
  * is within targetMarginPct of the median.  Zero-variance metrics
@@ -33,19 +67,20 @@ export function isConverged(
   if (measured.length < minSamples) return false;
   const sorted = [...measured].sort((a, b) => a - b);
   const clean = trimOutliers(sorted);
-  const mean = clean.reduce((sum, x) => sum + x, 0) / clean.length;
-  if (mean === 0) return true;
-  const stdDev = Math.sqrt(
-    clean.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (clean.length - 1),
-  );
-  const margin = 1.96 * (stdDev / Math.sqrt(clean.length));
-  return (margin / Math.abs(mean)) * 100 <= targetMarginPct;
+  const median = clean[Math.floor(clean.length / 2)];
+  if (median === 0) return true;
+  const margin = ciMargin(clean);
+  return (margin / Math.abs(median)) * 100 <= targetMarginPct;
 }
 
 /**
  * Compute median, p95, and approximate 95% confidence interval from samples.
  * Discards warmup runs, then trims IQR outliers for median and CI
  * computation. p95 uses the full (untrimmed) sorted data.
+ *
+ * Uses MAD (Median Absolute Deviation) instead of stddev for the CI
+ * margin — MAD is far more robust to heavy-tailed distributions and
+ * residual outliers typical of browser benchmarks.
  */
 export function computeStats(
   samples: number[],
@@ -61,14 +96,7 @@ export function computeStats(
   const median = clean[Math.floor(clean.length / 2)] ?? 0;
   const p95Idx = Math.floor(sorted.length * 0.95);
   const p95 = sorted[Math.min(p95Idx, sorted.length - 1)] ?? median;
-  const mean = clean.reduce((sum, x) => sum + x, 0) / clean.length;
-  const stdDev =
-    clean.length > 1 ?
-      Math.sqrt(
-        clean.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (clean.length - 1),
-      )
-    : 0;
-  const margin = 1.96 * (stdDev / Math.sqrt(clean.length));
+  const margin = ciMargin(clean);
   return {
     median,
     p95,

From 3bda3f3cd35db9a3e37d6866a510c7f2d05d05c0 Mon Sep 17 00:00:00 2001
From: Nathaniel Tucker <me@ntucker.me>
Date: Sun, 22 Mar 2026 12:39:17 -0400
Subject: [PATCH 2/6] demo(benchmark-react): add in-page sub-iterations and
 reduce variance

- Run multiple ops per page visit (default 5), returning the median
  duration as one sample. Eliminates page-navigation overhead between
  measurements and dramatically reduces variance.
- Add resetStore() to BenchAPI for clearing caches between mount
  sub-iterations (data-client, tanstack-query, swr).
- Vary mutation data each sub-iteration (incrementing counter for
  titles, toggling moveItem direction) to ensure real DOM changes.
- Add waitForPaint between mutation sub-iterations to prevent
  server-resolution renders from bleeding into the next measurement.
- Report the 95% CI margin as a percentage of the median instead of an
  absolute value.
- Reduce warmup/minMeasurement counts since sub-iterations provide
  sufficient noise reduction.
- Fix SWR mount sub-iterations: add revalidateOnMount + dedupingInterval: 0
  to ensure fresh fetches after cache.clear().
- Update README with latest results showing ~6778% relative mutation
  throughput for data-client (baseline = 100%), up from ~4442% now that
  measurement is more accurate.

Made-with: Cursor
---
 examples/benchmark-react/README.md            |  40 +--
 examples/benchmark-react/bench/runner.ts      | 282 ++++++++++++------
 examples/benchmark-react/bench/scenarios.ts   |  18 +-
 examples/benchmark-react/bench/stats.ts       |   3 +-
 .../benchmark-react/src/baseline/index.tsx    |  27 +-
 .../benchmark-react/src/data-client/index.tsx |  31 +-
 .../src/shared/benchHarness.tsx               |  11 +-
 examples/benchmark-react/src/shared/types.ts  |   2 +
 examples/benchmark-react/src/swr/index.tsx    |  37 ++-
 .../src/tanstack-query/index.tsx              |  24 +-
 10 files changed, 330 insertions(+), 145 deletions(-)

diff --git a/examples/benchmark-react/README.md b/examples/benchmark-react/README.md
index a4c68fed2014..891e6a95b8e9 100644
--- a/examples/benchmark-react/README.md
+++ b/examples/benchmark-react/README.md
@@ -13,8 +13,8 @@ The repo has two benchmark suites:
 
 - **What we measure:** Wall-clock time from triggering an action (e.g. `init(100)` or `updateUser('user0')`) until a MutationObserver detects the expected DOM change in the benchmark container. Optionally we also record React Profiler commit duration and, with `BENCH_TRACE=true`, Chrome trace duration.
 - **Why:** Normalized caching should show wins on shared-entity updates (one store write, many components update), ref stability (fewer new object references), and derived-view memoization (`Query` schema avoids re-sorting when entities haven't changed). See [js-framework-benchmark "How the duration is measured"](https://github.com/krausest/js-framework-benchmark/wiki/How-the-duration-is-measured) for a similar timeline-based approach.
-- **Statistical:** Warmup runs are discarded; we report median and 95% CI. Libraries are interleaved per round to reduce environmental variance.
-- **No CPU throttling:** Runs at native speed with more samples for statistical significance rather than artificial slowdown. Small (cheap) scenarios use 3 warmup + 15 measurement runs locally (10 in CI); large (expensive) scenarios use 1 warmup + 4 measurement runs.
+- **Statistical:** Warmup runs are discarded; we report the median and the 95% CI margin (as a percentage of the median). Libraries are interleaved per round to reduce environmental variance. Each round runs several sub-iterations per page visit (default 5; override with `--ops-per-round` or `BENCH_OPS_PER_ROUND`) and records their median as one sample, further reducing per-sample noise.
+- **No CPU throttling:** Runs at native speed with more samples for statistical significance rather than artificial slowdown. Small (cheap) scenarios use 3 warmup + up to 15 measurement rounds locally; large (expensive) scenarios use 2 warmup + up to 8 measurement rounds. Early stopping triggers when 95% CI margin drops below the target percentage.
 
 ## Scenario categories
 
@@ -55,10 +55,10 @@ Illustrative **relative** results with **baseline = 100%** (plain React useState
 
 | Category | Scenarios (representative) | data-client | tanstack-query | swr | baseline |
 |---|---|---:|---:|---:|---:|
-| Navigation | `getlist-100`, `getlist-500`, `getlist-500-sorted` | ~95% | ~97% | ~99% | **100%** |
-| Navigation | `list-detail-switch-10` | **~851%** | ~233% | ~247% | 100% |
-| Mutations | `update-entity`, `update-user`, `update-entity-sorted`, `update-entity-multi-view`, `unshift-item`, `delete-item`, `move-item` | **~4442%** | ~97% | ~99% | 100% |
-| Scaling (10k items) | `update-user-10000` | **~6408%** | ~94% | ~100% | 100% |
+| Navigation | `getlist-100`, `getlist-500`, `getlist-500-sorted` | ~98% | ~100% | ~100% | **100%** |
+| Navigation | `list-detail-switch-10` | **~1354%** | ~233% | ~260% | 100% |
+| Mutations | `update-entity`, `update-user`, `update-entity-sorted`, `update-entity-multi-view`, `unshift-item`, `delete-item`, `move-item` | **~6778%** | ~97% | ~99% | 100% |
+| Scaling (10k items) | `update-user-10000` | **~9713%** | ~94% | ~100% | 100% |
 
 
 ## Latest measured results (network simulation on)
@@ -70,19 +70,19 @@ Run: **2026-03-22**, Linux (WSL2), `yarn build:benchmark-react`, static preview
 | Scenario | data-client | tanstack-query | swr | baseline |
 |---|---:|---:|---:|---:|
 | **Navigation** | | | | |
-| `getlist-100` | 18.48 ± 0.02 | 18.62 ± 0.07 | 19.12 ± 0.02 | 19.34 ± 0.09 |
-| `getlist-500` | 11.45 ± 0.21 | 11.92 ± 0.18 | 11.96 ± 0.04 | 12.06 ± 0.08 |
-| `getlist-500-sorted` | 11.48 ± 0.39 | 11.81 ± 0.22 | 12.00 ± 0.34 | 12.08 ± 0.37 |
-| `list-detail-switch-10` | 6.13 ± 0.74 | 1.68 ± 0.07 | 1.78 ± 0.12 | 0.72 ± 0.00 |
+| `getlist-100` | 20.28 ± 0.3% | 20.58 ± 1.2% | 20.58 ± 0.3% | 20.62 ± 0.3% |
+| `getlist-500` | 12.41 ± 0.4% | 12.61 ± 0.2% | 12.59 ± 0.2% | 12.63 ± 0.2% |
+| `getlist-500-sorted` | 12.55 ± 0.2% | 12.67 ± 1.3% | 12.72 ± 0.4% | 12.79 ± 0.4% |
+| `list-detail-switch-10` | 9.75 ± 1.7% | 1.68 ± 0.2% | 1.87 ± 0.9% | 0.72 ± 0.0% |
 | **Mutations** | | | | |
-| `update-entity` | 333.33 ± 4.22 | 6.95 ± 0.00 | 6.94 ± 0.02 | 7.17 ± 0.00 |
-| `update-user` | 322.58 ± 11.79 | 6.97 ± 0.01 | 7.15 ± 0.00 | 7.15 ± 0.02 |
-| `update-entity-sorted` | 285.71 ± 30.41 | 7.04 ± 0.01 | 7.05 ± 0.02 | 7.23 ± 0.01 |
-| `update-entity-multi-view` | 344.83 ± 16.69 | 5.89 ± 0.77 | 5.89 ± 0.82 | 5.97 ± 0.05 |
-| `update-user-10000` | 98.04 ± 5.79 | 1.44 ± 0.01 | 1.53 ± 0.00 | 1.53 ± 0.01 |
-| `unshift-item` | 285.71 ± 11.11 | 6.89 ± 0.02 | 7.11 ± 0.01 | 7.11 ± 0.01 |
-| `delete-item` | 312.50 ± 14.76 | 6.87 ± 0.01 | 7.09 ± 0.01 | 7.10 ± 0.00 |
-| `move-item` | 256.41 ± 8.77 | 6.34 ± 0.06 | 6.80 ± 0.01 | 6.77 ± 0.01 |
+| `update-entity` | 555.56 ± 2.9% | 7.00 ± 0.3% | 6.98 ± 0.1% | 7.18 ± 0.2% |
+| `update-user` | 625.00 ± 11.2% | 6.95 ± 0.1% | 7.15 ± 0.2% | 7.17 ± 0.2% |
+| `update-entity-sorted` | 476.19 ± 0.0% | 7.06 ± 0.3% | 7.06 ± 0.0% | 7.24 ± 0.0% |
+| `update-entity-multi-view` | 500.00 ± 3.4% | 7.05 ± 0.2% | 7.09 ± 0.1% | 7.25 ± 0.1% |
+| `update-user-10000` | 151.52 ± 2.8% | 1.47 ± 0.0% | 1.56 ± 0.1% | 1.56 ± 0.3% |
+| `unshift-item` | 434.78 ± 6.1% | 6.91 ± 0.2% | 7.13 ± 0.2% | 7.15 ± 0.3% |
+| `delete-item` | 526.32 ± 2.7% | 6.89 ± 0.0% | 7.13 ± 0.4% | 7.12 ± 0.0% |
+| `move-item` | 277.78 ± 3.9% | 6.52 ± 0.2% | 6.98 ± 0.6% | 6.87 ± 0.2% |
 
 [Measured on a Ryzen 9 7950X; 64 GB RAM; Ubuntu (WSL2); Node 24.12.0; Chromium (Playwright)]
 
@@ -189,8 +189,8 @@ Regressions >5% on stable scenarios or >15% on volatile scenarios are worth inve
 
    Scenarios are classified as `small` or `large` based on their cost:
 
-   - **Small** (3 warmup + 15 measurement): `getlist-100`, `update-entity`, `ref-stability-*`, `invalidate-and-resolve`, `unshift-item`, `delete-item`
-   - **Large** (1 warmup + 4 measurement): `getlist-500`, `getlist-500-sorted`, `update-user`, `update-user-10000`, `update-entity-sorted`, `update-entity-multi-view`, `list-detail-switch-10`
+   - **Small** (3 warmup + 4–15 measurement): `getlist-100`, `update-entity`, `ref-stability-*`, `invalidate-and-resolve`, `unshift-item`, `delete-item`
+   - **Large** (2 warmup + 3–8 measurement): `getlist-500`, `getlist-500-sorted`, `update-user`, `update-user-10000`, `update-entity-sorted`, `update-entity-multi-view`, `list-detail-switch-10`
    - **Memory** (opt-in, 1 warmup + 3 measurement): `memory-mount-unmount-cycle` — run with `--action memory`
 
    When running all scenarios (`yarn bench`), each group runs with its own warmup/measurement count. Use `--size` to run only one group.
diff --git a/examples/benchmark-react/bench/runner.ts b/examples/benchmark-react/bench/runner.ts
index f6ed9b691653..c9f0ac9e507f 100644
--- a/examples/benchmark-react/bench/runner.ts
+++ b/examples/benchmark-react/bench/runner.ts
@@ -26,6 +26,7 @@ function parseArgs(): {
   actions?: string[];
   scenario?: string;
   networkSim: boolean;
+  opsPerRound?: number;
 } {
   const argv = process.argv.slice(2);
   const get = (flag: string, envVar: string): string | undefined => {
@@ -39,6 +40,7 @@ function parseArgs(): {
   const actionRaw = get('--action', 'BENCH_ACTION');
   const scenarioRaw = get('--scenario', 'BENCH_SCENARIO');
   const networkSimRaw = get('--network-sim', 'BENCH_NETWORK_SIM');
+  const opsRaw = get('--ops-per-round', 'BENCH_OPS_PER_ROUND');
 
   const libs = libRaw ? libRaw.split(',').map(s => s.trim()) : undefined;
   const size = sizeRaw === 'small' || sizeRaw === 'large' ? sizeRaw : undefined;
@@ -46,14 +48,23 @@ function parseArgs(): {
     actionRaw ? actionRaw.split(',').map(s => s.trim()) : undefined;
   const networkSim =
     networkSimRaw != null ? networkSimRaw !== 'false' : !process.env.CI;
+  const opsPerRound = opsRaw ? parseInt(opsRaw, 10) : undefined;
 
-  return { libs, size, actions, scenario: scenarioRaw, networkSim };
+  return {
+    libs,
+    size,
+    actions,
+    scenario: scenarioRaw,
+    networkSim,
+    opsPerRound,
+  };
 }
 
 function filterScenarios(scenarios: Scenario[]): {
   filtered: Scenario[];
   libraries: string[];
   networkSim: boolean;
+  opsPerRound?: number;
 } {
   const {
     libs,
@@ -61,6 +72,7 @@ function filterScenarios(scenarios: Scenario[]): {
     actions,
     scenario: scenarioFilter,
     networkSim,
+    opsPerRound,
   } = parseArgs();
 
   const libraries = libs ?? (process.env.CI ? ['data-client'] : [...LIBRARIES]);
@@ -114,7 +126,7 @@ function filterScenarios(scenarios: Scenario[]): {
       !s.onlyLibs?.length || libraries.every(lib => s.onlyLibs!.includes(lib)),
   );
 
-  return { filtered, libraries, networkSim };
+  return { filtered, libraries, networkSim, opsPerRound };
 }
 
 // ---------------------------------------------------------------------------
@@ -200,6 +212,7 @@ async function runScenario(
     );
   }
 
+  // --- Memory path (unchanged, always ops=1) ---
   const isMemory =
     scenario.action === 'mountUnmountCycle' &&
     scenario.resultMetric === 'heapDelta';
@@ -236,17 +249,17 @@ async function runScenario(
     return { value: heapAfter - heapBefore };
   }
 
-  const isUpdate =
-    scenario.action === 'updateEntity' ||
-    scenario.action === 'updateEntityMultiView' ||
-    scenario.action === 'updateUser' ||
-    scenario.action === 'invalidateAndResolve' ||
-    scenario.action === 'unshiftItem' ||
-    scenario.action === 'deleteEntity' ||
-    scenario.action === 'moveItem';
-  const isRefStability = isRefStabilityScenario(scenario);
+  // --- Classify scenario ---
   const isInit = scenario.action === 'init';
+  const isMountLike =
+    isInit ||
+    scenario.action === 'mountSortedView' ||
+    scenario.action === 'initDoubleList' ||
+    scenario.action === 'listDetailSwitch';
+  const isUpdate = !isMountLike;
+  const isRefStability = isRefStabilityScenario(scenario);
 
+  // --- Pre-mount for update/ref-stability scenarios (once) ---
   const mountCount = scenario.mountCount ?? 100;
   if (isUpdate || isRefStability) {
     const preMountAction = scenario.preMountAction ?? 'init';
@@ -277,102 +290,175 @@ async function runScenario(
     });
   }
 
+  // --- Ref stability (deterministic, single run, early return) ---
   if (isRefStability) {
     await (bench as any).evaluate((api: any) => api.captureRefSnapshot());
-  }
 
-  await harness.evaluate(el => {
-    el.removeAttribute('data-bench-complete');
-    el.removeAttribute('data-bench-timeout');
-  });
-  const cdpTracing =
-    USE_TRACE && !isRefStability ?
-      await page.context().newCDPSession(page)
-    : undefined;
-  const traceChunks: object[] = [];
-  if (cdpTracing) {
-    cdpTracing.on('Tracing.dataCollected', (params: { value: object[] }) => {
-      traceChunks.push(...params.value);
+    await harness.evaluate(el => {
+      el.removeAttribute('data-bench-complete');
+      el.removeAttribute('data-bench-timeout');
     });
-    await cdpTracing.send('Tracing.start', {
-      categories: 'devtools.timeline,blink',
+    await page.evaluate(() => {
+      performance.clearMarks();
+      performance.clearMeasures();
     });
-  }
-
-  await page.evaluate(() => {
-    performance.clearMarks();
-    performance.clearMeasures();
-  });
 
-  await (bench as any).evaluate(
-    (api: any, { action, args }: { action: string; args: unknown[] }) => {
-      api[action](...args);
-    },
-    { action: scenario.action, args: scenario.args },
-  );
+    await (bench as any).evaluate(
+      (api: any, { action, args }: { action: string; args: unknown[] }) => {
+        api[action](...args);
+      },
+      { action: scenario.action, args: scenario.args },
+    );
 
-  const completeTimeout = networkSim ? 60000 : 10000;
-  await page.waitForSelector('[data-bench-complete]', {
-    timeout: completeTimeout,
-    state: 'attached',
-  });
+    const completeTimeout = networkSim ? 60000 : 10000;
+    await page.waitForSelector('[data-bench-complete]', {
+      timeout: completeTimeout,
+      state: 'attached',
+    });
+    const timedOut = await harness.evaluate(el =>
+      el.hasAttribute('data-bench-timeout'),
+    );
+    if (timedOut) {
+      throw new Error(
+        `Harness timeout: MutationObserver did not detect expected DOM update within 30 s`,
+      );
+    }
 
-  const timedOut = await harness.evaluate(el =>
-    el.hasAttribute('data-bench-timeout'),
-  );
-  if (timedOut) {
-    throw new Error(
-      `Harness timeout: MutationObserver did not detect expected DOM update within 30 s`,
+    const report = await (bench as any).evaluate((api: any) =>
+      api.getRefStabilityReport(),
     );
+    await bench.dispose();
+    return { value: report[scenario.resultMetric!] as number };
   }
 
-  await (bench as any).evaluate((api: any) => api.flushPendingMutations());
+  // --- Sub-iteration loop ---
+  const ops = effectiveOpsPerRound(scenario);
+  const durations: number[] = [];
+  const commitTimes: number[] = [];
+  const traceDurations: number[] = [];
+  const traceSubIdx = Math.floor(ops / 2);
+
+  for (let subIdx = 0; subIdx < ops; subIdx++) {
+    // Mount scenarios: unmount + detach + resetStore + waitForPaint (skip first iteration — nothing mounted yet)
+    if (isMountLike && subIdx > 0) {
+      await (bench as any).evaluate((api: any) => api.unmountAll());
+      await page
+        .waitForSelector('[data-bench-item], [data-sorted-list]', {
+          state: 'detached',
+          timeout: 10000,
+        })
+        .catch(() => {});
+      await (bench as any).evaluate((api: any) => {
+        if (api.resetStore) api.resetStore();
+      });
+      await page.evaluate(
+        () =>
+          new Promise<void>(r =>
+            requestAnimationFrame(() => requestAnimationFrame(() => r())),
+          ),
+      );
+    }
 
-  let traceDuration: number | undefined;
-  if (cdpTracing) {
-    try {
-      const done = new Promise<void>(resolve => {
-        cdpTracing!.on('Tracing.tracingComplete', () => resolve());
+    // Mutation scenarios: flush pending from prior sub-iteration + let React commit the resolution
+    if (isUpdate && subIdx > 0) {
+      await (bench as any).evaluate((api: any) => api.flushPendingMutations());
+      await page.evaluate(
+        () =>
+          new Promise<void>(r =>
+            requestAnimationFrame(() => requestAnimationFrame(() => r())),
+          ),
+      );
+    }
+
+    // Clear perf marks/measures + reset harness flags
+    await page.evaluate(() => {
+      performance.clearMarks();
+      performance.clearMeasures();
+    });
+    await harness.evaluate(el => {
+      el.removeAttribute('data-bench-complete');
+      el.removeAttribute('data-bench-timeout');
+    });
+
+    // Chrome tracing: only for the middle sub-iteration
+    const shouldTrace = USE_TRACE && subIdx === traceSubIdx;
+    let cdpTracing: CDPSession | undefined;
+    const traceChunks: object[] = [];
+    if (shouldTrace) {
+      cdpTracing = await page.context().newCDPSession(page);
+      cdpTracing.on('Tracing.dataCollected', (params: { value: object[] }) => {
+        traceChunks.push(...params.value);
+      });
+      await cdpTracing.send('Tracing.start', {
+        categories: 'devtools.timeline,blink',
       });
-      await cdpTracing.send('Tracing.end');
-      await done;
-      const traceJson =
-        '[\n' + traceChunks.map(e => JSON.stringify(e)).join(',\n') + '\n]';
-      traceDuration = parseTraceDuration(Buffer.from(traceJson));
-    } catch {
-      traceDuration = undefined;
-    } finally {
-      await cdpTracing.detach().catch(() => {});
     }
-  }
 
-  if (isRefStability && scenario.resultMetric) {
-    const report = await (bench as any).evaluate((api: any) =>
-      api.getRefStabilityReport(),
+    // Execute action (vary args for deleteEntity across sub-iterations)
+    const actionArgs =
+      scenario.action === 'deleteEntity' ? [subIdx + 1] : scenario.args;
+    await (bench as any).evaluate(
+      (api: any, { action, args }: { action: string; args: unknown[] }) => {
+        api[action](...args);
+      },
+      { action: scenario.action, args: actionArgs },
     );
-    await bench.dispose();
-    return { value: report[scenario.resultMetric] as number };
-  }
 
-  const measures = await collectMeasures(page);
-  const isMountLike =
-    isInit ||
-    scenario.action === 'mountSortedView' ||
-    scenario.action === 'initDoubleList' ||
-    scenario.action === 'listDetailSwitch';
-  const duration =
-    isMountLike ?
-      getMeasureDuration(measures, 'mount-duration')
-    : getMeasureDuration(measures, 'update-duration');
-  // Both mount-like and update scenarios trigger state updates (setItems/etc.),
-  // so React Profiler always fires with phase: 'update' for the measured action.
-  const reactCommit = getMeasureDuration(measures, 'react-commit-update');
+    // Wait for completion
+    const completeTimeout = networkSim ? 60000 : 10000;
+    await page.waitForSelector('[data-bench-complete]', {
+      timeout: completeTimeout,
+      state: 'attached',
+    });
+    const timedOut = await harness.evaluate(el =>
+      el.hasAttribute('data-bench-timeout'),
+    );
+    if (timedOut) {
+      throw new Error(
+        `Harness timeout: MutationObserver did not detect expected DOM update within 30 s`,
+      );
+    }
+
+    await (bench as any).evaluate((api: any) => api.flushPendingMutations());
+
+    // Collect trace
+    let traceDuration: number | undefined;
+    if (shouldTrace && cdpTracing) {
+      try {
+        const done = new Promise<void>(resolve => {
+          cdpTracing!.on('Tracing.tracingComplete', () => resolve());
+        });
+        await cdpTracing.send('Tracing.end');
+        await done;
+        const traceJson =
+          '[\n' + traceChunks.map(e => JSON.stringify(e)).join(',\n') + '\n]';
+        traceDuration = parseTraceDuration(Buffer.from(traceJson));
+      } catch {
+        traceDuration = undefined;
+      } finally {
+        await cdpTracing.detach().catch(() => {});
+      }
+    }
+
+    // Collect performance measures
+    const measures = await collectMeasures(page);
+    const duration =
+      isMountLike ?
+        getMeasureDuration(measures, 'mount-duration')
+      : getMeasureDuration(measures, 'update-duration');
+    const reactCommit = getMeasureDuration(measures, 'react-commit-update');
+
+    durations.push(duration);
+    if (reactCommit > 0) commitTimes.push(reactCommit);
+    if (traceDuration != null) traceDurations.push(traceDuration);
+  }
 
   await bench.dispose();
   return {
-    value: duration,
-    reactCommit: reactCommit > 0 ? reactCommit : undefined,
-    traceDuration,
+    value: simpleMedian(durations),
+    reactCommit: commitTimes.length > 0 ? simpleMedian(commitTimes) : undefined,
+    traceDuration:
+      traceDurations.length > 0 ? simpleMedian(traceDurations) : undefined,
   };
 }
 
@@ -380,6 +466,22 @@ async function runScenario(
 // Helpers
 // ---------------------------------------------------------------------------
 
+function effectiveOpsPerRound(scenario: Scenario): number { // sub-iterations per page visit; always >= 1
+  if (scenario.deterministic || scenario.category === 'memory') return 1; // single-shot
+  if (scenario.action === 'listDetailSwitch') return 1;
+  const ops = Math.floor(RUN_CONFIG[scenario.size ?? 'small'].opsPerRound);
+  return Math.max(1, ops || 1); // NaN/0/negative would run zero sub-iterations and report 0
+}
+
+function simpleMedian(arr: number[]): number {
+  if (arr.length === 0) return 0; // no samples: return 0 instead of NaN
+  const sorted = [...arr].sort((a, b) => a - b); // sort a copy; the caller's array is not mutated
+  const mid = Math.floor(sorted.length / 2);
+  return sorted.length % 2 === 0 ?
+      (sorted[mid - 1] + sorted[mid]) / 2 // even count: mean of the two middle values
+    : sorted[mid];
+}
+
 function shuffle<T>(arr: T[]): T[] {
   const out = [...arr];
   for (let i = out.length - 1; i > 0; i--) {
@@ -484,8 +586,14 @@ async function main() {
     filtered: SCENARIOS_TO_RUN,
     libraries,
     networkSim,
+    opsPerRound,
   } = filterScenarios(SCENARIOS);
 
+  if (opsPerRound != null) {
+    RUN_CONFIG.small.opsPerRound = opsPerRound;
+    RUN_CONFIG.large.opsPerRound = opsPerRound;
+  }
+
   if (networkSim) {
     process.stderr.write('Network simulation: ON\n');
   }
diff --git a/examples/benchmark-react/bench/scenarios.ts b/examples/benchmark-react/bench/scenarios.ts
index 63099b594267..aa1b1a8f345c 100644
--- a/examples/benchmark-react/bench/scenarios.ts
+++ b/examples/benchmark-react/bench/scenarios.ts
@@ -13,20 +13,26 @@ export interface RunProfile {
   maxMeasurement: number;
   /** Stop early when 95% CI margin is within this % of the median. */
   targetMarginPct: number;
+  /** Sub-iterations per page visit; median of N is returned as one sample. */
+  opsPerRound: number;
 }
 
+const defaultOpsPerRound = parseInt(process.env.BENCH_OPS_PER_ROUND ?? '5', 10); // NOTE(review): an empty or non-numeric env value parses to NaN (?? only covers unset)
+
 export const RUN_CONFIG: Record<ScenarioSize, RunProfile> = {
   small: {
-    warmup: 5,
-    minMeasurement: 8,
-    maxMeasurement: process.env.CI ? 25 : 20,
+    warmup: 3,
+    minMeasurement: 4,
+    maxMeasurement: process.env.CI ? 20 : 15,
     targetMarginPct: process.env.CI ? 5 : 10,
+    opsPerRound: defaultOpsPerRound,
   },
   large: {
-    warmup: 3,
-    minMeasurement: 5,
-    maxMeasurement: process.env.CI ? 20 : 10,
+    warmup: 2,
+    minMeasurement: 3,
+    maxMeasurement: process.env.CI ? 15 : 8,
     targetMarginPct: process.env.CI ? 8 : 15,
+    opsPerRound: defaultOpsPerRound,
   },
 };
 
diff --git a/examples/benchmark-react/bench/stats.ts b/examples/benchmark-react/bench/stats.ts
index 84751c00126e..56667529e8ce 100644
--- a/examples/benchmark-react/bench/stats.ts
+++ b/examples/benchmark-react/bench/stats.ts
@@ -97,9 +97,10 @@ export function computeStats(
   const p95Idx = Math.floor(sorted.length * 0.95);
   const p95 = sorted[Math.min(p95Idx, sorted.length - 1)] ?? median;
   const margin = ciMargin(clean);
+  const pct = median !== 0 ? (margin / Math.abs(median)) * 100 : 0;
   return {
     median,
     p95,
-    range: `± ${margin.toFixed(2)}`,
+    range: `± ${pct.toFixed(1)}%`,
   };
 }
diff --git a/examples/benchmark-react/src/baseline/index.tsx b/examples/benchmark-react/src/baseline/index.tsx
index 3011a175811c..4a5185f34e55 100644
--- a/examples/benchmark-react/src/baseline/index.tsx
+++ b/examples/benchmark-react/src/baseline/index.tsx
@@ -26,7 +26,15 @@ import {
   deleteIssue,
 } from '@shared/server';
 import type { Issue } from '@shared/types';
-import React, { useCallback, useEffect, useMemo, useState } from 'react';
+import React, {
+  useCallback,
+  useEffect,
+  useMemo,
+  useRef,
+  useState,
+} from 'react';
+
+let mutationCounter = 0;
 
 function SortedListView({
   limit,
@@ -188,10 +196,11 @@ function BenchmarkHarness() {
     (number: number) => {
       const issue = FIXTURE_ISSUES_BY_NUMBER.get(number);
       if (!issue) return;
+      const v = ++mutationCounter;
       measureUpdate(() =>
         updateIssue({
           number,
-          title: `${issue.title} (updated)`,
+          title: `${issue.title} (v${v})`,
         }).then(triggerRefetch),
       );
     },
@@ -202,10 +211,11 @@ function BenchmarkHarness() {
     (login: string) => {
       const user = FIXTURE_USERS_BY_LOGIN.get(login);
       if (!user) return;
+      const v = ++mutationCounter;
       measureUpdate(() =>
         serverUpdateUser({
           login,
-          name: `${user.name} (updated)`,
+          name: `${user.name} (v${v})`,
         }).then(triggerRefetch),
       );
     },
@@ -226,11 +236,15 @@ function BenchmarkHarness() {
     [measureUpdate, triggerRefetch],
   );
 
+  const moveStateRef = useRef<'open' | 'closed'>('closed');
+
   const moveItem = useCallback(
     (number: number) => {
+      const targetState = moveStateRef.current;
+      moveStateRef.current = targetState === 'closed' ? 'open' : 'closed';
       measureUpdate(
-        () => updateIssue({ number, state: 'closed' }).then(triggerRefetch),
-        () => moveItemIsReady(containerRef, number),
+        () => updateIssue({ number, state: targetState }).then(triggerRefetch),
+        () => moveItemIsReady(containerRef, number, targetState),
       );
     },
     [measureUpdate, triggerRefetch, containerRef],
@@ -240,7 +254,8 @@ function BenchmarkHarness() {
     (number: number) => {
       const issue = FIXTURE_ISSUES_BY_NUMBER.get(number);
       if (!issue) return;
-      const expected = `${issue.title} (updated)`;
+      const v = ++mutationCounter;
+      const expected = `${issue.title} (v${v})`;
       measureUpdate(
         () => updateIssue({ number, title: expected }).then(triggerRefetch),
         () => {
diff --git a/examples/benchmark-react/src/data-client/index.tsx b/examples/benchmark-react/src/data-client/index.tsx
index 6c2c57260201..97c435c2c876 100644
--- a/examples/benchmark-react/src/data-client/index.tsx
+++ b/examples/benchmark-react/src/data-client/index.tsx
@@ -31,7 +31,9 @@ import {
 } from '@shared/resources';
 import { getIssue, patchIssue } from '@shared/server';
 import type { Issue } from '@shared/types';
-import React, { useCallback } from 'react';
+import React, { useCallback, useRef } from 'react';
+
+let mutationCounter = 0;
 
 /** GCPolicy with no interval (won't fire during timing scenarios) and instant
  *  expiry so an explicit sweep() collects all unreferenced data immediately. */
@@ -147,11 +149,12 @@ function BenchmarkHarness() {
     (number: number) => {
       const issue = FIXTURE_ISSUES_BY_NUMBER.get(number);
       if (!issue) return;
+      const v = ++mutationCounter;
       measureUpdate(() => {
         controller.fetch(
           IssueResource.update,
           { number },
-          { title: `${issue.title} (updated)` },
+          { title: `${issue.title} (v${v})` },
         );
       });
     },
@@ -162,11 +165,12 @@ function BenchmarkHarness() {
     (login: string) => {
       const user = FIXTURE_USERS_BY_LOGIN.get(login);
       if (!user) return;
+      const v = ++mutationCounter;
       measureUpdate(() => {
         controller.fetch(
           UserResource.update,
           { login },
-          { name: `${user.name} (updated)` },
+          { name: `${user.name} (v${v})` },
         );
       });
     },
@@ -196,13 +200,21 @@ function BenchmarkHarness() {
     [measureUpdate, controller],
   );
 
+  const moveStateRef = useRef<'open' | 'closed'>('closed');
+
   const moveItem = useCallback(
     (number: number) => {
+      const targetState = moveStateRef.current;
+      moveStateRef.current = targetState === 'closed' ? 'open' : 'closed';
       measureUpdate(
         () => {
-          controller.fetch(IssueResource.move, { number }, { state: 'closed' });
+          controller.fetch(
+            IssueResource.move,
+            { number },
+            { state: targetState },
+          );
         },
-        () => moveItemIsReady(containerRef, number),
+        () => moveItemIsReady(containerRef, number, targetState),
       );
     },
     [measureUpdate, controller, containerRef],
@@ -242,7 +254,8 @@ function BenchmarkHarness() {
     (number: number) => {
       const issue = FIXTURE_ISSUES_BY_NUMBER.get(number);
       if (!issue) return;
-      const expected = `${issue.title} (updated)`;
+      const v = ++mutationCounter;
+      const expected = `${issue.title} (v${v})`;
       measureUpdate(
         () => {
           controller.fetch(
@@ -271,6 +284,11 @@ function BenchmarkHarness() {
     [measureUpdate, controller, containerRef],
   );
 
+  const resetStore = useCallback(
+    () => controller.resetEntireStore(),
+    [controller],
+  );
+
   registerAPI({
     updateEntity,
     updateUser,
@@ -280,6 +298,7 @@ function BenchmarkHarness() {
     deleteEntity,
     moveItem,
     triggerGC: () => benchGC.sweep(),
+    resetStore,
   });
 
   return (
diff --git a/examples/benchmark-react/src/shared/benchHarness.tsx b/examples/benchmark-react/src/shared/benchHarness.tsx
index 8661839db897..00c7341a6db5 100644
--- a/examples/benchmark-react/src/shared/benchHarness.tsx
+++ b/examples/benchmark-react/src/shared/benchHarness.tsx
@@ -36,16 +36,18 @@ const OBSERVE_MUTATIONS: MutationObserverInit = {
   characterData: true,
 };
 
-/** Check whether an issue has moved from the "open" to the "closed" state list. */
+/** Check whether an issue has moved to the target state list and left the source. */
 export function moveItemIsReady(
   containerRef: React.RefObject<HTMLDivElement | null>,
   number: number,
+  targetState: 'open' | 'closed' = 'closed',
 ): boolean {
+  const sourceState = targetState === 'closed' ? 'open' : 'closed';
   const source = containerRef.current?.querySelector(
-    '[data-state-list="open"]',
+    `[data-state-list="${sourceState}"]`,
   );
   const dest = containerRef.current?.querySelector(
-    '[data-state-list="closed"]',
+    `[data-state-list="${targetState}"]`,
   );
   return (
     source?.querySelector(`[data-issue-number="${number}"]`) == null &&
@@ -324,6 +326,8 @@ export function useBenchState() {
    * Libraries only pass their own actions + any overrides; standard actions
    * (init, unmountAll, etc.) are included automatically.
    */
+  const resetStoreNoop = useCallback(() => {}, []);
+
   const registerAPI = (libraryActions: LibraryActions) => {
     apiRef.current = {
       init,
@@ -340,6 +344,7 @@ export function useBenchState() {
       setNetworkSim,
       flushPendingMutations,
       setRenderLimit,
+      resetStore: resetStoreNoop,
       ...libraryActions,
     } as BenchAPI;
   };
diff --git a/examples/benchmark-react/src/shared/types.ts b/examples/benchmark-react/src/shared/types.ts
index 6bc9e48221e9..fdabce92d382 100644
--- a/examples/benchmark-react/src/shared/types.ts
+++ b/examples/benchmark-react/src/shared/types.ts
@@ -55,6 +55,8 @@ export interface BenchAPI {
   triggerGC?(): void;
   /** Cap DOM rendering to the first N items while keeping all data in the store. */
   setRenderLimit?(n: number | undefined): void;
+  /** Clear client-side cache/store so the next mount triggers a fresh fetch. Called between sub-iterations for mount scenarios. */
+  resetStore?(): void;
 }
 
 declare global {
diff --git a/examples/benchmark-react/src/swr/index.tsx b/examples/benchmark-react/src/swr/index.tsx
index f02f6677140e..e83c0ef8d9c2 100644
--- a/examples/benchmark-react/src/swr/index.tsx
+++ b/examples/benchmark-react/src/swr/index.tsx
@@ -19,9 +19,11 @@ import {
 import { setCurrentIssues } from '@shared/refStability';
 import { UserResource, IssueResource } from '@shared/resources';
 import type { Issue } from '@shared/types';
-import React, { useCallback, useMemo } from 'react';
+import React, { useCallback, useMemo, useRef } from 'react';
 import useSWR, { SWRConfig, useSWRConfig } from 'swr';
 
+let mutationCounter = 0;
+
 /** SWR fetcher: dispatches to shared resource fetch methods based on cache key */
 const fetcher = (key: string): Promise<any> => {
   if (key.startsWith('issue:'))
@@ -134,10 +136,11 @@ function BenchmarkHarness() {
     (number: number) => {
       const issue = FIXTURE_ISSUES_BY_NUMBER.get(number);
       if (!issue) return;
+      const v = ++mutationCounter;
       measureUpdate(() =>
         IssueResource.update(
           { number },
-          { title: `${issue.title} (updated)` },
+          { title: `${issue.title} (v${v})` },
         ).then(() =>
           mutate(key => typeof key === 'string' && key.startsWith('issues:')),
         ),
@@ -150,13 +153,14 @@ function BenchmarkHarness() {
     (login: string) => {
       const user = FIXTURE_USERS_BY_LOGIN.get(login);
       if (!user) return;
+      const v = ++mutationCounter;
       measureUpdate(
         () =>
-          UserResource.update(
-            { login },
-            { name: `${user.name} (updated)` },
-          ).then(() =>
-            mutate(key => typeof key === 'string' && key.startsWith('issues:')),
+          UserResource.update({ login }, { name: `${user.name} (v${v})` }).then(
+            () =>
+              mutate(
+                key => typeof key === 'string' && key.startsWith('issues:'),
+              ),
           ) as Promise<any>,
       );
     },
@@ -183,14 +187,18 @@ function BenchmarkHarness() {
     [measureUpdate, mutate],
   );
 
+  const moveStateRef = useRef<'open' | 'closed'>('closed');
+
   const moveItem = useCallback(
     (number: number) => {
+      const targetState = moveStateRef.current;
+      moveStateRef.current = targetState === 'closed' ? 'open' : 'closed';
       measureUpdate(
         () =>
-          IssueResource.update({ number }, { state: 'closed' }).then(() =>
+          IssueResource.update({ number }, { state: targetState }).then(() =>
             mutate(key => typeof key === 'string' && key.startsWith('issues:')),
           ),
-        () => moveItemIsReady(containerRef, number),
+        () => moveItemIsReady(containerRef, number, targetState),
       );
     },
     [measureUpdate, mutate, containerRef],
@@ -200,7 +208,8 @@ function BenchmarkHarness() {
     (number: number) => {
       const issue = FIXTURE_ISSUES_BY_NUMBER.get(number);
       if (!issue) return;
-      const expected = `${issue.title} (updated)`;
+      const v = ++mutationCounter;
+      const expected = `${issue.title} (v${v})`;
       measureUpdate(
         () =>
           IssueResource.update({ number }, { title: expected }).then(() =>
@@ -230,6 +239,11 @@ function BenchmarkHarness() {
     [measureUpdate, mutate, containerRef],
   );
 
+  const { cache } = useSWRConfig();
+  const resetStore = useCallback(() => {
+    if (typeof (cache as any).clear === 'function') (cache as any).clear();
+  }, [cache]);
+
   registerAPI({
     updateEntity,
     updateUser,
@@ -237,6 +251,7 @@ function BenchmarkHarness() {
     unshiftItem,
     deleteEntity,
     moveItem,
+    resetStore,
   });
 
   return (
@@ -261,6 +276,8 @@ function BenchProvider({ children }: { children: React.ReactNode }) {
         revalidateOnFocus: false,
         revalidateOnReconnect: false,
         revalidateIfStale: false,
+        revalidateOnMount: true,
+        dedupingInterval: 0,
       }}
     >
       {children}
diff --git a/examples/benchmark-react/src/tanstack-query/index.tsx b/examples/benchmark-react/src/tanstack-query/index.tsx
index 71d7b45855b8..f13a7a268358 100644
--- a/examples/benchmark-react/src/tanstack-query/index.tsx
+++ b/examples/benchmark-react/src/tanstack-query/index.tsx
@@ -25,7 +25,9 @@ import {
   useQuery,
   useQueryClient,
 } from '@tanstack/react-query';
-import React, { useCallback, useMemo } from 'react';
+import React, { useCallback, useMemo, useRef } from 'react';
+
+let mutationCounter = 0;
 
 function queryFn({ queryKey }: { queryKey: readonly unknown[] }): Promise<any> {
   const [type, id] = queryKey as [string, string | number | undefined];
@@ -160,10 +162,11 @@ function BenchmarkHarness() {
     (number: number) => {
       const issue = FIXTURE_ISSUES_BY_NUMBER.get(number);
       if (!issue) return;
+      const v = ++mutationCounter;
       measureUpdate(() =>
         IssueResource.update(
           { number },
-          { title: `${issue.title} (updated)` },
+          { title: `${issue.title} (v${v})` },
         ).then(() =>
           client.invalidateQueries({
             queryKey: ['issues'],
@@ -178,8 +181,9 @@ function BenchmarkHarness() {
     (login: string) => {
       const user = FIXTURE_USERS_BY_LOGIN.get(login);
       if (!user) return;
+      const v = ++mutationCounter;
       measureUpdate(() =>
-        UserResource.update({ login }, { name: `${user.name} (updated)` }).then(
+        UserResource.update({ login }, { name: `${user.name} (v${v})` }).then(
           () =>
             client.invalidateQueries({
               queryKey: ['issues'],
@@ -212,14 +216,18 @@ function BenchmarkHarness() {
     [measureUpdate, client],
   );
 
+  const moveStateRef = useRef<'open' | 'closed'>('closed');
+
   const moveItem = useCallback(
     (number: number) => {
+      const targetState = moveStateRef.current;
+      moveStateRef.current = targetState === 'closed' ? 'open' : 'closed';
       measureUpdate(
         () =>
-          IssueResource.update({ number }, { state: 'closed' }).then(() =>
+          IssueResource.update({ number }, { state: targetState }).then(() =>
             client.invalidateQueries({ queryKey: ['issues'] }),
           ),
-        () => moveItemIsReady(containerRef, number),
+        () => moveItemIsReady(containerRef, number, targetState),
       );
     },
     [measureUpdate, client, containerRef],
@@ -229,7 +237,8 @@ function BenchmarkHarness() {
     (number: number) => {
       const issue = FIXTURE_ISSUES_BY_NUMBER.get(number);
       if (!issue) return;
-      const expected = `${issue.title} (updated)`;
+      const v = ++mutationCounter;
+      const expected = `${issue.title} (v${v})`;
       measureUpdate(
         () =>
           IssueResource.update({ number }, { title: expected }).then(() =>
@@ -258,6 +267,8 @@ function BenchmarkHarness() {
     [measureUpdate, client, containerRef],
   );
 
+  const resetStore = useCallback(() => client.clear(), [client]);
+
   registerAPI({
     updateEntity,
     updateUser,
@@ -265,6 +276,7 @@ function BenchmarkHarness() {
     unshiftItem,
     deleteEntity,
     moveItem,
+    resetStore,
   });
 
   return (

From 8598d7250fbfb2830b730e32f6a12744cac22b99 Mon Sep 17 00:00:00 2001
From: Nathaniel Tucker <me@ntucker.me>
Date: Sun, 22 Mar 2026 12:55:41 -0400
Subject: [PATCH 3/6] fix: validate.ts expects "(vN)" titles (bugbot)

---
 examples/benchmark-react/bench/validate.ts | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/benchmark-react/bench/validate.ts b/examples/benchmark-react/bench/validate.ts
index 427d13293b7c..3006c11a896c 100644
--- a/examples/benchmark-react/bench/validate.ts
+++ b/examples/benchmark-react/bench/validate.ts
@@ -187,19 +187,19 @@ test('updateEntity changes issue title in DOM', async (page, lib) => {
 
   await waitFor(
     page,
-    async () => (await getIssueTitles(page))[1]?.includes('(updated)') ?? false,
-    'issue #1 title contains "(updated)"',
+    async () => (await getIssueTitles(page))[1]?.includes('(v') ?? false,
+    'issue #1 title contains "(v"',
   );
 
   const titles = await getIssueTitles(page);
   assert(
-    titles[1]?.includes('(updated)'),
+    titles[1]?.includes('(v'),
     lib,
     'updateEntity',
-    `issue #1 should contain "(updated)", got "${titles[1]}"`,
+    `issue #1 should contain "(v…)", got "${titles[1]}"`,
   );
   assert(
-    !titles[2]?.includes('(updated)'),
+    !titles[2]?.includes('(v'),
     lib,
     'updateEntity unchanged',
     `issue #2 should be unchanged, got "${titles[2]}"`,
@@ -239,7 +239,7 @@ test('ref-stability after updateEntity', async (page, lib) => {
 
   await waitFor(
     page,
-    async () => (await getIssueTitles(page))[1]?.includes('(updated)') ?? false,
+    async () => (await getIssueTitles(page))[1]?.includes('(v') ?? false,
     'issue #1 title updated before ref check',
   );
 
@@ -552,7 +552,7 @@ test('updateEntity timing: DOM reflects change at measurement end', async (page,
 
   const titles = await getIssueTitles(page);
   assert(
-    titles[1]?.includes('(updated)') ?? false,
+    titles[1]?.includes('(v') ?? false,
     lib,
     'updateEntity timing',
     `DOM not updated when data-bench-complete fired. ` +

From be1071763c1ab4577a1d3e04c765785f70715d1f Mon Sep 17 00:00:00 2001
From: Nathaniel Tucker <me@ntucker.me>
Date: Sun, 22 Mar 2026 12:56:34 -0400
Subject: [PATCH 4/6] demo(benchmark-react): reduce warmup/measurement counts
 for faster CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With 5 sub-iterations per round providing sufficient noise reduction,
lower warmup (small: 3→2, large: 2→1) and max measurement caps
(small CI: 20→15, large CI: 15→12) to cut ~30-40s from CI runtime.

Made-with: Cursor
---
 examples/benchmark-react/README.md          |  6 +++---
 examples/benchmark-react/bench/scenarios.ts | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/benchmark-react/README.md b/examples/benchmark-react/README.md
index 891e6a95b8e9..10d55c6ef321 100644
--- a/examples/benchmark-react/README.md
+++ b/examples/benchmark-react/README.md
@@ -14,7 +14,7 @@ The repo has two benchmark suites:
 - **What we measure:** Wall-clock time from triggering an action (e.g. `init(100)` or `updateUser('user0')`) until a MutationObserver detects the expected DOM change in the benchmark container. Optionally we also record React Profiler commit duration and, with `BENCH_TRACE=true`, Chrome trace duration.
 - **Why:** Normalized caching should show wins on shared-entity updates (one store write, many components update), ref stability (fewer new object references), and derived-view memoization (`Query` schema avoids re-sorting when entities haven't changed). See [js-framework-benchmark "How the duration is measured"](https://github.com/krausest/js-framework-benchmark/wiki/How-the-duration-is-measured) for a similar timeline-based approach.
 - **Statistical:** Warmup runs are discarded; we report median and 95% CI (as percentage of median). Libraries are interleaved per round to reduce environmental variance. Each round runs 5 sub-iterations per page visit and reports the median, further reducing per-sample noise.
-- **No CPU throttling:** Runs at native speed with more samples for statistical significance rather than artificial slowdown. Small (cheap) scenarios use 3 warmup + up to 15 measurement rounds locally; large (expensive) scenarios use 2 warmup + up to 8 measurement rounds. Early stopping triggers when 95% CI margin drops below the target percentage.
+- **No CPU throttling:** Runs at native speed with more samples for statistical significance rather than artificial slowdown. Small (cheap) scenarios use 2 warmup + up to 12 measurement rounds locally; large (expensive) scenarios use 1 warmup + up to 6 measurement rounds. Early stopping triggers when 95% CI margin drops below the target percentage.
 
 ## Scenario categories
 
@@ -189,8 +189,8 @@ Regressions >5% on stable scenarios or >15% on volatile scenarios are worth inve
 
    Scenarios are classified as `small` or `large` based on their cost:
 
-   - **Small** (3 warmup + 4–15 measurement): `getlist-100`, `update-entity`, `ref-stability-*`, `invalidate-and-resolve`, `unshift-item`, `delete-item`
-   - **Large** (2 warmup + 3–8 measurement): `getlist-500`, `getlist-500-sorted`, `update-user`, `update-user-10000`, `update-entity-sorted`, `update-entity-multi-view`, `list-detail-switch-10`
+   - **Small** (2 warmup + 3–12 measurement): `getlist-100`, `update-entity`, `ref-stability-*`, `invalidate-and-resolve`, `unshift-item`, `delete-item`
+   - **Large** (1 warmup + 3–6 measurement): `getlist-500`, `getlist-500-sorted`, `update-user`, `update-user-10000`, `update-entity-sorted`, `update-entity-multi-view`, `list-detail-switch-10`
    - **Memory** (opt-in, 1 warmup + 3 measurement): `memory-mount-unmount-cycle` — run with `--action memory`
 
    When running all scenarios (`yarn bench`), each group runs with its own warmup/measurement count. Use `--size` to run only one group.
diff --git a/examples/benchmark-react/bench/scenarios.ts b/examples/benchmark-react/bench/scenarios.ts
index aa1b1a8f345c..2213d44d7f41 100644
--- a/examples/benchmark-react/bench/scenarios.ts
+++ b/examples/benchmark-react/bench/scenarios.ts
@@ -21,16 +21,16 @@ const defaultOpsPerRound = parseInt(process.env.BENCH_OPS_PER_ROUND ?? '5', 10);
 
 export const RUN_CONFIG: Record<ScenarioSize, RunProfile> = {
   small: {
-    warmup: 3,
-    minMeasurement: 4,
-    maxMeasurement: process.env.CI ? 20 : 15,
+    warmup: 2,
+    minMeasurement: 3,
+    maxMeasurement: process.env.CI ? 15 : 12,
     targetMarginPct: process.env.CI ? 5 : 10,
     opsPerRound: defaultOpsPerRound,
   },
   large: {
-    warmup: 2,
+    warmup: 1,
     minMeasurement: 3,
-    maxMeasurement: process.env.CI ? 15 : 8,
+    maxMeasurement: process.env.CI ? 12 : 6,
     targetMarginPct: process.env.CI ? 8 : 15,
     opsPerRound: defaultOpsPerRound,
   },

From f06f7306ca35615c786fba421520c39c9e13b03c Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Sun, 22 Mar 2026 17:10:25 +0000
Subject: [PATCH 5/6] Fix benchmark range format for single sample

Co-authored-by: Nathaniel Tucker <me@ntucker.me>
---
 examples/benchmark-react/bench/stats.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/benchmark-react/bench/stats.ts b/examples/benchmark-react/bench/stats.ts
index 56667529e8ce..b0559bb9349a 100644
--- a/examples/benchmark-react/bench/stats.ts
+++ b/examples/benchmark-react/bench/stats.ts
@@ -89,7 +89,7 @@ export function computeStats(
   const measured = samples.slice(warmupCount);
   if (measured.length <= 1) {
     const v = measured[0] ?? 0;
-    return { median: v, p95: v, range: '± 0' };
+    return { median: v, p95: v, range: '± 0.0%' };
   }
   const sorted = [...measured].sort((a, b) => a - b);
   const clean = trimOutliers(sorted);

From c94a63accc1f9e3ca9ada75dae9614e33299b544 Mon Sep 17 00:00:00 2001
From: Nathaniel Tucker <me@ntucker.me>
Date: Sun, 22 Mar 2026 15:35:48 -0400
Subject: [PATCH 6/6] demo(benchmark-react): fix stats bugs and add
 per-scenario opsPerRound

- Replace z=1.96 with t-distribution critical values for accurate CI
  on small samples (n=3-15)
- Fix even-length median calculation in scaledMAD/isConverged/computeStats
- Fix median===0 premature convergence (now requires margin===0 too)
- Fix invalidateAndResolve title accumulation bug using fixture data
- Clamp deleteEntity sub-iteration args to mountCount bound
- Remove dead cdp parameter from runScenario
- Add per-scenario opsPerRound override to Scenario type
- Set opsPerRound=9 for update-entity-sorted, =5 for list-detail-switch-10
- Update README with remeasured results and variance tiers

Made-with: Cursor
---
 examples/benchmark-react/README.md            | 36 ++++----
 examples/benchmark-react/bench/runner.ts      | 14 +--
 examples/benchmark-react/bench/scenarios.ts   |  2 +
 examples/benchmark-react/bench/stats.ts       | 86 ++++++++++++++-----
 .../benchmark-react/src/data-client/index.tsx | 13 +--
 examples/benchmark-react/src/shared/types.ts  |  2 +
 6 files changed, 102 insertions(+), 51 deletions(-)

diff --git a/examples/benchmark-react/README.md b/examples/benchmark-react/README.md
index 10d55c6ef321..6dcd2d2a2995 100644
--- a/examples/benchmark-react/README.md
+++ b/examples/benchmark-react/README.md
@@ -13,7 +13,7 @@ The repo has two benchmark suites:
 
 - **What we measure:** Wall-clock time from triggering an action (e.g. `init(100)` or `updateUser('user0')`) until a MutationObserver detects the expected DOM change in the benchmark container. Optionally we also record React Profiler commit duration and, with `BENCH_TRACE=true`, Chrome trace duration.
 - **Why:** Normalized caching should show wins on shared-entity updates (one store write, many components update), ref stability (fewer new object references), and derived-view memoization (`Query` schema avoids re-sorting when entities haven't changed). See [js-framework-benchmark "How the duration is measured"](https://github.com/krausest/js-framework-benchmark/wiki/How-the-duration-is-measured) for a similar timeline-based approach.
-- **Statistical:** Warmup runs are discarded; we report median and 95% CI (as percentage of median). Libraries are interleaved per round to reduce environmental variance. Each round runs 5 sub-iterations per page visit and reports the median, further reducing per-sample noise.
+- **Statistical:** Warmup runs are discarded; we report median and 95% CI (as percentage of median). Libraries are interleaved per round to reduce environmental variance. Each round runs multiple sub-iterations per page visit and reports the median, further reducing per-sample noise. The default is 5 sub-iterations; individual scenarios can override this via `opsPerRound` in `bench/scenarios.ts` (e.g. `update-entity-sorted` uses 9, `list-detail-switch-10` uses 5).
 - **No CPU throttling:** Runs at native speed with more samples for statistical significance rather than artificial slowdown. Small (cheap) scenarios use 2 warmup + up to 12 measurement rounds locally; large (expensive) scenarios use 1 warmup + up to 6 measurement rounds. Early stopping triggers when 95% CI margin drops below the target percentage.
 
 ## Scenario categories
@@ -55,9 +55,9 @@ Illustrative **relative** results with **baseline = 100%** (plain React useState
 
 | Category | Scenarios (representative) | data-client | tanstack-query | swr | baseline |
 |---|---|---:|---:|---:|---:|
-| Navigation | `getlist-100`, `getlist-500`, `getlist-500-sorted` | ~98% | ~100% | ~100% | **100%** |
-| Navigation | `list-detail-switch-10` | **~1354%** | ~233% | ~260% | 100% |
-| Mutations | `update-entity`, `update-user`, `update-entity-sorted`, `update-entity-multi-view`, `unshift-item`, `delete-item`, `move-item` | **~6778%** | ~97% | ~99% | 100% |
+| Navigation | `getlist-100`, `getlist-500`, `getlist-500-sorted` | ~97% | ~100% | ~100% | **100%** |
+| Navigation | `list-detail-switch-10` | **~1652%** | ~231% | ~230% | 100% |
+| Mutations | `update-entity`, `update-user`, `update-entity-sorted`, `update-entity-multi-view`, `unshift-item`, `delete-item`, `move-item` | **~6994%** | ~97% | ~99% | 100% |
 | Scaling (10k items) | `update-user-10000` | **~9713%** | ~94% | ~100% | 100% |
 
 
@@ -70,19 +70,19 @@ Run: **2026-03-22**, Linux (WSL2), `yarn build:benchmark-react`, static preview
 | Scenario | data-client | tanstack-query | swr | baseline |
 |---|---:|---:|---:|---:|
 | **Navigation** | | | | |
-| `getlist-100` | 20.28 ± 0.3% | 20.58 ± 1.2% | 20.58 ± 0.3% | 20.62 ± 0.3% |
-| `getlist-500` | 12.41 ± 0.4% | 12.61 ± 0.2% | 12.59 ± 0.2% | 12.63 ± 0.2% |
-| `getlist-500-sorted` | 12.55 ± 0.2% | 12.67 ± 1.3% | 12.72 ± 0.4% | 12.79 ± 0.4% |
-| `list-detail-switch-10` | 9.75 ± 1.7% | 1.68 ± 0.2% | 1.87 ± 0.9% | 0.72 ± 0.0% |
+| `getlist-100` | 20.16 ± 0.7% | 20.58 ± 0.8% | 20.58 ± 0.8% | 20.58 ± 0.0% |
+| `getlist-500` | 12.05 ± 0.9% | 12.55 ± 0.0% | 12.61 ± 0.9% | 12.69 ± 1.4% |
+| `getlist-500-sorted` | 12.56 ± 1.4% | 12.72 ± 0.5% | 12.79 ± 0.9% | 12.80 ± 1.4% |
+| `list-detail-switch-10` | 12.06 ± 12.5% | 1.69 ± 1.0% | 1.68 ± 1.1% | 0.73 ± 0.1% |
 | **Mutations** | | | | |
-| `update-entity` | 555.56 ± 2.9% | 7.00 ± 0.3% | 6.98 ± 0.1% | 7.18 ± 0.2% |
-| `update-user` | 625.00 ± 11.2% | 6.95 ± 0.1% | 7.15 ± 0.2% | 7.17 ± 0.2% |
-| `update-entity-sorted` | 476.19 ± 0.0% | 7.06 ± 0.3% | 7.06 ± 0.0% | 7.24 ± 0.0% |
-| `update-entity-multi-view` | 500.00 ± 3.4% | 7.05 ± 0.2% | 7.09 ± 0.1% | 7.25 ± 0.1% |
-| `update-user-10000` | 151.52 ± 2.8% | 1.47 ± 0.0% | 1.56 ± 0.1% | 1.56 ± 0.3% |
-| `unshift-item` | 434.78 ± 6.1% | 6.91 ± 0.2% | 7.13 ± 0.2% | 7.15 ± 0.3% |
-| `delete-item` | 526.32 ± 2.7% | 6.89 ± 0.0% | 7.13 ± 0.4% | 7.12 ± 0.0% |
-| `move-item` | 277.78 ± 3.9% | 6.52 ± 0.2% | 6.98 ± 0.6% | 6.87 ± 0.2% |
+| `update-entity` | 555.56 ± 8.4% | 6.99 ± 0.3% | 6.99 ± 0.3% | 7.17 ± 0.3% |
+| `update-user` | 571.90 ± 12.8% | 6.94 ± 0.5% | 7.18 ± 0.0% | 7.16 ± 0.0% |
+| `update-entity-sorted` | 588.24 ± 8.0% | 7.10 ± 0.3% | 7.09 ± 0.4% | 7.28 ± 0.0% |
+| `update-entity-multi-view` | 555.56 ± 0.0% | 7.06 ± 0.3% | 7.08 ± 0.3% | 7.26 ± 0.2% |
+| `update-user-10000` | 151.52 ± 10.8% | 1.46 ± 0.5% | 1.56 ± 0.2% | 1.56 ± 1.3% |
+| `unshift-item` | 425.72 ± 5.0% | 6.90 ± 0.1% | 7.13 ± 0.3% | 7.14 ± 0.3% |
+| `delete-item` | 526.32 ± 7.2% | 6.89 ± 0.3% | 7.13 ± 0.5% | 7.12 ± 1.0% |
+| `move-item` | 285.71 ± 4.0% | 6.55 ± 0.5% | 6.99 ± 0.5% | 6.92 ± 0.8% |
 
 [Measured on a Ryzen 9 7950X; 64 GB RAM; Ubuntu (WSL2); Node 24.12.0; Chromium (Playwright)]
 
@@ -90,8 +90,8 @@ Run: **2026-03-22**, Linux (WSL2), `yarn build:benchmark-react`, static preview
 
 | Category | Scenarios | Typical run-to-run spread |
 |---|---|---|
-| **Stable** | `getlist-*`, `update-entity`, `ref-stability-*` | 2-5% |
-| **Moderate** | `update-user-*`, `update-entity-sorted`, `update-entity-multi-view` | 5-10% |
+| **Stable** | `getlist-*`, `update-entity`, `update-entity-sorted`, `ref-stability-*` | 2-5% |
+| **Moderate** | `update-user-*`, `update-entity-multi-view`, `list-detail-switch-10` | 5-10% |
 | **Volatile** | `memory-mount-unmount-cycle`, `startup-*`, `(react commit)` suffixes | 10-25% |
 
 Regressions >5% on stable scenarios or >15% on volatile scenarios are worth investigating.
diff --git a/examples/benchmark-react/bench/runner.ts b/examples/benchmark-react/bench/runner.ts
index c9f0ac9e507f..88cd0ee96f19 100644
--- a/examples/benchmark-react/bench/runner.ts
+++ b/examples/benchmark-react/bench/runner.ts
@@ -178,7 +178,6 @@ async function runScenario(
   lib: string,
   scenario: Scenario,
   networkSim: boolean,
-  cdp?: CDPSession,
 ): Promise<ScenarioResult> {
   const appPath = `/${lib}/`;
   await page.goto(`${BASE_URL}${appPath}`, {
@@ -394,9 +393,11 @@ async function runScenario(
       });
     }
 
-    // Execute action (vary args for deleteEntity across sub-iterations)
+    // Vary args for deleteEntity so each sub-iteration deletes a different item
     const actionArgs =
-      scenario.action === 'deleteEntity' ? [subIdx + 1] : scenario.args;
+      scenario.action === 'deleteEntity' ?
+        [Math.min(subIdx + 1, mountCount)]
+      : scenario.args;
     await (bench as any).evaluate(
       (api: any, { action, args }: { action: string; args: unknown[] }) => {
         api[action](...args);
@@ -469,8 +470,9 @@ async function runScenario(
 function effectiveOpsPerRound(scenario: Scenario): number {
   if (scenario.deterministic) return 1;
   if (scenario.category === 'memory') return 1;
-  if (scenario.action === 'listDetailSwitch') return 1;
-  return RUN_CONFIG[scenario.size ?? 'small'].opsPerRound;
+  return (
+    scenario.opsPerRound ?? RUN_CONFIG[scenario.size ?? 'small'].opsPerRound
+  );
 }
 
 function simpleMedian(arr: number[]): number {
@@ -550,7 +552,7 @@ async function runRound(
       done++;
       const prefix = opts.showProgress ? `[${done}/${total}] ` : '';
       try {
-        const result = await runScenario(page, lib, scenario, networkSim, cdp);
+        const result = await runScenario(page, lib, scenario, networkSim);
         recordResult(samples, scenario, result);
         const unit = scenarioUnit(scenario);
         const displayValue =
diff --git a/examples/benchmark-react/bench/scenarios.ts b/examples/benchmark-react/bench/scenarios.ts
index 2213d44d7f41..2ce7df4f56ec 100644
--- a/examples/benchmark-react/bench/scenarios.ts
+++ b/examples/benchmark-react/bench/scenarios.ts
@@ -123,6 +123,7 @@ const BASE_SCENARIOS: BaseScenario[] = [
     renderLimit: 100,
     preMountAction: 'mountSortedView',
     size: 'large',
+    opsPerRound: 9,
   },
   {
     nameSuffix: 'update-entity-multi-view',
@@ -141,6 +142,7 @@ const BASE_SCENARIOS: BaseScenario[] = [
     category: 'hotPath',
     size: 'large',
     renderLimit: 100,
+    opsPerRound: 5,
   },
   {
     nameSuffix: 'update-user-10000',
diff --git a/examples/benchmark-react/bench/stats.ts b/examples/benchmark-react/bench/stats.ts
index b0559bb9349a..fd5476458301 100644
--- a/examples/benchmark-react/bench/stats.ts
+++ b/examples/benchmark-react/bench/stats.ts
@@ -1,3 +1,50 @@
+/**
+ * Two-tailed t critical values for 95% CI (α = 0.05) keyed by degrees of
+ * freedom. Gaps are interpolated; df > 30 reuses the df = 30 value (2.042).
+ */
+const T_CRIT_95: Record<number, number> = {
+  1: 12.706,
+  2: 4.303,
+  3: 3.182,
+  4: 2.776,
+  5: 2.571,
+  6: 2.447,
+  7: 2.365,
+  8: 2.306,
+  9: 2.262,
+  10: 2.228,
+  11: 2.201,
+  12: 2.179,
+  13: 2.16,
+  14: 2.145,
+  15: 2.131,
+  20: 2.086,
+  25: 2.06,
+  30: 2.042,
+};
+
+function tCrit95(n: number): number {
+  const df = n - 1; // degrees of freedom for a sample of size n
+  if (df <= 0) return 1.96; // n <= 1: no t value, use z
+  if (df in T_CRIT_95) return T_CRIT_95[df]; // exact table entry
+  const keys = Object.keys(T_CRIT_95)
+    .map(Number)
+    .sort((a, b) => a - b);
+  const lower = keys.filter(k => k <= df).pop(); // nearest key at or below df
+  const upper = keys.find(k => k >= df); // nearest key at or above df
+  if (lower == null) return T_CRIT_95[keys[0]]; // df below the table
+  if (upper == null || lower === upper) return T_CRIT_95[lower];
+  const frac = (df - lower) / (upper - lower); // linear interpolation
+  return T_CRIT_95[lower] + frac * (T_CRIT_95[upper] - T_CRIT_95[lower]);
+}
+
+function sortedMedian(sorted: number[]): number {
+  const mid = Math.floor(sorted.length / 2); // upper-middle when length is even
+  return sorted.length % 2 === 0 ?
+      (sorted[mid - 1] + sorted[mid]) / 2
+    : sorted[mid];
+}
+
 /**
  * Remove outliers using the IQR method (1.5×IQR fence).
  * Input must be sorted ascending. Falls back to the full array when
@@ -21,38 +68,35 @@ function trimOutliers(sorted: number[]): number[] {
  * stddev for normal distributions.
  */
 function scaledMAD(sorted: number[]): number {
-  const median = sorted[Math.floor(sorted.length / 2)];
-  const deviations = sorted
-    .map(x => Math.abs(x - median))
-    .sort((a, b) => a - b);
-  const mad = deviations[Math.floor(deviations.length / 2)];
-  return 1.4826 * mad;
+  const med = sortedMedian(sorted);
+  const deviations = sorted.map(x => Math.abs(x - med)).sort((a, b) => a - b);
+  return 1.4826 * sortedMedian(deviations);
 }
 
 /**
- * Compute the 95% CI margin using MAD-based dispersion.
- * Falls back to stddev when MAD is zero (all values identical
- * except outliers) to avoid reporting ± 0 misleadingly.
+ * Compute the 95% CI margin using MAD-based dispersion and t-distribution
+ * critical values for small samples. Falls back to stddev when MAD is zero
+ * (all values identical except outliers) to avoid reporting ± 0 misleadingly.
  */
 function ciMargin(clean: number[]): number {
+  if (clean.length < 2) return 0;
+  const t = tCrit95(clean.length);
   const mad = scaledMAD(clean);
   if (mad > 0) {
-    return 1.96 * (mad / Math.sqrt(clean.length));
+    return t * (mad / Math.sqrt(clean.length));
   }
   const mean = clean.reduce((sum, x) => sum + x, 0) / clean.length;
-  const stdDev =
-    clean.length > 1 ?
-      Math.sqrt(
-        clean.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (clean.length - 1),
-      )
-    : 0;
-  return 1.96 * (stdDev / Math.sqrt(clean.length));
+  const stdDev = Math.sqrt(
+    clean.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (clean.length - 1),
+  );
+  return t * (stdDev / Math.sqrt(clean.length));
 }
 
 /**
  * Check whether a scenario's samples have converged: 95% CI margin
  * is within targetMarginPct of the median.  Zero-variance metrics
- * (e.g. ref-stability counts) converge after minSamples.
+ * (e.g. ref-stability counts) whose median is 0 converge after
+ * minSamples only when the margin is also zero.
  *
  * Outliers are trimmed via IQR before computing the CI so that a
  * single GC spike doesn't prevent convergence.
@@ -67,9 +111,9 @@ export function isConverged(
   if (measured.length < minSamples) return false;
   const sorted = [...measured].sort((a, b) => a - b);
   const clean = trimOutliers(sorted);
-  const median = clean[Math.floor(clean.length / 2)];
-  if (median === 0) return true;
+  const median = sortedMedian(clean);
   const margin = ciMargin(clean);
+  if (median === 0) return margin === 0;
   return (margin / Math.abs(median)) * 100 <= targetMarginPct;
 }
 
@@ -93,7 +137,7 @@ export function computeStats(
   }
   const sorted = [...measured].sort((a, b) => a - b);
   const clean = trimOutliers(sorted);
-  const median = clean[Math.floor(clean.length / 2)] ?? 0;
+  const median = sortedMedian(clean);
   const p95Idx = Math.floor(sorted.length * 0.95);
   const p95 = sorted[Math.min(p95Idx, sorted.length - 1)] ?? median;
   const margin = ciMargin(clean);
diff --git a/examples/benchmark-react/src/data-client/index.tsx b/examples/benchmark-react/src/data-client/index.tsx
index 97c435c2c876..cf1af16ad3af 100644
--- a/examples/benchmark-react/src/data-client/index.tsx
+++ b/examples/benchmark-react/src/data-client/index.tsx
@@ -29,7 +29,7 @@ import {
   IssueResource,
   sortedIssuesEndpoint,
 } from '@shared/resources';
-import { getIssue, patchIssue } from '@shared/server';
+import { patchIssue } from '@shared/server';
 import type { Issue } from '@shared/types';
 import React, { useCallback, useRef } from 'react';
 
@@ -222,10 +222,11 @@ function BenchmarkHarness() {
 
   const invalidateAndResolve = useCallback(
     async (number: number) => {
-      const issue = await getIssue(number);
-      if (issue) {
-        await patchIssue(number, { title: `${issue.title} (refetched)` });
-      }
+      const issue = FIXTURE_ISSUES_BY_NUMBER.get(number);
+      if (!issue) return;
+      const v = ++mutationCounter;
+      const expected = `${issue.title} (v${v})`;
+      await patchIssue(number, { title: expected });
       measureUpdate(
         () => {
           if (doubleListCount != null) {
@@ -243,7 +244,7 @@ function BenchmarkHarness() {
           const el = containerRef.current!.querySelector(
             `[data-issue-number="${number}"] [data-title]`,
           );
-          return el?.textContent?.includes('(refetched)') ?? false;
+          return el?.textContent === expected;
         },
       );
     },
diff --git a/examples/benchmark-react/src/shared/types.ts b/examples/benchmark-react/src/shared/types.ts
index fdabce92d382..bdde003695d8 100644
--- a/examples/benchmark-react/src/shared/types.ts
+++ b/examples/benchmark-react/src/shared/types.ts
@@ -153,6 +153,8 @@ export interface Scenario {
   preMountAction?: keyof BenchAPI;
   /** Result is deterministic (zero variance); run exactly once with no warmup. */
   deterministic?: boolean;
+  /** Override the default sub-iterations per page visit for this scenario. */
+  opsPerRound?: number;
   /** Cap DOM rendering to first N items while keeping all data in the store. */
   renderLimit?: number;
   /** If set, scenario applies only to these libs; dropped when any selected library is not listed. */