From bad8ca1c1f3d8a834cc63e901995e50879d1c84b Mon Sep 17 00:00:00 2001 From: Nathaniel Tucker Date: Sun, 22 Mar 2026 08:38:28 -0400 Subject: [PATCH 1/6] =?UTF-8?q?demo(benchmark-react):=20reduce=20CI=20benc?= =?UTF-8?q?hmark=20variance=20below=20=C2=B110?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Increase warmup and measurement iterations for CI (small: 5+25, large: 3+20) - Tighten convergence targets (small: 5%, large: 8%) - Switch from stddev to MAD-based CI margin for robustness against outliers - Increase inter-scenario GC settle time from 50ms to 200ms Made-with: Cursor --- examples/benchmark-react/bench/runner.ts | 2 +- examples/benchmark-react/bench/scenarios.ts | 16 +++--- examples/benchmark-react/bench/stats.ts | 58 +++++++++++++++------ 3 files changed, 52 insertions(+), 24 deletions(-) diff --git a/examples/benchmark-react/bench/runner.ts b/examples/benchmark-react/bench/runner.ts index e23ff88b3ca3..f6ed9b691653 100644 --- a/examples/benchmark-react/bench/runner.ts +++ b/examples/benchmark-react/bench/runner.ts @@ -443,7 +443,7 @@ async function runRound( try { await cdp.send('HeapProfiler.collectGarbage'); } catch {} - await page.waitForTimeout(50); + await page.waitForTimeout(200); done++; const prefix = opts.showProgress ? `[${done}/${total}] ` : ''; diff --git a/examples/benchmark-react/bench/scenarios.ts b/examples/benchmark-react/bench/scenarios.ts index ce8c756b572f..63099b594267 100644 --- a/examples/benchmark-react/bench/scenarios.ts +++ b/examples/benchmark-react/bench/scenarios.ts @@ -17,16 +17,16 @@ export interface RunProfile { export const RUN_CONFIG: Record = { small: { - warmup: 3, - minMeasurement: 5, - maxMeasurement: process.env.CI ? 10 : 20, - targetMarginPct: process.env.CI ? 15 : 10, + warmup: 5, + minMeasurement: 8, + maxMeasurement: process.env.CI ? 25 : 20, + targetMarginPct: process.env.CI ? 5 : 10, }, large: { - warmup: 1, - minMeasurement: 3, - maxMeasurement: process.env.CI ? 6 : 10, - targetMarginPct: process.env.CI ? 20 : 15, + warmup: 3, + minMeasurement: 5, + maxMeasurement: process.env.CI ? 20 : 10, + targetMarginPct: process.env.CI ? 8 : 15, }, }; diff --git a/examples/benchmark-react/bench/stats.ts b/examples/benchmark-react/bench/stats.ts index a0aad7ce618c..84751c00126e 100644 --- a/examples/benchmark-react/bench/stats.ts +++ b/examples/benchmark-react/bench/stats.ts @@ -15,6 +15,40 @@ function trimOutliers(sorted: number[]): number[] { return result.length >= 2 ? result : sorted; } +/** + * Median Absolute Deviation — robust dispersion estimator with 50% + * breakdown point. Scale factor 1.4826 makes it consistent with + * stddev for normal distributions. + */ +function scaledMAD(sorted: number[]): number { + const median = sorted[Math.floor(sorted.length / 2)]; + const deviations = sorted + .map(x => Math.abs(x - median)) + .sort((a, b) => a - b); + const mad = deviations[Math.floor(deviations.length / 2)]; + return 1.4826 * mad; +} + +/** + * Compute the 95% CI margin using MAD-based dispersion. + * Falls back to stddev when MAD is zero (all values identical + * except outliers) to avoid reporting ± 0 misleadingly. + */ +function ciMargin(clean: number[]): number { + const mad = scaledMAD(clean); + if (mad > 0) { + return 1.96 * (mad / Math.sqrt(clean.length)); + } + const mean = clean.reduce((sum, x) => sum + x, 0) / clean.length; + const stdDev = + clean.length > 1 ? + Math.sqrt( + clean.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (clean.length - 1), + ) + : 0; + return 1.96 * (stdDev / Math.sqrt(clean.length)); +} + /** * Check whether a scenario's samples have converged: 95% CI margin * is within targetMarginPct of the median. Zero-variance metrics @@ -33,19 +67,20 @@ export function isConverged( if (measured.length < minSamples) return false; const sorted = [...measured].sort((a, b) => a - b); const clean = trimOutliers(sorted); - const mean = clean.reduce((sum, x) => sum + x, 0) / clean.length; - if (mean === 0) return true; - const stdDev = Math.sqrt( - clean.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (clean.length - 1), - ); - const margin = 1.96 * (stdDev / Math.sqrt(clean.length)); - return (margin / Math.abs(mean)) * 100 <= targetMarginPct; + const median = clean[Math.floor(clean.length / 2)]; + if (median === 0) return true; + const margin = ciMargin(clean); + return (margin / Math.abs(median)) * 100 <= targetMarginPct; } /** * Compute median, p95, and approximate 95% confidence interval from samples. * Discards warmup runs, then trims IQR outliers for median and CI * computation. p95 uses the full (untrimmed) sorted data. + * + * Uses MAD (Median Absolute Deviation) instead of stddev for the CI + * margin — MAD is far more robust to heavy-tailed distributions and + * residual outliers typical of browser benchmarks. */ export function computeStats( samples: number[], @@ -61,14 +96,7 @@ export function computeStats( const median = clean[Math.floor(clean.length / 2)] ?? 0; const p95Idx = Math.floor(sorted.length * 0.95); const p95 = sorted[Math.min(p95Idx, sorted.length - 1)] ?? median; - const mean = clean.reduce((sum, x) => sum + x, 0) / clean.length; - const stdDev = - clean.length > 1 ? - Math.sqrt( - clean.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (clean.length - 1), - ) - : 0; - const margin = 1.96 * (stdDev / Math.sqrt(clean.length)); + const margin = ciMargin(clean); return { median, p95, From 3bda3f3cd35db9a3e37d6866a510c7f2d05d05c0 Mon Sep 17 00:00:00 2001 From: Nathaniel Tucker Date: Sun, 22 Mar 2026 12:39:17 -0400 Subject: [PATCH 2/6] demo(benchmark-react): add in-page sub-iterations and reduce variance - Run multiple ops per page visit (default 5), returning the median duration as one sample. Eliminates page-navigation overhead between measurements and dramatically reduces variance. - Add resetStore() to BenchAPI for clearing caches between mount sub-iterations (data-client, tanstack-query, swr). - Vary mutation data each sub-iteration (incrementing counter for titles, toggling moveItem direction) to ensure real DOM changes. - Add waitForPaint between mutation sub-iterations to prevent server-resolution renders from bleeding into the next measurement. - Report variance as percentage instead of absolute values. - Reduce warmup/minMeasurement counts since sub-iterations provide sufficient noise reduction. - Fix SWR mount sub-iterations: add revalidateOnMount + dedupingInterval: 0 to ensure fresh fetches after cache.clear(). - Update README with latest results showing ~6778% mutation throughput advantage for data-client (up from ~4442% with more accurate measurement). Made-with: Cursor --- examples/benchmark-react/README.md | 40 +-- examples/benchmark-react/bench/runner.ts | 282 ++++++++++++------ examples/benchmark-react/bench/scenarios.ts | 18 +- examples/benchmark-react/bench/stats.ts | 3 +- .../benchmark-react/src/baseline/index.tsx | 27 +- .../benchmark-react/src/data-client/index.tsx | 31 +- .../src/shared/benchHarness.tsx | 11 +- examples/benchmark-react/src/shared/types.ts | 2 + examples/benchmark-react/src/swr/index.tsx | 37 ++- .../src/tanstack-query/index.tsx | 24 +- 10 files changed, 330 insertions(+), 145 deletions(-) diff --git a/examples/benchmark-react/README.md b/examples/benchmark-react/README.md index a4c68fed2014..891e6a95b8e9 100644 --- a/examples/benchmark-react/README.md +++ b/examples/benchmark-react/README.md @@ -13,8 +13,8 @@ The repo has two benchmark suites: - **What we measure:** Wall-clock time from triggering an action (e.g. `init(100)` or `updateUser('user0')`) until a MutationObserver detects the expected DOM change in the benchmark container. Optionally we also record React Profiler commit duration and, with `BENCH_TRACE=true`, Chrome trace duration. - **Why:** Normalized caching should show wins on shared-entity updates (one store write, many components update), ref stability (fewer new object references), and derived-view memoization (`Query` schema avoids re-sorting when entities haven't changed). See [js-framework-benchmark "How the duration is measured"](https://github.com/krausest/js-framework-benchmark/wiki/How-the-duration-is-measured) for a similar timeline-based approach. -- **Statistical:** Warmup runs are discarded; we report median and 95% CI. Libraries are interleaved per round to reduce environmental variance. -- **No CPU throttling:** Runs at native speed with more samples for statistical significance rather than artificial slowdown. Small (cheap) scenarios use 3 warmup + 15 measurement runs locally (10 in CI); large (expensive) scenarios use 1 warmup + 4 measurement runs. +- **Statistical:** Warmup runs are discarded; we report median and 95% CI (as percentage of median). Libraries are interleaved per round to reduce environmental variance. Each round runs 5 sub-iterations per page visit and reports the median, further reducing per-sample noise. +- **No CPU throttling:** Runs at native speed with more samples for statistical significance rather than artificial slowdown. Small (cheap) scenarios use 3 warmup + up to 15 measurement rounds locally; large (expensive) scenarios use 2 warmup + up to 8 measurement rounds. Early stopping triggers when 95% CI margin drops below the target percentage. ## Scenario categories @@ -55,10 +55,10 @@ Illustrative **relative** results with **baseline = 100%** (plain React useState | Category | Scenarios (representative) | data-client | tanstack-query | swr | baseline | |---|---|---:|---:|---:|---:| -| Navigation | `getlist-100`, `getlist-500`, `getlist-500-sorted` | ~95% | ~97% | ~99% | **100%** | -| Navigation | `list-detail-switch-10` | **~851%** | ~233% | ~247% | 100% | -| Mutations | `update-entity`, `update-user`, `update-entity-sorted`, `update-entity-multi-view`, `unshift-item`, `delete-item`, `move-item` | **~4442%** | ~97% | ~99% | 100% | -| Scaling (10k items) | `update-user-10000` | **~6408%** | ~94% | ~100% | 100% | +| Navigation | `getlist-100`, `getlist-500`, `getlist-500-sorted` | ~98% | ~100% | ~100% | **100%** | +| Navigation | `list-detail-switch-10` | **~1354%** | ~233% | ~260% | 100% | +| Mutations | `update-entity`, `update-user`, `update-entity-sorted`, `update-entity-multi-view`, `unshift-item`, `delete-item`, `move-item` | **~6778%** | ~97% | ~99% | 100% | +| Scaling (10k items) | `update-user-10000` | **~9713%** | ~94% | ~100% | 100% | ## Latest measured results (network simulation on) @@ -70,19 +70,19 @@ Run: **2026-03-22**, Linux (WSL2), `yarn build:benchmark-react`, static preview | Scenario | data-client | tanstack-query | swr | baseline | |---|---:|---:|---:|---:| | **Navigation** | | | | | -| `getlist-100` | 18.48 ± 0.02 | 18.62 ± 0.07 | 19.12 ± 0.02 | 19.34 ± 0.09 | -| `getlist-500` | 11.45 ± 0.21 | 11.92 ± 0.18 | 11.96 ± 0.04 | 12.06 ± 0.08 | -| `getlist-500-sorted` | 11.48 ± 0.39 | 11.81 ± 0.22 | 12.00 ± 0.34 | 12.08 ± 0.37 | -| `list-detail-switch-10` | 6.13 ± 0.74 | 1.68 ± 0.07 | 1.78 ± 0.12 | 0.72 ± 0.00 | +| `getlist-100` | 20.28 ± 0.3% | 20.58 ± 1.2% | 20.58 ± 0.3% | 20.62 ± 0.3% | +| `getlist-500` | 12.41 ± 0.4% | 12.61 ± 0.2% | 12.59 ± 0.2% | 12.63 ± 0.2% | +| `getlist-500-sorted` | 12.55 ± 0.2% | 12.67 ± 1.3% | 12.72 ± 0.4% | 12.79 ± 0.4% | +| `list-detail-switch-10` | 9.75 ± 1.7% | 1.68 ± 0.2% | 1.87 ± 0.9% | 0.72 ± 0.0% | | **Mutations** | | | | | -| `update-entity` | 333.33 ± 4.22 | 6.95 ± 0.00 | 6.94 ± 0.02 | 7.17 ± 0.00 | -| `update-user` | 322.58 ± 11.79 | 6.97 ± 0.01 | 7.15 ± 0.00 | 7.15 ± 0.02 | -| `update-entity-sorted` | 285.71 ± 30.41 | 7.04 ± 0.01 | 7.05 ± 0.02 | 7.23 ± 0.01 | -| `update-entity-multi-view` | 344.83 ± 16.69 | 5.89 ± 0.77 | 5.89 ± 0.82 | 5.97 ± 0.05 | -| `update-user-10000` | 98.04 ± 5.79 | 1.44 ± 0.01 | 1.53 ± 0.00 | 1.53 ± 0.01 | -| `unshift-item` | 285.71 ± 11.11 | 6.89 ± 0.02 | 7.11 ± 0.01 | 7.11 ± 0.01 | -| `delete-item` | 312.50 ± 14.76 | 6.87 ± 0.01 | 7.09 ± 0.01 | 7.10 ± 0.00 | -| `move-item` | 256.41 ± 8.77 | 6.34 ± 0.06 | 6.80 ± 0.01 | 6.77 ± 0.01 | +| `update-entity` | 555.56 ± 2.9% | 7.00 ± 0.3% | 6.98 ± 0.1% | 7.18 ± 0.2% | +| `update-user` | 625.00 ± 11.2% | 6.95 ± 0.1% | 7.15 ± 0.2% | 7.17 ± 0.2% | +| `update-entity-sorted` | 476.19 ± 0.0% | 7.06 ± 0.3% | 7.06 ± 0.0% | 7.24 ± 0.0% | +| `update-entity-multi-view` | 500.00 ± 3.4% | 7.05 ± 0.2% | 7.09 ± 0.1% | 7.25 ± 0.1% | +| `update-user-10000` | 151.52 ± 2.8% | 1.47 ± 0.0% | 1.56 ± 0.1% | 1.56 ± 0.3% | +| `unshift-item` | 434.78 ± 6.1% | 6.91 ± 0.2% | 7.13 ± 0.2% | 7.15 ± 0.3% | +| `delete-item` | 526.32 ± 2.7% | 6.89 ± 0.0% | 7.13 ± 0.4% | 7.12 ± 0.0% | +| `move-item` | 277.78 ± 3.9% | 6.52 ± 0.2% | 6.98 ± 0.6% | 6.87 ± 0.2% | [Measured on a Ryzen 9 7950X; 64 GB RAM; Ubuntu (WSL2); Node 24.12.0; Chromium (Playwright)] @@ -189,8 +189,8 @@ Regressions >5% on stable scenarios or >15% on volatile scenarios are worth inve Scenarios are classified as `small` or `large` based on their cost: - - **Small** (3 warmup + 15 measurement): `getlist-100`, `update-entity`, `ref-stability-*`, `invalidate-and-resolve`, `unshift-item`, `delete-item` - - **Large** (1 warmup + 4 measurement): `getlist-500`, `getlist-500-sorted`, `update-user`, `update-user-10000`, `update-entity-sorted`, `update-entity-multi-view`, `list-detail-switch-10` + - **Small** (3 warmup + 4–15 measurement): `getlist-100`, `update-entity`, `ref-stability-*`, `invalidate-and-resolve`, `unshift-item`, `delete-item` + - **Large** (2 warmup + 3–8 measurement): `getlist-500`, `getlist-500-sorted`, `update-user`, `update-user-10000`, `update-entity-sorted`, `update-entity-multi-view`, `list-detail-switch-10` - **Memory** (opt-in, 1 warmup + 3 measurement): `memory-mount-unmount-cycle` — run with `--action memory` When running all scenarios (`yarn bench`), each group runs with its own warmup/measurement count. Use `--size` to run only one group. diff --git a/examples/benchmark-react/bench/runner.ts b/examples/benchmark-react/bench/runner.ts index f6ed9b691653..c9f0ac9e507f 100644 --- a/examples/benchmark-react/bench/runner.ts +++ b/examples/benchmark-react/bench/runner.ts @@ -26,6 +26,7 @@ function parseArgs(): { actions?: string[]; scenario?: string; networkSim: boolean; + opsPerRound?: number; } { const argv = process.argv.slice(2); const get = (flag: string, envVar: string): string | undefined => { @@ -39,6 +40,7 @@ function parseArgs(): { const actionRaw = get('--action', 'BENCH_ACTION'); const scenarioRaw = get('--scenario', 'BENCH_SCENARIO'); const networkSimRaw = get('--network-sim', 'BENCH_NETWORK_SIM'); + const opsRaw = get('--ops-per-round', 'BENCH_OPS_PER_ROUND'); const libs = libRaw ? libRaw.split(',').map(s => s.trim()) : undefined; const size = sizeRaw === 'small' || sizeRaw === 'large' ? sizeRaw : undefined; @@ -46,14 +48,23 @@ function parseArgs(): { actionRaw ? actionRaw.split(',').map(s => s.trim()) : undefined; const networkSim = networkSimRaw != null ? networkSimRaw !== 'false' : !process.env.CI; + const opsPerRound = opsRaw ? parseInt(opsRaw, 10) : undefined; - return { libs, size, actions, scenario: scenarioRaw, networkSim }; + return { + libs, + size, + actions, + scenario: scenarioRaw, + networkSim, + opsPerRound, + }; } function filterScenarios(scenarios: Scenario[]): { filtered: Scenario[]; libraries: string[]; networkSim: boolean; + opsPerRound?: number; } { const { libs, @@ -61,6 +72,7 @@ function filterScenarios(scenarios: Scenario[]): { actions, scenario: scenarioFilter, networkSim, + opsPerRound, } = parseArgs(); const libraries = libs ?? (process.env.CI ? ['data-client'] : [...LIBRARIES]); @@ -114,7 +126,7 @@ function filterScenarios(scenarios: Scenario[]): { !s.onlyLibs?.length || libraries.every(lib => s.onlyLibs!.includes(lib)), ); - return { filtered, libraries, networkSim }; + return { filtered, libraries, networkSim, opsPerRound }; } // --------------------------------------------------------------------------- @@ -200,6 +212,7 @@ async function runScenario( ); } + // --- Memory path (unchanged, always ops=1) --- const isMemory = scenario.action === 'mountUnmountCycle' && scenario.resultMetric === 'heapDelta'; @@ -236,17 +249,17 @@ async function runScenario( return { value: heapAfter - heapBefore }; } - const isUpdate = - scenario.action === 'updateEntity' || - scenario.action === 'updateEntityMultiView' || - scenario.action === 'updateUser' || - scenario.action === 'invalidateAndResolve' || - scenario.action === 'unshiftItem' || - scenario.action === 'deleteEntity' || - scenario.action === 'moveItem'; - const isRefStability = isRefStabilityScenario(scenario); + // --- Classify scenario --- const isInit = scenario.action === 'init'; + const isMountLike = + isInit || + scenario.action === 'mountSortedView' || + scenario.action === 'initDoubleList' || + scenario.action === 'listDetailSwitch'; + const isUpdate = !isMountLike; + const isRefStability = isRefStabilityScenario(scenario); + // --- Pre-mount for update/ref-stability scenarios (once) --- const mountCount = scenario.mountCount ?? 100; if (isUpdate || isRefStability) { const preMountAction = scenario.preMountAction ?? 'init'; @@ -277,102 +290,175 @@ async function runScenario( }); } + // --- Ref stability (deterministic, single run, early return) --- if (isRefStability) { await (bench as any).evaluate((api: any) => api.captureRefSnapshot()); - } - await harness.evaluate(el => { - el.removeAttribute('data-bench-complete'); - el.removeAttribute('data-bench-timeout'); - }); - const cdpTracing = - USE_TRACE && !isRefStability ? - await page.context().newCDPSession(page) - : undefined; - const traceChunks: object[] = []; - if (cdpTracing) { - cdpTracing.on('Tracing.dataCollected', (params: { value: object[] }) => { - traceChunks.push(...params.value); + await harness.evaluate(el => { + el.removeAttribute('data-bench-complete'); + el.removeAttribute('data-bench-timeout'); }); - await cdpTracing.send('Tracing.start', { - categories: 'devtools.timeline,blink', + await page.evaluate(() => { + performance.clearMarks(); + performance.clearMeasures(); }); - } - - await page.evaluate(() => { - performance.clearMarks(); - performance.clearMeasures(); - }); - await (bench as any).evaluate( - (api: any, { action, args }: { action: string; args: unknown[] }) => { - api[action](...args); - }, - { action: scenario.action, args: scenario.args }, - ); + await (bench as any).evaluate( + (api: any, { action, args }: { action: string; args: unknown[] }) => { + api[action](...args); + }, + { action: scenario.action, args: scenario.args }, + ); - const completeTimeout = networkSim ? 60000 : 10000; - await page.waitForSelector('[data-bench-complete]', { - timeout: completeTimeout, - state: 'attached', - }); + const completeTimeout = networkSim ? 60000 : 10000; + await page.waitForSelector('[data-bench-complete]', { + timeout: completeTimeout, + state: 'attached', + }); + const timedOut = await harness.evaluate(el => + el.hasAttribute('data-bench-timeout'), + ); + if (timedOut) { + throw new Error( + `Harness timeout: MutationObserver did not detect expected DOM update within 30 s`, + ); + } - const timedOut = await harness.evaluate(el => - el.hasAttribute('data-bench-timeout'), - ); - if (timedOut) { - throw new Error( - `Harness timeout: MutationObserver did not detect expected DOM update within 30 s`, + const report = await (bench as any).evaluate((api: any) => + api.getRefStabilityReport(), ); + await bench.dispose(); + return { value: report[scenario.resultMetric!] as number }; } - await (bench as any).evaluate((api: any) => api.flushPendingMutations()); + // --- Sub-iteration loop --- + const ops = effectiveOpsPerRound(scenario); + const durations: number[] = []; + const commitTimes: number[] = []; + const traceDurations: number[] = []; + const traceSubIdx = Math.floor(ops / 2); + + for (let subIdx = 0; subIdx < ops; subIdx++) { + // Mount scenarios: unmount + detach + resetStore + waitForPaint (skip first iteration — nothing mounted yet) + if (isMountLike && subIdx > 0) { + await (bench as any).evaluate((api: any) => api.unmountAll()); + await page + .waitForSelector('[data-bench-item], [data-sorted-list]', { + state: 'detached', + timeout: 10000, + }) + .catch(() => {}); + await (bench as any).evaluate((api: any) => { + if (api.resetStore) api.resetStore(); + }); + await page.evaluate( + () => + new Promise(r => + requestAnimationFrame(() => requestAnimationFrame(() => r())), + ), + ); + } - let traceDuration: number | undefined; - if (cdpTracing) { - try { - const done = new Promise(resolve => { - cdpTracing!.on('Tracing.tracingComplete', () => resolve()); + // Mutation scenarios: flush pending from prior sub-iteration + let React commit the resolution + if (isUpdate && subIdx > 0) { + await (bench as any).evaluate((api: any) => api.flushPendingMutations()); + await page.evaluate( + () => + new Promise(r => + requestAnimationFrame(() => requestAnimationFrame(() => r())), + ), + ); + } + + // Clear perf marks/measures + reset harness flags + await page.evaluate(() => { + performance.clearMarks(); + performance.clearMeasures(); + }); + await harness.evaluate(el => { + el.removeAttribute('data-bench-complete'); + el.removeAttribute('data-bench-timeout'); + }); + + // Chrome tracing: only for the middle sub-iteration + const shouldTrace = USE_TRACE && subIdx === traceSubIdx; + let cdpTracing: CDPSession | undefined; + const traceChunks: object[] = []; + if (shouldTrace) { + cdpTracing = await page.context().newCDPSession(page); + cdpTracing.on('Tracing.dataCollected', (params: { value: object[] }) => { + traceChunks.push(...params.value); + }); + await cdpTracing.send('Tracing.start', { + categories: 'devtools.timeline,blink', }); - await cdpTracing.send('Tracing.end'); - await done; - const traceJson = - '[\n' + traceChunks.map(e => JSON.stringify(e)).join(',\n') + '\n]'; - traceDuration = parseTraceDuration(Buffer.from(traceJson)); - } catch { - traceDuration = undefined; - } finally { - await cdpTracing.detach().catch(() => {}); } - } - if (isRefStability && scenario.resultMetric) { - const report = await (bench as any).evaluate((api: any) => - api.getRefStabilityReport(), + // Execute action (vary args for deleteEntity across sub-iterations) + const actionArgs = + scenario.action === 'deleteEntity' ? [subIdx + 1] : scenario.args; + await (bench as any).evaluate( + (api: any, { action, args }: { action: string; args: unknown[] }) => { + api[action](...args); + }, + { action: scenario.action, args: actionArgs }, ); - await bench.dispose(); - return { value: report[scenario.resultMetric] as number }; - } - const measures = await collectMeasures(page); - const isMountLike = - isInit || - scenario.action === 'mountSortedView' || - scenario.action === 'initDoubleList' || - scenario.action === 'listDetailSwitch'; - const duration = - isMountLike ? - getMeasureDuration(measures, 'mount-duration') - : getMeasureDuration(measures, 'update-duration'); - // Both mount-like and update scenarios trigger state updates (setItems/etc.), - // so React Profiler always fires with phase: 'update' for the measured action. - const reactCommit = getMeasureDuration(measures, 'react-commit-update'); + // Wait for completion + const completeTimeout = networkSim ? 60000 : 10000; + await page.waitForSelector('[data-bench-complete]', { + timeout: completeTimeout, + state: 'attached', + }); + const timedOut = await harness.evaluate(el => + el.hasAttribute('data-bench-timeout'), + ); + if (timedOut) { + throw new Error( + `Harness timeout: MutationObserver did not detect expected DOM update within 30 s`, + ); + } + + await (bench as any).evaluate((api: any) => api.flushPendingMutations()); + + // Collect trace + let traceDuration: number | undefined; + if (shouldTrace && cdpTracing) { + try { + const done = new Promise(resolve => { + cdpTracing!.on('Tracing.tracingComplete', () => resolve()); + }); + await cdpTracing.send('Tracing.end'); + await done; + const traceJson = + '[\n' + traceChunks.map(e => JSON.stringify(e)).join(',\n') + '\n]'; + traceDuration = parseTraceDuration(Buffer.from(traceJson)); + } catch { + traceDuration = undefined; + } finally { + await cdpTracing.detach().catch(() => {}); + } + } + + // Collect performance measures + const measures = await collectMeasures(page); + const duration = + isMountLike ? + getMeasureDuration(measures, 'mount-duration') + : getMeasureDuration(measures, 'update-duration'); + const reactCommit = getMeasureDuration(measures, 'react-commit-update'); + + durations.push(duration); + if (reactCommit > 0) commitTimes.push(reactCommit); + if (traceDuration != null) traceDurations.push(traceDuration); + } await bench.dispose(); return { - value: duration, - reactCommit: reactCommit > 0 ? reactCommit : undefined, - traceDuration, + value: simpleMedian(durations), + reactCommit: commitTimes.length > 0 ? simpleMedian(commitTimes) : undefined, + traceDuration: + traceDurations.length > 0 ? simpleMedian(traceDurations) : undefined, }; } @@ -380,6 +466,22 @@ async function runScenario( // Helpers // --------------------------------------------------------------------------- +function effectiveOpsPerRound(scenario: Scenario): number { + if (scenario.deterministic) return 1; + if (scenario.category === 'memory') return 1; + if (scenario.action === 'listDetailSwitch') return 1; + return RUN_CONFIG[scenario.size ?? 'small'].opsPerRound; +} + +function simpleMedian(arr: number[]): number { + if (arr.length === 0) return 0; + const sorted = [...arr].sort((a, b) => a - b); + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 === 0 ? + (sorted[mid - 1] + sorted[mid]) / 2 + : sorted[mid]; +} + function shuffle(arr: T[]): T[] { const out = [...arr]; for (let i = out.length - 1; i > 0; i--) { @@ -484,8 +586,14 @@ async function main() { filtered: SCENARIOS_TO_RUN, libraries, networkSim, + opsPerRound, } = filterScenarios(SCENARIOS); + if (opsPerRound != null) { + RUN_CONFIG.small.opsPerRound = opsPerRound; + RUN_CONFIG.large.opsPerRound = opsPerRound; + } + if (networkSim) { process.stderr.write('Network simulation: ON\n'); } diff --git a/examples/benchmark-react/bench/scenarios.ts b/examples/benchmark-react/bench/scenarios.ts index 63099b594267..aa1b1a8f345c 100644 --- a/examples/benchmark-react/bench/scenarios.ts +++ b/examples/benchmark-react/bench/scenarios.ts @@ -13,20 +13,26 @@ export interface RunProfile { maxMeasurement: number; /** Stop early when 95% CI margin is within this % of the median. */ targetMarginPct: number; + /** Sub-iterations per page visit; median of N is returned as one sample. */ + opsPerRound: number; } +const defaultOpsPerRound = parseInt(process.env.BENCH_OPS_PER_ROUND ?? '5', 10); + export const RUN_CONFIG: Record = { small: { - warmup: 5, - minMeasurement: 8, - maxMeasurement: process.env.CI ? 25 : 20, + warmup: 3, + minMeasurement: 4, + maxMeasurement: process.env.CI ? 20 : 15, targetMarginPct: process.env.CI ? 5 : 10, + opsPerRound: defaultOpsPerRound, }, large: { - warmup: 3, - minMeasurement: 5, - maxMeasurement: process.env.CI ? 20 : 10, + warmup: 2, + minMeasurement: 3, + maxMeasurement: process.env.CI ? 15 : 8, targetMarginPct: process.env.CI ? 8 : 15, + opsPerRound: defaultOpsPerRound, }, }; diff --git a/examples/benchmark-react/bench/stats.ts b/examples/benchmark-react/bench/stats.ts index 84751c00126e..56667529e8ce 100644 --- a/examples/benchmark-react/bench/stats.ts +++ b/examples/benchmark-react/bench/stats.ts @@ -97,9 +97,10 @@ export function computeStats( const p95Idx = Math.floor(sorted.length * 0.95); const p95 = sorted[Math.min(p95Idx, sorted.length - 1)] ?? median; const margin = ciMargin(clean); + const pct = median !== 0 ? (margin / Math.abs(median)) * 100 : 0; return { median, p95, - range: `± ${margin.toFixed(2)}`, + range: `± ${pct.toFixed(1)}%`, }; } diff --git a/examples/benchmark-react/src/baseline/index.tsx b/examples/benchmark-react/src/baseline/index.tsx index 3011a175811c..4a5185f34e55 100644 --- a/examples/benchmark-react/src/baseline/index.tsx +++ b/examples/benchmark-react/src/baseline/index.tsx @@ -26,7 +26,15 @@ import { deleteIssue, } from '@shared/server'; import type { Issue } from '@shared/types'; -import React, { useCallback, useEffect, useMemo, useState } from 'react'; +import React, { + useCallback, + useEffect, + useMemo, + useRef, + useState, +} from 'react'; + +let mutationCounter = 0; function SortedListView({ limit, @@ -188,10 +196,11 @@ function BenchmarkHarness() { (number: number) => { const issue = FIXTURE_ISSUES_BY_NUMBER.get(number); if (!issue) return; + const v = ++mutationCounter; measureUpdate(() => updateIssue({ number, - title: `${issue.title} (updated)`, + title: `${issue.title} (v${v})`, }).then(triggerRefetch), ); }, @@ -202,10 +211,11 @@ function BenchmarkHarness() { (login: string) => { const user = FIXTURE_USERS_BY_LOGIN.get(login); if (!user) return; + const v = ++mutationCounter; measureUpdate(() => serverUpdateUser({ login, - name: `${user.name} (updated)`, + name: `${user.name} (v${v})`, }).then(triggerRefetch), ); }, @@ -226,11 +236,15 @@ function BenchmarkHarness() { [measureUpdate, triggerRefetch], ); + const moveStateRef = useRef<'open' | 'closed'>('closed'); + const moveItem = useCallback( (number: number) => { + const targetState = moveStateRef.current; + moveStateRef.current = targetState === 'closed' ? 'open' : 'closed'; measureUpdate( - () => updateIssue({ number, state: 'closed' }).then(triggerRefetch), - () => moveItemIsReady(containerRef, number), + () => updateIssue({ number, state: targetState }).then(triggerRefetch), + () => moveItemIsReady(containerRef, number, targetState), ); }, [measureUpdate, triggerRefetch, containerRef], @@ -240,7 +254,8 @@ function BenchmarkHarness() { (number: number) => { const issue = FIXTURE_ISSUES_BY_NUMBER.get(number); if (!issue) return; - const expected = `${issue.title} (updated)`; + const v = ++mutationCounter; + const expected = `${issue.title} (v${v})`; measureUpdate( () => updateIssue({ number, title: expected }).then(triggerRefetch), () => { diff --git a/examples/benchmark-react/src/data-client/index.tsx b/examples/benchmark-react/src/data-client/index.tsx index 6c2c57260201..97c435c2c876 100644 --- a/examples/benchmark-react/src/data-client/index.tsx +++ b/examples/benchmark-react/src/data-client/index.tsx @@ -31,7 +31,9 @@ import { } from '@shared/resources'; import { getIssue, patchIssue } from '@shared/server'; import type { Issue } from '@shared/types'; -import React, { useCallback } from 'react'; +import React, { useCallback, useRef } from 'react'; + +let mutationCounter = 0; /** GCPolicy with no interval (won't fire during timing scenarios) and instant * expiry so an explicit sweep() collects all unreferenced data immediately. */ @@ -147,11 +149,12 @@ function BenchmarkHarness() { (number: number) => { const issue = FIXTURE_ISSUES_BY_NUMBER.get(number); if (!issue) return; + const v = ++mutationCounter; measureUpdate(() => { controller.fetch( IssueResource.update, { number }, - { title: `${issue.title} (updated)` }, + { title: `${issue.title} (v${v})` }, ); }); }, @@ -162,11 +165,12 @@ function BenchmarkHarness() { (login: string) => { const user = FIXTURE_USERS_BY_LOGIN.get(login); if (!user) return; + const v = ++mutationCounter; measureUpdate(() => { controller.fetch( UserResource.update, { login }, - { name: `${user.name} (updated)` }, + { name: `${user.name} (v${v})` }, ); }); }, @@ -196,13 +200,21 @@ function BenchmarkHarness() { [measureUpdate, controller], ); + const moveStateRef = useRef<'open' | 'closed'>('closed'); + const moveItem = useCallback( (number: number) => { + const targetState = moveStateRef.current; + moveStateRef.current = targetState === 'closed' ? 'open' : 'closed'; measureUpdate( () => { - controller.fetch(IssueResource.move, { number }, { state: 'closed' }); + controller.fetch( + IssueResource.move, + { number }, + { state: targetState }, + ); }, - () => moveItemIsReady(containerRef, number), + () => moveItemIsReady(containerRef, number, targetState), ); }, [measureUpdate, controller, containerRef], @@ -242,7 +254,8 @@ function BenchmarkHarness() { (number: number) => { const issue = FIXTURE_ISSUES_BY_NUMBER.get(number); if (!issue) return; - const expected = `${issue.title} (updated)`; + const v = ++mutationCounter; + const expected = `${issue.title} (v${v})`; measureUpdate( () => { controller.fetch( @@ -271,6 +284,11 @@ function BenchmarkHarness() { [measureUpdate, controller, containerRef], ); + const resetStore = useCallback( + () => controller.resetEntireStore(), + [controller], + ); + registerAPI({ updateEntity, updateUser, @@ -280,6 +298,7 @@ function BenchmarkHarness() { deleteEntity, moveItem, triggerGC: () => benchGC.sweep(), + resetStore, }); return ( diff --git a/examples/benchmark-react/src/shared/benchHarness.tsx b/examples/benchmark-react/src/shared/benchHarness.tsx index 8661839db897..00c7341a6db5 100644 --- a/examples/benchmark-react/src/shared/benchHarness.tsx +++ b/examples/benchmark-react/src/shared/benchHarness.tsx @@ -36,16 +36,18 @@ const OBSERVE_MUTATIONS: MutationObserverInit = { characterData: true, }; -/** Check whether an issue has moved from the "open" to the "closed" state list. */ +/** Check whether an issue has moved to the target state list and left the source. */ export function moveItemIsReady( containerRef: React.RefObject, number: number, + targetState: 'open' | 'closed' = 'closed', ): boolean { + const sourceState = targetState === 'closed' ? 'open' : 'closed'; const source = containerRef.current?.querySelector( - '[data-state-list="open"]', + `[data-state-list="${sourceState}"]`, ); const dest = containerRef.current?.querySelector( - '[data-state-list="closed"]', + `[data-state-list="${targetState}"]`, ); return ( source?.querySelector(`[data-issue-number="${number}"]`) == null && @@ -324,6 +326,8 @@ export function useBenchState() { * Libraries only pass their own actions + any overrides; standard actions * (init, unmountAll, etc.) are included automatically. */ + const resetStoreNoop = useCallback(() => {}, []); + const registerAPI = (libraryActions: LibraryActions) => { apiRef.current = { init, @@ -340,6 +344,7 @@ export function useBenchState() { setNetworkSim, flushPendingMutations, setRenderLimit, + resetStore: resetStoreNoop, ...libraryActions, } as BenchAPI; }; diff --git a/examples/benchmark-react/src/shared/types.ts b/examples/benchmark-react/src/shared/types.ts index 6bc9e48221e9..fdabce92d382 100644 --- a/examples/benchmark-react/src/shared/types.ts +++ b/examples/benchmark-react/src/shared/types.ts @@ -55,6 +55,8 @@ export interface BenchAPI { triggerGC?(): void; /** Cap DOM rendering to the first N items while keeping all data in the store. */ setRenderLimit?(n: number | undefined): void; + /** Clear client-side cache/store so the next mount triggers a fresh fetch. Called between sub-iterations for mount scenarios. */ + resetStore?(): void; } declare global { diff --git a/examples/benchmark-react/src/swr/index.tsx b/examples/benchmark-react/src/swr/index.tsx index f02f6677140e..e83c0ef8d9c2 100644 --- a/examples/benchmark-react/src/swr/index.tsx +++ b/examples/benchmark-react/src/swr/index.tsx @@ -19,9 +19,11 @@ import { import { setCurrentIssues } from '@shared/refStability'; import { UserResource, IssueResource } from '@shared/resources'; import type { Issue } from '@shared/types'; -import React, { useCallback, useMemo } from 'react'; +import React, { useCallback, useMemo, useRef } from 'react'; import useSWR, { SWRConfig, useSWRConfig } from 'swr'; +let mutationCounter = 0; + /** SWR fetcher: dispatches to shared resource fetch methods based on cache key */ const fetcher = (key: string): Promise => { if (key.startsWith('issue:')) @@ -134,10 +136,11 @@ function BenchmarkHarness() { (number: number) => { const issue = FIXTURE_ISSUES_BY_NUMBER.get(number); if (!issue) return; + const v = ++mutationCounter; measureUpdate(() => IssueResource.update( { number }, - { title: `${issue.title} (updated)` }, + { title: `${issue.title} (v${v})` }, ).then(() => mutate(key => typeof key === 'string' && key.startsWith('issues:')), ), @@ -150,13 +153,14 @@ function BenchmarkHarness() { (login: string) => { const user = FIXTURE_USERS_BY_LOGIN.get(login); if (!user) return; + const v = ++mutationCounter; measureUpdate( () => - UserResource.update( - { login }, - { name: `${user.name} (updated)` }, - ).then(() => - mutate(key => typeof key === 'string' && key.startsWith('issues:')), + UserResource.update({ login }, { name: `${user.name} (v${v})` }).then( + () => + mutate( + key => typeof key === 'string' && key.startsWith('issues:'), + ), ) as Promise, ); }, @@ -183,14 +187,18 @@ function BenchmarkHarness() { [measureUpdate, mutate], ); + const moveStateRef = useRef<'open' | 'closed'>('closed'); + const moveItem = useCallback( (number: number) => { + const targetState = moveStateRef.current; + moveStateRef.current = targetState === 'closed' ? 'open' : 'closed'; measureUpdate( () => - IssueResource.update({ number }, { state: 'closed' }).then(() => + IssueResource.update({ number }, { state: targetState }).then(() => mutate(key => typeof key === 'string' && key.startsWith('issues:')), ), - () => moveItemIsReady(containerRef, number), + () => moveItemIsReady(containerRef, number, targetState), ); }, [measureUpdate, mutate, containerRef], @@ -200,7 +208,8 @@ function BenchmarkHarness() { (number: number) => { const issue = FIXTURE_ISSUES_BY_NUMBER.get(number); if (!issue) return; - const expected = `${issue.title} (updated)`; + const v = ++mutationCounter; + const expected = `${issue.title} (v${v})`; measureUpdate( () => IssueResource.update({ number }, { title: expected }).then(() => @@ -230,6 +239,11 @@ function BenchmarkHarness() { [measureUpdate, mutate, containerRef], ); + const { cache } = useSWRConfig(); + const resetStore = useCallback(() => { + if (typeof (cache as any).clear === 'function') (cache as any).clear(); + }, [cache]); + registerAPI({ updateEntity, updateUser, @@ -237,6 +251,7 @@ function BenchmarkHarness() { unshiftItem, deleteEntity, moveItem, + resetStore, }); return ( @@ -261,6 +276,8 @@ function BenchProvider({ children }: { children: React.ReactNode }) { revalidateOnFocus: false, revalidateOnReconnect: false, revalidateIfStale: false, + revalidateOnMount: true, + dedupingInterval: 0, }} > {children} diff --git a/examples/benchmark-react/src/tanstack-query/index.tsx b/examples/benchmark-react/src/tanstack-query/index.tsx index 71d7b45855b8..f13a7a268358 100644 --- a/examples/benchmark-react/src/tanstack-query/index.tsx +++ b/examples/benchmark-react/src/tanstack-query/index.tsx @@ -25,7 +25,9 @@ import { useQuery, useQueryClient, } from '@tanstack/react-query'; -import React, { useCallback, useMemo } from 'react'; +import React, { useCallback, useMemo, useRef } from 'react'; + +let mutationCounter = 0; function queryFn({ queryKey }: { queryKey: readonly unknown[] }): Promise { const [type, id] = queryKey as [string, string | number | undefined]; @@ -160,10 +162,11 @@ function BenchmarkHarness() { (number: number) => { const issue = FIXTURE_ISSUES_BY_NUMBER.get(number); if (!issue) return; + const v = ++mutationCounter; measureUpdate(() => IssueResource.update( { number }, - { title: `${issue.title} (updated)` }, + { title: `${issue.title} (v${v})` }, ).then(() => client.invalidateQueries({ queryKey: ['issues'], @@ -178,8 +181,9 @@ function BenchmarkHarness() { (login: string) => { const user = FIXTURE_USERS_BY_LOGIN.get(login); if (!user) return; + const v = ++mutationCounter; measureUpdate(() => - UserResource.update({ login }, { name: `${user.name} (updated)` }).then( + UserResource.update({ login }, { name: `${user.name} (v${v})` }).then( () => client.invalidateQueries({ queryKey: ['issues'], @@ -212,14 +216,18 @@ function BenchmarkHarness() { [measureUpdate, client], ); + const moveStateRef = useRef<'open' | 'closed'>('closed'); + const moveItem = useCallback( (number: number) => { + const targetState = moveStateRef.current; + moveStateRef.current = targetState === 'closed' ? 'open' : 'closed'; measureUpdate( () => - IssueResource.update({ number }, { state: 'closed' }).then(() => + IssueResource.update({ number }, { state: targetState }).then(() => client.invalidateQueries({ queryKey: ['issues'] }), ), - () => moveItemIsReady(containerRef, number), + () => moveItemIsReady(containerRef, number, targetState), ); }, [measureUpdate, client, containerRef], @@ -229,7 +237,8 @@ function BenchmarkHarness() { (number: number) => { const issue = FIXTURE_ISSUES_BY_NUMBER.get(number); if (!issue) return; - const expected = `${issue.title} (updated)`; + const v = ++mutationCounter; + const expected = `${issue.title} (v${v})`; measureUpdate( () => IssueResource.update({ number }, { title: expected }).then(() => @@ -258,6 +267,8 @@ function BenchmarkHarness() { [measureUpdate, client, containerRef], ); + const resetStore = useCallback(() => queryClient.clear(), []); + registerAPI({ updateEntity, updateUser, @@ -265,6 +276,7 @@ function BenchmarkHarness() { unshiftItem, deleteEntity, moveItem, + resetStore, }); return ( From 8598d7250fbfb2830b730e32f6a12744cac22b99 Mon Sep 17 00:00:00 2001 From: Nathaniel Tucker Date: Sun, 22 Mar 2026 12:55:41 -0400 Subject: [PATCH 3/6] fix: bugbot --- examples/benchmark-react/bench/validate.ts | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/benchmark-react/bench/validate.ts b/examples/benchmark-react/bench/validate.ts index 427d13293b7c..3006c11a896c 100644 --- a/examples/benchmark-react/bench/validate.ts +++ b/examples/benchmark-react/bench/validate.ts @@ -187,19 +187,19 @@ test('updateEntity changes issue title in DOM', async (page, lib) => { await waitFor( page, - async () => (await getIssueTitles(page))[1]?.includes('(updated)') ?? false, - 'issue #1 title contains "(updated)"', + async () => (await getIssueTitles(page))[1]?.includes('(v') ?? false, + 'issue #1 title contains "(v"', ); const titles = await getIssueTitles(page); assert( - titles[1]?.includes('(updated)'), + titles[1]?.includes('(v'), lib, 'updateEntity', - `issue #1 should contain "(updated)", got "${titles[1]}"`, + `issue #1 should contain "(v…)", got "${titles[1]}"`, ); assert( - !titles[2]?.includes('(updated)'), + !titles[2]?.includes('(v'), lib, 'updateEntity unchanged', `issue #2 should be unchanged, got "${titles[2]}"`, @@ -239,7 +239,7 @@ test('ref-stability after updateEntity', async (page, lib) => { await waitFor( page, - async () => (await getIssueTitles(page))[1]?.includes('(updated)') ?? false, + async () => (await getIssueTitles(page))[1]?.includes('(v') ?? false, 'issue #1 title updated before ref check', ); @@ -552,7 +552,7 @@ test('updateEntity timing: DOM reflects change at measurement end', async (page, const titles = await getIssueTitles(page); assert( - titles[1]?.includes('(updated)') ?? false, + titles[1]?.includes('(v') ?? false, lib, 'updateEntity timing', `DOM not updated when data-bench-complete fired. ` + From be1071763c1ab4577a1d3e04c765785f70715d1f Mon Sep 17 00:00:00 2001 From: Nathaniel Tucker Date: Sun, 22 Mar 2026 12:56:34 -0400 Subject: [PATCH 4/6] demo(benchmark-react): reduce warmup/measurement counts for faster CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With 5 sub-iterations per round providing sufficient noise reduction, lower warmup (small: 3→2, large: 2→1) and max measurement caps (small CI: 20→15, large CI: 15→12) to cut ~30-40s from CI runtime. Made-with: Cursor --- examples/benchmark-react/README.md | 6 +++--- examples/benchmark-react/bench/scenarios.ts | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/benchmark-react/README.md b/examples/benchmark-react/README.md index 891e6a95b8e9..10d55c6ef321 100644 --- a/examples/benchmark-react/README.md +++ b/examples/benchmark-react/README.md @@ -14,7 +14,7 @@ The repo has two benchmark suites: - **What we measure:** Wall-clock time from triggering an action (e.g. `init(100)` or `updateUser('user0')`) until a MutationObserver detects the expected DOM change in the benchmark container. Optionally we also record React Profiler commit duration and, with `BENCH_TRACE=true`, Chrome trace duration. - **Why:** Normalized caching should show wins on shared-entity updates (one store write, many components update), ref stability (fewer new object references), and derived-view memoization (`Query` schema avoids re-sorting when entities haven't changed). See [js-framework-benchmark "How the duration is measured"](https://github.com/krausest/js-framework-benchmark/wiki/How-the-duration-is-measured) for a similar timeline-based approach. - **Statistical:** Warmup runs are discarded; we report median and 95% CI (as percentage of median). Libraries are interleaved per round to reduce environmental variance. Each round runs 5 sub-iterations per page visit and reports the median, further reducing per-sample noise. -- **No CPU throttling:** Runs at native speed with more samples for statistical significance rather than artificial slowdown. Small (cheap) scenarios use 3 warmup + up to 15 measurement rounds locally; large (expensive) scenarios use 2 warmup + up to 8 measurement rounds. Early stopping triggers when 95% CI margin drops below the target percentage. +- **No CPU throttling:** Runs at native speed with more samples for statistical significance rather than artificial slowdown. Small (cheap) scenarios use 2 warmup + up to 12 measurement rounds locally; large (expensive) scenarios use 1 warmup + up to 6 measurement rounds. Early stopping triggers when 95% CI margin drops below the target percentage. ## Scenario categories @@ -189,8 +189,8 @@ Regressions >5% on stable scenarios or >15% on volatile scenarios are worth inve Scenarios are classified as `small` or `large` based on their cost: - - **Small** (3 warmup + 4–15 measurement): `getlist-100`, `update-entity`, `ref-stability-*`, `invalidate-and-resolve`, `unshift-item`, `delete-item` - - **Large** (2 warmup + 3–8 measurement): `getlist-500`, `getlist-500-sorted`, `update-user`, `update-user-10000`, `update-entity-sorted`, `update-entity-multi-view`, `list-detail-switch-10` + - **Small** (2 warmup + 3–12 measurement): `getlist-100`, `update-entity`, `ref-stability-*`, `invalidate-and-resolve`, `unshift-item`, `delete-item` + - **Large** (1 warmup + 3–6 measurement): `getlist-500`, `getlist-500-sorted`, `update-user`, `update-user-10000`, `update-entity-sorted`, `update-entity-multi-view`, `list-detail-switch-10` - **Memory** (opt-in, 1 warmup + 3 measurement): `memory-mount-unmount-cycle` — run with `--action memory` When running all scenarios (`yarn bench`), each group runs with its own warmup/measurement count. Use `--size` to run only one group. diff --git a/examples/benchmark-react/bench/scenarios.ts b/examples/benchmark-react/bench/scenarios.ts index aa1b1a8f345c..2213d44d7f41 100644 --- a/examples/benchmark-react/bench/scenarios.ts +++ b/examples/benchmark-react/bench/scenarios.ts @@ -21,16 +21,16 @@ const defaultOpsPerRound = parseInt(process.env.BENCH_OPS_PER_ROUND ?? '5', 10); export const RUN_CONFIG: Record = { small: { - warmup: 3, - minMeasurement: 4, - maxMeasurement: process.env.CI ? 20 : 15, + warmup: 2, + minMeasurement: 3, + maxMeasurement: process.env.CI ? 15 : 12, targetMarginPct: process.env.CI ? 5 : 10, opsPerRound: defaultOpsPerRound, }, large: { - warmup: 2, + warmup: 1, minMeasurement: 3, - maxMeasurement: process.env.CI ? 15 : 8, + maxMeasurement: process.env.CI ? 12 : 6, targetMarginPct: process.env.CI ? 8 : 15, opsPerRound: defaultOpsPerRound, }, From f06f7306ca35615c786fba421520c39c9e13b03c Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 22 Mar 2026 17:10:25 +0000 Subject: [PATCH 5/6] Fix benchmark range format for single sample Co-authored-by: Nathaniel Tucker --- examples/benchmark-react/bench/stats.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/benchmark-react/bench/stats.ts b/examples/benchmark-react/bench/stats.ts index 56667529e8ce..b0559bb9349a 100644 --- a/examples/benchmark-react/bench/stats.ts +++ b/examples/benchmark-react/bench/stats.ts @@ -89,7 +89,7 @@ export function computeStats( const measured = samples.slice(warmupCount); if (measured.length <= 1) { const v = measured[0] ?? 0; - return { median: v, p95: v, range: '± 0' }; + return { median: v, p95: v, range: '± 0.0%' }; } const sorted = [...measured].sort((a, b) => a - b); const clean = trimOutliers(sorted); From c94a63accc1f9e3ca9ada75dae9614e33299b544 Mon Sep 17 00:00:00 2001 From: Nathaniel Tucker Date: Sun, 22 Mar 2026 15:35:48 -0400 Subject: [PATCH 6/6] demo(benchmark-react): fix stats bugs and add per-scenario opsPerRound - Replace z=1.96 with t-distribution critical values for accurate CI on small samples (n=3-15) - Fix even-length median calculation in scaledMAD/isConverged/computeStats - Fix median===0 premature convergence (now requires margin===0 too) - Fix invalidateAndResolve title accumulation bug using fixture data - Clamp deleteEntity sub-iteration args to mountCount bound - Remove dead cdp parameter from runScenario - Add per-scenario opsPerRound override to Scenario type - Set opsPerRound=9 for update-entity-sorted, =5 for list-detail-switch-10 - Update README with remeasured results and variance tiers Made-with: Cursor --- examples/benchmark-react/README.md | 36 ++++---- examples/benchmark-react/bench/runner.ts | 14 +-- examples/benchmark-react/bench/scenarios.ts | 2 + examples/benchmark-react/bench/stats.ts | 86 ++++++++++++++----- .../benchmark-react/src/data-client/index.tsx | 13 +-- examples/benchmark-react/src/shared/types.ts | 2 + 6 files changed, 102 insertions(+), 51 deletions(-) diff --git a/examples/benchmark-react/README.md b/examples/benchmark-react/README.md index 10d55c6ef321..6dcd2d2a2995 100644 --- a/examples/benchmark-react/README.md +++ b/examples/benchmark-react/README.md @@ -13,7 +13,7 @@ The repo has two benchmark suites: - **What we measure:** Wall-clock time from triggering an action (e.g. `init(100)` or `updateUser('user0')`) until a MutationObserver detects the expected DOM change in the benchmark container. Optionally we also record React Profiler commit duration and, with `BENCH_TRACE=true`, Chrome trace duration. - **Why:** Normalized caching should show wins on shared-entity updates (one store write, many components update), ref stability (fewer new object references), and derived-view memoization (`Query` schema avoids re-sorting when entities haven't changed). See [js-framework-benchmark "How the duration is measured"](https://github.com/krausest/js-framework-benchmark/wiki/How-the-duration-is-measured) for a similar timeline-based approach. -- **Statistical:** Warmup runs are discarded; we report median and 95% CI (as percentage of median). Libraries are interleaved per round to reduce environmental variance. Each round runs 5 sub-iterations per page visit and reports the median, further reducing per-sample noise. +- **Statistical:** Warmup runs are discarded; we report median and 95% CI (as percentage of median). Libraries are interleaved per round to reduce environmental variance. Each round runs multiple sub-iterations per page visit and reports the median, further reducing per-sample noise. The default is 5 sub-iterations; individual scenarios can override this via `opsPerRound` in `bench/scenarios.ts` (e.g. `update-entity-sorted` uses 9, `list-detail-switch-10` uses 5). - **No CPU throttling:** Runs at native speed with more samples for statistical significance rather than artificial slowdown. Small (cheap) scenarios use 2 warmup + up to 12 measurement rounds locally; large (expensive) scenarios use 1 warmup + up to 6 measurement rounds. Early stopping triggers when 95% CI margin drops below the target percentage. ## Scenario categories @@ -55,9 +55,9 @@ Illustrative **relative** results with **baseline = 100%** (plain React useState | Category | Scenarios (representative) | data-client | tanstack-query | swr | baseline | |---|---|---:|---:|---:|---:| -| Navigation | `getlist-100`, `getlist-500`, `getlist-500-sorted` | ~98% | ~100% | ~100% | **100%** | -| Navigation | `list-detail-switch-10` | **~1354%** | ~233% | ~260% | 100% | -| Mutations | `update-entity`, `update-user`, `update-entity-sorted`, `update-entity-multi-view`, `unshift-item`, `delete-item`, `move-item` | **~6778%** | ~97% | ~99% | 100% | +| Navigation | `getlist-100`, `getlist-500`, `getlist-500-sorted` | ~97% | ~100% | ~100% | **100%** | +| Navigation | `list-detail-switch-10` | **~1652%** | ~231% | ~230% | 100% | +| Mutations | `update-entity`, `update-user`, `update-entity-sorted`, `update-entity-multi-view`, `unshift-item`, `delete-item`, `move-item` | **~6994%** | ~97% | ~99% | 100% | | Scaling (10k items) | `update-user-10000` | **~9713%** | ~94% | ~100% | 100% | @@ -70,19 +70,19 @@ Run: **2026-03-22**, Linux (WSL2), `yarn build:benchmark-react`, static preview | Scenario | data-client | tanstack-query | swr | baseline | |---|---:|---:|---:|---:| | **Navigation** | | | | | -| `getlist-100` | 20.28 ± 0.3% | 20.58 ± 1.2% | 20.58 ± 0.3% | 20.62 ± 0.3% | -| `getlist-500` | 12.41 ± 0.4% | 12.61 ± 0.2% | 12.59 ± 0.2% | 12.63 ± 0.2% | -| `getlist-500-sorted` | 12.55 ± 0.2% | 12.67 ± 1.3% | 12.72 ± 0.4% | 12.79 ± 0.4% | -| `list-detail-switch-10` | 9.75 ± 1.7% | 1.68 ± 0.2% | 1.87 ± 0.9% | 0.72 ± 0.0% | +| `getlist-100` | 20.16 ± 0.7% | 20.58 ± 0.8% | 20.58 ± 0.8% | 20.58 ± 0.0% | +| `getlist-500` | 12.05 ± 0.9% | 12.55 ± 0.0% | 12.61 ± 0.9% | 12.69 ± 1.4% | +| `getlist-500-sorted` | 12.56 ± 1.4% | 12.72 ± 0.5% | 12.79 ± 0.9% | 12.80 ± 1.4% | +| `list-detail-switch-10` | 12.06 ± 12.5% | 1.69 ± 1.0% | 1.68 ± 1.1% | 0.73 ± 0.1% | | **Mutations** | | | | | -| `update-entity` | 555.56 ± 2.9% | 7.00 ± 0.3% | 6.98 ± 0.1% | 7.18 ± 0.2% | -| `update-user` | 625.00 ± 11.2% | 6.95 ± 0.1% | 7.15 ± 0.2% | 7.17 ± 0.2% | -| `update-entity-sorted` | 476.19 ± 0.0% | 7.06 ± 0.3% | 7.06 ± 0.0% | 7.24 ± 0.0% | -| `update-entity-multi-view` | 500.00 ± 3.4% | 7.05 ± 0.2% | 7.09 ± 0.1% | 7.25 ± 0.1% | -| `update-user-10000` | 151.52 ± 2.8% | 1.47 ± 0.0% | 1.56 ± 0.1% | 1.56 ± 0.3% | -| `unshift-item` | 434.78 ± 6.1% | 6.91 ± 0.2% | 7.13 ± 0.2% | 7.15 ± 0.3% | -| `delete-item` | 526.32 ± 2.7% | 6.89 ± 0.0% | 7.13 ± 0.4% | 7.12 ± 0.0% | -| `move-item` | 277.78 ± 3.9% | 6.52 ± 0.2% | 6.98 ± 0.6% | 6.87 ± 0.2% | +| `update-entity` | 555.56 ± 8.4% | 6.99 ± 0.3% | 6.99 ± 0.3% | 7.17 ± 0.3% | +| `update-user` | 571.90 ± 12.8% | 6.94 ± 0.5% | 7.18 ± 0.0% | 7.16 ± 0.0% | +| `update-entity-sorted` | 588.24 ± 8.0% | 7.10 ± 0.3% | 7.09 ± 0.4% | 7.28 ± 0.0% | +| `update-entity-multi-view` | 555.56 ± 0.0% | 7.06 ± 0.3% | 7.08 ± 0.3% | 7.26 ± 0.2% | +| `update-user-10000` | 151.52 ± 10.8% | 1.46 ± 0.5% | 1.56 ± 0.2% | 1.56 ± 1.3% | +| `unshift-item` | 425.72 ± 5.0% | 6.90 ± 0.1% | 7.13 ± 0.3% | 7.14 ± 0.3% | +| `delete-item` | 526.32 ± 7.2% | 6.89 ± 0.3% | 7.13 ± 0.5% | 7.12 ± 1.0% | +| `move-item` | 285.71 ± 4.0% | 6.55 ± 0.5% | 6.99 ± 0.5% | 6.92 ± 0.8% | [Measured on a Ryzen 9 7950X; 64 GB RAM; Ubuntu (WSL2); Node 24.12.0; Chromium (Playwright)] @@ -90,8 +90,8 @@ Run: **2026-03-22**, Linux (WSL2), `yarn build:benchmark-react`, static preview | Category | Scenarios | Typical run-to-run spread | |---|---|---| -| **Stable** | `getlist-*`, `update-entity`, `ref-stability-*` | 2-5% | -| **Moderate** | `update-user-*`, `update-entity-sorted`, `update-entity-multi-view` | 5-10% | +| **Stable** | `getlist-*`, `update-entity`, `update-entity-sorted`, `ref-stability-*` | 2-5% | +| **Moderate** | `update-user-*`, `update-entity-multi-view`, `list-detail-switch-10` | 5-10% | | **Volatile** | `memory-mount-unmount-cycle`, `startup-*`, `(react commit)` suffixes | 10-25% | Regressions >5% on stable scenarios or >15% on volatile scenarios are worth investigating. diff --git a/examples/benchmark-react/bench/runner.ts b/examples/benchmark-react/bench/runner.ts index c9f0ac9e507f..88cd0ee96f19 100644 --- a/examples/benchmark-react/bench/runner.ts +++ b/examples/benchmark-react/bench/runner.ts @@ -178,7 +178,6 @@ async function runScenario( lib: string, scenario: Scenario, networkSim: boolean, - cdp?: CDPSession, ): Promise { const appPath = `/${lib}/`; await page.goto(`${BASE_URL}${appPath}`, { @@ -394,9 +393,11 @@ async function runScenario( }); } - // Execute action (vary args for deleteEntity across sub-iterations) + // Vary args for deleteEntity so each sub-iteration deletes a different item const actionArgs = - scenario.action === 'deleteEntity' ? [subIdx + 1] : scenario.args; + scenario.action === 'deleteEntity' ? + [Math.min(subIdx + 1, mountCount)] + : scenario.args; await (bench as any).evaluate( (api: any, { action, args }: { action: string; args: unknown[] }) => { api[action](...args); @@ -469,8 +470,9 @@ async function runScenario( function effectiveOpsPerRound(scenario: Scenario): number { if (scenario.deterministic) return 1; if (scenario.category === 'memory') return 1; - if (scenario.action === 'listDetailSwitch') return 1; - return RUN_CONFIG[scenario.size ?? 'small'].opsPerRound; + return ( + scenario.opsPerRound ?? RUN_CONFIG[scenario.size ?? 'small'].opsPerRound + ); } function simpleMedian(arr: number[]): number { @@ -550,7 +552,7 @@ async function runRound( done++; const prefix = opts.showProgress ? `[${done}/${total}] ` : ''; try { - const result = await runScenario(page, lib, scenario, networkSim, cdp); + const result = await runScenario(page, lib, scenario, networkSim); recordResult(samples, scenario, result); const unit = scenarioUnit(scenario); const displayValue = diff --git a/examples/benchmark-react/bench/scenarios.ts b/examples/benchmark-react/bench/scenarios.ts index 2213d44d7f41..2ce7df4f56ec 100644 --- a/examples/benchmark-react/bench/scenarios.ts +++ b/examples/benchmark-react/bench/scenarios.ts @@ -123,6 +123,7 @@ const BASE_SCENARIOS: BaseScenario[] = [ renderLimit: 100, preMountAction: 'mountSortedView', size: 'large', + opsPerRound: 9, }, { nameSuffix: 'update-entity-multi-view', @@ -141,6 +142,7 @@ const BASE_SCENARIOS: BaseScenario[] = [ category: 'hotPath', size: 'large', renderLimit: 100, + opsPerRound: 5, }, { nameSuffix: 'update-user-10000', diff --git a/examples/benchmark-react/bench/stats.ts b/examples/benchmark-react/bench/stats.ts index b0559bb9349a..fd5476458301 100644 --- a/examples/benchmark-react/bench/stats.ts +++ b/examples/benchmark-react/bench/stats.ts @@ -1,3 +1,50 @@ +/** + * Two-tailed t critical values for 95% CI (α = 0.05) keyed by + * degrees of freedom. Falls back to z = 1.96 for df > 30. + */ +const T_CRIT_95: Record = { + 1: 12.706, + 2: 4.303, + 3: 3.182, + 4: 2.776, + 5: 2.571, + 6: 2.447, + 7: 2.365, + 8: 2.306, + 9: 2.262, + 10: 2.228, + 11: 2.201, + 12: 2.179, + 13: 2.16, + 14: 2.145, + 15: 2.131, + 20: 2.086, + 25: 2.06, + 30: 2.042, +}; + +function tCrit95(n: number): number { + const df = n - 1; + if (df <= 0) return 1.96; + if (df in T_CRIT_95) return T_CRIT_95[df]; + const keys = Object.keys(T_CRIT_95) + .map(Number) + .sort((a, b) => a - b); + const lower = keys.filter(k => k <= df).pop(); + const upper = keys.find(k => k >= df); + if (lower == null) return T_CRIT_95[keys[0]]; + if (upper == null || lower === upper) return T_CRIT_95[lower]; + const frac = (df - lower) / (upper - lower); + return T_CRIT_95[lower] + frac * (T_CRIT_95[upper] - T_CRIT_95[lower]); +} + +function sortedMedian(sorted: number[]): number { + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 === 0 ? + (sorted[mid - 1] + sorted[mid]) / 2 + : sorted[mid]; +} + /** * Remove outliers using the IQR method (1.5×IQR fence). * Input must be sorted ascending. Falls back to the full array when @@ -21,38 +68,35 @@ function trimOutliers(sorted: number[]): number[] { * stddev for normal distributions. */ function scaledMAD(sorted: number[]): number { - const median = sorted[Math.floor(sorted.length / 2)]; - const deviations = sorted - .map(x => Math.abs(x - median)) - .sort((a, b) => a - b); - const mad = deviations[Math.floor(deviations.length / 2)]; - return 1.4826 * mad; + const med = sortedMedian(sorted); + const deviations = sorted.map(x => Math.abs(x - med)).sort((a, b) => a - b); + return 1.4826 * sortedMedian(deviations); } /** - * Compute the 95% CI margin using MAD-based dispersion. - * Falls back to stddev when MAD is zero (all values identical - * except outliers) to avoid reporting ± 0 misleadingly. + * Compute the 95% CI margin using MAD-based dispersion and t-distribution + * critical values for small samples. Falls back to stddev when MAD is zero + * (all values identical except outliers) to avoid reporting ± 0 misleadingly. */ function ciMargin(clean: number[]): number { + if (clean.length < 2) return 0; + const t = tCrit95(clean.length); const mad = scaledMAD(clean); if (mad > 0) { - return 1.96 * (mad / Math.sqrt(clean.length)); + return t * (mad / Math.sqrt(clean.length)); } const mean = clean.reduce((sum, x) => sum + x, 0) / clean.length; - const stdDev = - clean.length > 1 ? - Math.sqrt( - clean.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (clean.length - 1), - ) - : 0; - return 1.96 * (stdDev / Math.sqrt(clean.length)); + const stdDev = Math.sqrt( + clean.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (clean.length - 1), + ); + return t * (stdDev / Math.sqrt(clean.length)); } /** * Check whether a scenario's samples have converged: 95% CI margin * is within targetMarginPct of the median. Zero-variance metrics - * (e.g. ref-stability counts) converge after minSamples. + * (e.g. ref-stability counts) converge after minSamples only when + * the margin is also zero. * * Outliers are trimmed via IQR before computing the CI so that a * single GC spike doesn't prevent convergence. @@ -67,9 +111,9 @@ export function isConverged( if (measured.length < minSamples) return false; const sorted = [...measured].sort((a, b) => a - b); const clean = trimOutliers(sorted); - const median = clean[Math.floor(clean.length / 2)]; - if (median === 0) return true; + const median = sortedMedian(clean); const margin = ciMargin(clean); + if (median === 0) return margin === 0; return (margin / Math.abs(median)) * 100 <= targetMarginPct; } @@ -93,7 +137,7 @@ export function computeStats( } const sorted = [...measured].sort((a, b) => a - b); const clean = trimOutliers(sorted); - const median = clean[Math.floor(clean.length / 2)] ?? 0; + const median = sortedMedian(clean); const p95Idx = Math.floor(sorted.length * 0.95); const p95 = sorted[Math.min(p95Idx, sorted.length - 1)] ?? median; const margin = ciMargin(clean); diff --git a/examples/benchmark-react/src/data-client/index.tsx b/examples/benchmark-react/src/data-client/index.tsx index 97c435c2c876..cf1af16ad3af 100644 --- a/examples/benchmark-react/src/data-client/index.tsx +++ b/examples/benchmark-react/src/data-client/index.tsx @@ -29,7 +29,7 @@ import { IssueResource, sortedIssuesEndpoint, } from '@shared/resources'; -import { getIssue, patchIssue } from '@shared/server'; +import { patchIssue } from '@shared/server'; import type { Issue } from '@shared/types'; import React, { useCallback, useRef } from 'react'; @@ -222,10 +222,11 @@ function BenchmarkHarness() { const invalidateAndResolve = useCallback( async (number: number) => { - const issue = await getIssue(number); - if (issue) { - await patchIssue(number, { title: `${issue.title} (refetched)` }); - } + const issue = FIXTURE_ISSUES_BY_NUMBER.get(number); + if (!issue) return; + const v = ++mutationCounter; + const expected = `${issue.title} (v${v})`; + await patchIssue(number, { title: expected }); measureUpdate( () => { if (doubleListCount != null) { @@ -243,7 +244,7 @@ function BenchmarkHarness() { const el = containerRef.current!.querySelector( `[data-issue-number="${number}"] [data-title]`, ); - return el?.textContent?.includes('(refetched)') ?? false; + return el?.textContent === expected; }, ); }, diff --git a/examples/benchmark-react/src/shared/types.ts b/examples/benchmark-react/src/shared/types.ts index fdabce92d382..bdde003695d8 100644 --- a/examples/benchmark-react/src/shared/types.ts +++ b/examples/benchmark-react/src/shared/types.ts @@ -153,6 +153,8 @@ export interface Scenario { preMountAction?: keyof BenchAPI; /** Result is deterministic (zero variance); run exactly once with no warmup. */ deterministic?: boolean; + /** Override the default sub-iterations per page visit for this scenario. */ + opsPerRound?: number; /** Cap DOM rendering to first N items while keeping all data in the store. */ renderLimit?: number; /** If set, scenario applies only to these libs; dropped when any selected library is not listed. */