From d3f4f7b42b48f379a603a876a9c0f23f2f207779 Mon Sep 17 00:00:00 2001 From: Jason Marshall Date: Sun, 4 Jan 2026 18:18:46 -0800 Subject: [PATCH 1/2] feat: use histogram samples for t-test analysis Justification: Each sample in the histogram represents a durationPerOp sample, calculated by dividing the cumulative time of a certain number of iterations of the function under test by the number of those iterations. That is, each sample is the average execution time of a single execution. opsSec and opsSecPerRun are then an average of the samples, which are themselves averages. Therefore, using opsSecPerRun as the t-test input inaccurately applies the calculation to an average of averages, when it is meant to be applied to a set of averages totalling a minimum of 30 samples, with 40 preferable. In other words, it's a histogram entry that represents a valid t-test sample. --- .DS_Store | Bin 0 -> 6148 bytes README.md | 7 +-- examples/.DS_Store | Bin 0 -> 6148 bytes examples/statistical-significance/README.md | 3 +- examples/statistical-significance/node.js | 1 + lib/index.js | 10 ++-- lib/utils/analyze.js | 5 +- test/ttest.js | 54 ++++++++++---------- 8 files changed, 44 insertions(+), 36 deletions(-) create mode 100644 .DS_Store create mode 100644 examples/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..34c19f72621762b66da6b77134538776a4fc41fc GIT binary patch literal 6148 zcmeHKJ8r`;3?*9+2D*66s4Mgag5jQ^7f8~ilM5Fx3Uu_8bLD7#{0t4;Z0Qm_0@RZz zJ_&k*X^M#UUx#OrPDEO`p`2`3o9&yg>>(oxgyW1)8RaPR`(b`tW&a&8ZVS%0?BwG& ze?Lv5ZPKU!6`%rCfC^B7n<|jywSBtjsXUAdP=R|;z`hR!ZdenSK>u`L@D>2rAnb;@ z_Y%Nj0bosB0ug~}P=P_!Y%w(Gh?mT(iA!M6MYH+PyjinDQNJDM7f%= minSamples x repeatSuite }); suite diff --git a/examples/.DS_Store b/examples/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9b65e0a17641ec7e2c5cebd5bfa3bea3654bbc83 GIT binary patch literal 6148 zcmeHKyKVwO477m)iH4GLe}E!?utMYu_yOJs2|+>Vugh;UexL*$6f`NqShDBV=h@Xx 
zaXyQPPA=;Yk)eq6;fC^Up>6iwyk(utC=iZ4_Hvdl#FZ>s)sGX#9XV^^%pd+i#>nqw zCr89>?s~-Do9+6%TAoe^Bh#n=6`%rCfC^B7mlVkM*&n>bVlfq<0`H)JeIE+kuqHNv z{^>yQ5dhdj+6~V>O8|={fHkoRLH-DnJGPl>*wE&ZZMwDSK<{a@K1Ld=9@EYP}r6 lTQSgEF&5T}N5ATdJ>z{%YyzE*ywidF5inh7RNyZZxB}$B8fE|h literal 0 HcmV?d00001 diff --git a/examples/statistical-significance/README.md b/examples/statistical-significance/README.md index 17819fc..1ec83c0 100644 --- a/examples/statistical-significance/README.md +++ b/examples/statistical-significance/README.md @@ -20,7 +20,8 @@ Enable t-test mode with `ttest: true`: const { Suite } = require('bench-node'); const suite = new Suite({ - ttest: true, // Automatically sets repeatSuite=30 + ttest: true, + minSamples: 30, // minSamples x repeatSuite must be >= 30 }); suite.add('baseline', { baseline: true }, () => { diff --git a/examples/statistical-significance/node.js b/examples/statistical-significance/node.js index 4e28636..6902610 100644 --- a/examples/statistical-significance/node.js +++ b/examples/statistical-significance/node.js @@ -16,6 +16,7 @@ const { Suite } = require('../../lib'); // Enable t-test mode - this automatically sets repeatSuite=30 for all benchmarks const suite = new Suite({ ttest: true, + minSamples: 30 }); // Baseline: Simple array sum using for loop diff --git a/lib/index.js b/lib/index.js index 9bd1455..78aaecf 100644 --- a/lib/index.js +++ b/lib/index.js @@ -103,7 +103,7 @@ const defaultBenchOptions = { }; // Minimum repeatSuite runs required for reliable t-test results -const MIN_REPEAT_FOR_TTEST = 30; +const MIN_SAMPLES_FOR_TTEST = 30; function throwIfNoNativesSyntax() { if (process.execArgv.includes("--allow-natives-syntax") === false) { @@ -180,8 +180,6 @@ class Suite { if (options.repeatSuite !== undefined) { validateNumber(options.repeatSuite, "options.repeatSuite", 1); repeatSuite = options.repeatSuite; - } else if (this.#ttest) { - repeatSuite = MIN_REPEAT_FOR_TTEST; } this.#repeatSuite = repeatSuite; @@ -231,6 +229,12 @@ class Suite { throw new 
Error("There is already a baseline benchmark"); } + if (this.#ttest && (options.minSamples * options.repeatSuite < MIN_SAMPLES_FOR_TTEST)) { + process.emitWarning( + `The benchmark "${name}" may not have enough samples to run t-test analysis. Please set minSamples x repeatSuite >= ${MIN_SAMPLES_FOR_TTEST}`, + ); + } + const benchmark = new Benchmark( name, fn, diff --git a/lib/utils/analyze.js b/lib/utils/analyze.js index a6b8b27..f1baa96 100644 --- a/lib/utils/analyze.js +++ b/lib/utils/analyze.js @@ -44,8 +44,9 @@ function analyze(results, sorted = true, options = {}) { } if (ttest) { - const resultSamples = result.opsSecPerRun; - const baselineSamplesForTest = baselineResult.opsSecPerRun; + const resultSamples = result.histogram?.sampleData ?? []; + const baselineSamplesForTest = + baselineResult.histogram?.sampleData ?? []; if ( baselineSamplesForTest?.length >= 30 && diff --git a/test/ttest.js b/test/ttest.js index 84f1ed0..ebcdcca 100644 --- a/test/ttest.js +++ b/test/ttest.js @@ -238,13 +238,13 @@ describe("T-Test Integration with analyze", () => { assert.strictEqual(testResult.significanceTest, undefined); }); - it("should include significanceTest when ttest is true and opsSecPerRun >= 30", () => { - // Generate 30+ opsSecPerRun samples (from repeatSuite) - const baselineOpsSecPerRun = Array.from( + it("should include significanceTest when ttest is true and samples >= 30", () => { + // Generate 30+ samples + const baselineSampleData = Array.from( { length: 30 }, (_, i) => 100 + (i % 3) - 1, ); - const testOpsSecPerRun = Array.from( + const testSampleData = Array.from( { length: 30 }, (_, i) => 200 + (i % 3) - 1, ); @@ -254,12 +254,12 @@ describe("T-Test Integration with analyze", () => { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: baselineOpsSecPerRun, + histogram: { sampleData: baselineSampleData }, }, { name: "test", opsSec: 200, - opsSecPerRun: testOpsSecPerRun, + histogram: { sampleData: testSampleData }, }, ]; @@ -272,7 +272,7 @@ 
describe("T-Test Integration with analyze", () => { assert.ok(typeof testResult.significanceTest.confidence === "string"); }); - it("should not include significanceTest without opsSecPerRun", () => { + it("should not include significanceTest without sufficient samples", () => { const results = [ { name: "baseline", @@ -288,22 +288,22 @@ describe("T-Test Integration with analyze", () => { const analyzed = analyze(results, true, { ttest: true }); const testResult = analyzed.find((r) => r.name === "test"); - // Should not throw, and significanceTest should not be set (no opsSecPerRun) + // Should not throw, and significanceTest should not be set (no samples) assert.strictEqual(testResult.significanceTest, undefined); }); - it("should not include significanceTest when opsSecPerRun < 30", () => { + it("should not include significanceTest when samples < 30", () => { const results = [ { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: Array.from({ length: 10 }, () => 100), + histogram: { samples: Array.from({ length: 10 }, () => 100) }, }, { name: "test", opsSec: 200, - opsSecPerRun: Array.from({ length: 10 }, () => 200), + histogram: { samples: Array.from({ length: 10 }, () => 200) }, }, ]; @@ -315,12 +315,12 @@ describe("T-Test Integration with analyze", () => { }); it("should detect significant difference between clearly different benchmarks", () => { - // Generate 30+ opsSecPerRun with clearly different means - const baselineOpsSecPerRun = Array.from( + // Generate 30+ samples with clearly different means + const baselineSampleData = Array.from( { length: 30 }, (_, i) => 100 + (i % 5) - 2, ); - const fastOpsSecPerRun = Array.from( + const fastSampleData = Array.from( { length: 30 }, (_, i) => 200 + (i % 5) - 2, ); @@ -330,12 +330,12 @@ describe("T-Test Integration with analyze", () => { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: baselineOpsSecPerRun, + histogram: { sampleData: baselineSampleData }, }, { name: "fast", opsSec: 200, - 
opsSecPerRun: fastOpsSecPerRun, + histogram: { sampleData: fastSampleData }, }, ]; @@ -348,12 +348,12 @@ describe("T-Test Integration with analyze", () => { it("should not mark as significant when differences are within noise", () => { // Same benchmark run twice - should have similar results with high variance overlap - // Generate 30+ opsSecPerRun with overlapping distributions - const baselineOpsSecPerRun = Array.from( + // Generate 30+ samples with overlapping distributions + const baselineSampleData = Array.from( { length: 30 }, (_, i) => 100 + ((i % 5) - 2) * 2, ); - const similarOpsSecPerRun = Array.from( + const similarSampleData = Array.from( { length: 30 }, (_, i) => 101 + ((i % 5) - 2) * 2, ); @@ -363,12 +363,12 @@ describe("T-Test Integration with analyze", () => { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: baselineOpsSecPerRun, + histogram: { sampleData: baselineSampleData }, }, { name: "similar", opsSec: 101, // Very close to baseline - opsSecPerRun: similarOpsSecPerRun, + histogram: { sampleData: similarSampleData }, }, ]; @@ -384,18 +384,18 @@ describe("Statistical significance requires repeatSuite >= 30", () => { const { analyze } = require("../lib/utils/analyze"); it("should only compute significance when repeatSuite provides 30+ samples", () => { - // With 30+ opsSecPerRun, significance should be computed + // With 30+ samples, significance should be computed const results = [ { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: Array.from({ length: 30 }, () => 100), + histogram: { sampleData: Array.from({ length: 30 }, () => 100) }, }, { name: "test", opsSec: 200, - opsSecPerRun: Array.from({ length: 30 }, () => 200), + histogram: { sampleData: Array.from({ length: 30 }, () => 200) }, }, ]; @@ -406,18 +406,18 @@ describe("Statistical significance requires repeatSuite >= 30", () => { }); it("should not compute significance when repeatSuite < 30", () => { - // With fewer than 30 opsSecPerRun, significance should not 
be computed + // With fewer than 30 samples, significance should not be computed const results = [ { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: Array.from({ length: 10 }, () => 100), + histogram: { sampleData: Array.from({ length: 10 }, () => 100) }, }, { name: "test", opsSec: 200, - opsSecPerRun: Array.from({ length: 10 }, () => 200), + histogram: { sampleData: Array.from({ length: 10 }, () => 200) }, }, ]; From a0b4160dc898c7e1a5a3ae3fa720582ce3c4d14c Mon Sep 17 00:00:00 2001 From: Jason Marshall Date: Sun, 4 Jan 2026 20:12:36 -0800 Subject: [PATCH 2/2] feat: report significant: false when t-test == true but sample size is too small. This will help me sort out inconclusive tests without missing misconfigured ones. This is necessitated by the changes in the previous commit that allow for failure instead of forcing success. --- lib/utils/analyze.js | 4 ++++ test/ttest.js | 10 ++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/utils/analyze.js b/lib/utils/analyze.js index f1baa96..7a5cb2a 100644 --- a/lib/utils/analyze.js +++ b/lib/utils/analyze.js @@ -63,6 +63,10 @@ function analyze(results, sorted = true, options = {}) { confidence: ttestResult.confidence, stars: ttestResult.stars, }; + } else { + result.significanceTest = { + significant: false + } } } } diff --git a/test/ttest.js b/test/ttest.js index ebcdcca..835e0a3 100644 --- a/test/ttest.js +++ b/test/ttest.js @@ -272,7 +272,7 @@ describe("T-Test Integration with analyze", () => { assert.ok(typeof testResult.significanceTest.confidence === "string"); }); - it("should not include significanceTest without sufficient samples", () => { + it("should mark significanceTest as failed without samples", () => { const results = [ { name: "baseline", @@ -288,8 +288,7 @@ describe("T-Test Integration with analyze", () => { const analyzed = analyze(results, true, { ttest: true }); const testResult = analyzed.find((r) => r.name === "test"); - // Should not throw, and 
significanceTest should not be set (no samples) - assert.strictEqual(testResult.significanceTest, undefined); + assert.deepEqual(testResult.significanceTest, { significant: false}); }); it("should not include significanceTest when samples < 30", () => { @@ -310,8 +309,7 @@ describe("T-Test Integration with analyze", () => { const analyzed = analyze(results, true, { ttest: true }); const testResult = analyzed.find((r) => r.name === "test"); - // Should not throw, and significanceTest should not be set (not enough samples) - assert.strictEqual(testResult.significanceTest, undefined); + assert.deepEqual(testResult.significanceTest, { significant: false}); }); it("should detect significant difference between clearly different benchmarks", () => { @@ -424,6 +422,6 @@ describe("Statistical significance requires repeatSuite >= 30", () => { const analyzed = analyze(results, true, { ttest: true }); const testResult = analyzed.find((r) => r.name === "test"); - assert.strictEqual(testResult.significanceTest, undefined); + assert.deepEqual(testResult.significanceTest, { significant: false}); }); });