diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..34c19f7 Binary files /dev/null and b/.DS_Store differ diff --git a/README.md b/README.md index 0739aed..e8449a5 100644 --- a/README.md +++ b/README.md @@ -779,14 +779,15 @@ the two samples, which is common in benchmark scenarios. ### Enabling T-Test Mode -Enable t-test mode with `ttest: true`. This automatically sets `repeatSuite=30` to collect enough -independent samples for reliable statistical analysis (per the Central Limit Theorem): +Enable t-test mode with `ttest: true`. This requires at least 30 independent samples for reliable statistical analysis (per the +Central Limit Theorem): ```js const { Suite } = require('bench-node'); const suite = new Suite({ - ttest: true, // Enables t-test and auto-sets repeatSuite=30 + ttest: true, // Enables t-test, which requires 30, preferably 40 samples for statistical significance + minSamples: 40 // sample count is >= minSamples x repeatSuite }); suite diff --git a/examples/.DS_Store b/examples/.DS_Store new file mode 100644 index 0000000..9b65e0a Binary files /dev/null and b/examples/.DS_Store differ diff --git a/examples/statistical-significance/README.md b/examples/statistical-significance/README.md index 17819fc..1ec83c0 100644 --- a/examples/statistical-significance/README.md +++ b/examples/statistical-significance/README.md @@ -20,7 +20,8 @@ Enable t-test mode with `ttest: true`: const { Suite } = require('bench-node'); const suite = new Suite({ - ttest: true, // Automatically sets repeatSuite=30 + ttest: true, + minSamples: 30, // minSamples x repeatSuite must be >= 30 }); suite.add('baseline', { baseline: true }, () => { diff --git a/examples/statistical-significance/node.js b/examples/statistical-significance/node.js index 4e28636..6902610 100644 --- a/examples/statistical-significance/node.js +++ b/examples/statistical-significance/node.js @@ -16,6 +16,7 @@ const { Suite } = require('../../lib'); // Enable t-test mode - this automatically sets 
repeatSuite=30 for all benchmarks const suite = new Suite({ ttest: true, + minSamples: 30 }); // Baseline: Simple array sum using for loop diff --git a/lib/index.js b/lib/index.js index 9bd1455..78aaecf 100644 --- a/lib/index.js +++ b/lib/index.js @@ -103,7 +103,7 @@ const defaultBenchOptions = { }; // Minimum repeatSuite runs required for reliable t-test results -const MIN_REPEAT_FOR_TTEST = 30; +const MIN_SAMPLES_FOR_TTEST = 30; function throwIfNoNativesSyntax() { if (process.execArgv.includes("--allow-natives-syntax") === false) { @@ -180,8 +180,6 @@ class Suite { if (options.repeatSuite !== undefined) { validateNumber(options.repeatSuite, "options.repeatSuite", 1); repeatSuite = options.repeatSuite; - } else if (this.#ttest) { - repeatSuite = MIN_REPEAT_FOR_TTEST; } this.#repeatSuite = repeatSuite; @@ -231,6 +229,12 @@ class Suite { throw new Error("There is already a baseline benchmark"); } + if (this.#ttest && (options.minSamples * options.repeatSuite < MIN_SAMPLES_FOR_TTEST)) { + process.emitWarning( + `The benchmark "${name}" may not have enough samples to run t-test analysis. Please set minSamples x repeatSuite >= ${MIN_SAMPLES_FOR_TTEST}`, + ); + } + const benchmark = new Benchmark( name, fn, diff --git a/lib/utils/analyze.js b/lib/utils/analyze.js index a6b8b27..7a5cb2a 100644 --- a/lib/utils/analyze.js +++ b/lib/utils/analyze.js @@ -44,8 +44,9 @@ function analyze(results, sorted = true, options = {}) { } if (ttest) { - const resultSamples = result.opsSecPerRun; - const baselineSamplesForTest = baselineResult.opsSecPerRun; + const resultSamples = result.histogram?.sampleData ?? []; + const baselineSamplesForTest = + baselineResult.histogram?.sampleData ?? 
[]; if ( baselineSamplesForTest?.length >= 30 && @@ -62,6 +63,10 @@ function analyze(results, sorted = true, options = {}) { confidence: ttestResult.confidence, stars: ttestResult.stars, }; + } else { + result.significanceTest = { + significant: false + } } } } diff --git a/test/ttest.js b/test/ttest.js index 84f1ed0..835e0a3 100644 --- a/test/ttest.js +++ b/test/ttest.js @@ -238,13 +238,13 @@ describe("T-Test Integration with analyze", () => { assert.strictEqual(testResult.significanceTest, undefined); }); - it("should include significanceTest when ttest is true and opsSecPerRun >= 30", () => { - // Generate 30+ opsSecPerRun samples (from repeatSuite) - const baselineOpsSecPerRun = Array.from( + it("should include significanceTest when ttest is true and samples >= 30", () => { + // Generate 30+ samples + const baselineSampleData = Array.from( { length: 30 }, (_, i) => 100 + (i % 3) - 1, ); - const testOpsSecPerRun = Array.from( + const testSampleData = Array.from( { length: 30 }, (_, i) => 200 + (i % 3) - 1, ); @@ -254,12 +254,12 @@ describe("T-Test Integration with analyze", () => { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: baselineOpsSecPerRun, + histogram: { sampleData: baselineSampleData }, }, { name: "test", opsSec: 200, - opsSecPerRun: testOpsSecPerRun, + histogram: { sampleData: testSampleData }, }, ]; @@ -272,7 +272,7 @@ describe("T-Test Integration with analyze", () => { assert.ok(typeof testResult.significanceTest.confidence === "string"); }); - it("should not include significanceTest without opsSecPerRun", () => { + it("should mark significanceTest as failed without samples", () => { const results = [ { name: "baseline", @@ -288,39 +288,37 @@ describe("T-Test Integration with analyze", () => { const analyzed = analyze(results, true, { ttest: true }); const testResult = analyzed.find((r) => r.name === "test"); - // Should not throw, and significanceTest should not be set (no opsSecPerRun) - 
assert.strictEqual(testResult.significanceTest, undefined); + assert.deepEqual(testResult.significanceTest, { significant: false}); }); - it("should not include significanceTest when opsSecPerRun < 30", () => { + it("should not include significanceTest when samples < 30", () => { const results = [ { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: Array.from({ length: 10 }, () => 100), + histogram: { samples: Array.from({ length: 10 }, () => 100) }, }, { name: "test", opsSec: 200, - opsSecPerRun: Array.from({ length: 10 }, () => 200), + histogram: { samples: Array.from({ length: 10 }, () => 200) }, }, ]; const analyzed = analyze(results, true, { ttest: true }); const testResult = analyzed.find((r) => r.name === "test"); - // Should not throw, and significanceTest should not be set (not enough samples) - assert.strictEqual(testResult.significanceTest, undefined); + assert.deepEqual(testResult.significanceTest, { significant: false}); }); it("should detect significant difference between clearly different benchmarks", () => { - // Generate 30+ opsSecPerRun with clearly different means - const baselineOpsSecPerRun = Array.from( + // Generate 30+ samples with clearly different means + const baselineSampleData = Array.from( { length: 30 }, (_, i) => 100 + (i % 5) - 2, ); - const fastOpsSecPerRun = Array.from( + const fastSampleData = Array.from( { length: 30 }, (_, i) => 200 + (i % 5) - 2, ); @@ -330,12 +328,12 @@ describe("T-Test Integration with analyze", () => { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: baselineOpsSecPerRun, + histogram: { sampleData: baselineSampleData }, }, { name: "fast", opsSec: 200, - opsSecPerRun: fastOpsSecPerRun, + histogram: { sampleData: fastSampleData }, }, ]; @@ -348,12 +346,12 @@ describe("T-Test Integration with analyze", () => { it("should not mark as significant when differences are within noise", () => { // Same benchmark run twice - should have similar results with high variance overlap - // 
Generate 30+ opsSecPerRun with overlapping distributions - const baselineOpsSecPerRun = Array.from( + // Generate 30+ samples with overlapping distributions + const baselineSampleData = Array.from( { length: 30 }, (_, i) => 100 + ((i % 5) - 2) * 2, ); - const similarOpsSecPerRun = Array.from( + const similarSampleData = Array.from( { length: 30 }, (_, i) => 101 + ((i % 5) - 2) * 2, ); @@ -363,12 +361,12 @@ describe("T-Test Integration with analyze", () => { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: baselineOpsSecPerRun, + histogram: { sampleData: baselineSampleData }, }, { name: "similar", opsSec: 101, // Very close to baseline - opsSecPerRun: similarOpsSecPerRun, + histogram: { sampleData: similarSampleData }, }, ]; @@ -384,18 +382,18 @@ describe("Statistical significance requires repeatSuite >= 30", () => { const { analyze } = require("../lib/utils/analyze"); it("should only compute significance when repeatSuite provides 30+ samples", () => { - // With 30+ opsSecPerRun, significance should be computed + // With 30+ samples, significance should be computed const results = [ { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: Array.from({ length: 30 }, () => 100), + histogram: { sampleData: Array.from({ length: 30 }, () => 100) }, }, { name: "test", opsSec: 200, - opsSecPerRun: Array.from({ length: 30 }, () => 200), + histogram: { sampleData: Array.from({ length: 30 }, () => 200) }, }, ]; @@ -406,24 +404,24 @@ describe("Statistical significance requires repeatSuite >= 30", () => { }); it("should not compute significance when repeatSuite < 30", () => { - // With fewer than 30 opsSecPerRun, significance should not be computed + // With fewer than 30 samples, significance should not be computed const results = [ { name: "baseline", opsSec: 100, baseline: true, - opsSecPerRun: Array.from({ length: 10 }, () => 100), + histogram: { sampleData: Array.from({ length: 10 }, () => 100) }, }, { name: "test", opsSec: 200, - opsSecPerRun: 
Array.from({ length: 10 }, () => 200), + histogram: { sampleData: Array.from({ length: 10 }, () => 200) }, }, ]; const analyzed = analyze(results, true, { ttest: true }); const testResult = analyzed.find((r) => r.name === "test"); - assert.strictEqual(testResult.significanceTest, undefined); + assert.deepEqual(testResult.significanceTest, { significant: false}); }); });