From d3f4f7b42b48f379a603a876a9c0f23f2f207779 Mon Sep 17 00:00:00 2001
From: Jason Marshall <jdmarshall@users.noreply.github.com>
Date: Sun, 4 Jan 2026 18:18:46 -0800
Subject: [PATCH 1/2] feat: use histogram samples for t-test analysis

Justification:

Each sample in the histogram represents a durationPerOp sample,
calculated by dividing the cumulative time of a certain number of
iterations of the function under test by that number of iterations.
Each sample is therefore the average execution time of those
iterations. opsSec and opsSecPerRun are then an average of the
samples, which are themselves averages.

Therefore, using opsSecPerRun as the t-test input inaccurately applies
the calculation to an average of averages, when it is meant to be
applied to a set of averages totalling a minimum of 30 samples, with 40
preferable.

In other words, it is each histogram entry, not each opsSecPerRun
value, that represents a valid t-test sample.
---
 .DS_Store                                   | Bin 0 -> 6148 bytes
 README.md                                   |   7 +--
 examples/.DS_Store                          | Bin 0 -> 6148 bytes
 examples/statistical-significance/README.md |   3 +-
 examples/statistical-significance/node.js   |   1 +
 lib/index.js                                |  10 ++--
 lib/utils/analyze.js                        |   5 +-
 test/ttest.js                               |  54 ++++++++++----------
 8 files changed, 44 insertions(+), 36 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 examples/.DS_Store

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..34c19f72621762b66da6b77134538776a4fc41fc
GIT binary patch
literal 6148
zcmeHKJ8r`;3?*9+2D*66s4Mgag5jQ^7f8~ilM5Fx3Uu_8bLD7#{0t4;Z0Qm_0@RZz
zJ_&k*X^M#UUx#OrPDEO`p`2`3o9&yg>>(oxgyW1)8RaPR`(b`tW&a&8ZVS%0?BwG&
ze?Lv5ZPKU!6`%rCfC^B7n<|jywSBtjsXUAdP=R|;z`hR!ZdenSK>u`L@D>2rAnb;@
z_Y%Nj0bosB0ug~}P=P_!Y%w(Gh?mT(iA!M6MYH+PyjinDQNJDM7f%<hfgGs-6}VL3
zF_xXx{}=d`{{NE16&0WYccp+f!|TxFNm*N0kF#1^;9I!moZ)7eI|YN6W1yE~EUX-F
dJSp;u&9Pq-mq4c@?sOo3222+k75HxjegKCC6_x-1

literal 0
HcmV?d00001

diff --git a/README.md b/README.md
index 0739aed..e8449a5 100644
--- a/README.md
+++ b/README.md
@@ -779,14 +779,15 @@ the two samples, which is common in benchmark scenarios.
 
 ### Enabling T-Test Mode
 
-Enable t-test mode with `ttest: true`. This automatically sets `repeatSuite=30` to collect enough
-independent samples for reliable statistical analysis (per the Central Limit Theorem):
+Enable t-test mode with `ttest: true`. The t-test requires at least 30 independent samples for reliable
+statistical analysis (per the Central Limit Theorem):
 
 ```js
 const { Suite } = require('bench-node');
 
 const suite = new Suite({
-  ttest: true,  // Enables t-test and auto-sets repeatSuite=30
+  ttest: true,  // Enables t-test, which requires at least 30 (preferably 40) samples for statistical significance
+  minSamples: 40 // total sample count is >= minSamples x repeatSuite
 });
 
 suite
diff --git a/examples/.DS_Store b/examples/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..9b65e0a17641ec7e2c5cebd5bfa3bea3654bbc83
GIT binary patch
literal 6148
zcmeHKyKVwO477m)iH4GLe}E!?utMYu_yOJs2|+>Vugh;UexL*$6f`NqShDBV=h@Xx
zaXyQPPA=;Yk)eq6;fC^Up>6iwyk(utC=iZ4_Hvdl#FZ>s)sGX#9XV^^%pd+i#>nqw
zCr89>?s~-Do9+6%TAoe^Bh#n=6`%rCfC^B7mlVkM*&n>bVlfq<0`H)JeIE+kuqHNv
z{^>yQ5dhdj+6~V>O8|={fHkoRL<Xio1qM}f#L%E4U$U+yHi1DG&EZ4yWX%ah{dDYK
zyj-*fGExC5aIZi&mP70RpYSL1|2>H-DnJGPl>*wE&ZZMwDSK<{a@K1Ld=9@EYP}r6
lTQSgEF&5T}N5ATdJ>z{%YyzE*ywidF5inh7RNyZZxB}$B8fE|h

literal 0
HcmV?d00001

diff --git a/examples/statistical-significance/README.md b/examples/statistical-significance/README.md
index 17819fc..1ec83c0 100644
--- a/examples/statistical-significance/README.md
+++ b/examples/statistical-significance/README.md
@@ -20,7 +20,8 @@ Enable t-test mode with `ttest: true`:
 const { Suite } = require('bench-node');
 
 const suite = new Suite({
-  ttest: true,  // Automatically sets repeatSuite=30
+  ttest: true,
+  minSamples: 30, // minSamples x repeatSuite must be >= 30
 });
 
 suite.add('baseline', { baseline: true }, () => {
diff --git a/examples/statistical-significance/node.js b/examples/statistical-significance/node.js
index 4e28636..6902610 100644
--- a/examples/statistical-significance/node.js
+++ b/examples/statistical-significance/node.js
@@ -16,6 +16,7 @@ const { Suite } = require('../../lib');
-// Enable t-test mode - this automatically sets repeatSuite=30 for all benchmarks
+// Enable t-test mode - requires minSamples x repeatSuite >= 30 samples per benchmark
 const suite = new Suite({
   ttest: true,
+  minSamples: 30
 });
 
 // Baseline: Simple array sum using for loop
diff --git a/lib/index.js b/lib/index.js
index 9bd1455..78aaecf 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -103,7 +103,7 @@ const defaultBenchOptions = {
 };
 
-// Minimum repeatSuite runs required for reliable t-test results
+// Minimum number of samples (minSamples x repeatSuite) required for reliable t-test results
-const MIN_REPEAT_FOR_TTEST = 30;
+const MIN_SAMPLES_FOR_TTEST = 30;
 
 function throwIfNoNativesSyntax() {
 	if (process.execArgv.includes("--allow-natives-syntax") === false) {
@@ -180,8 +180,6 @@ class Suite {
 		if (options.repeatSuite !== undefined) {
 			validateNumber(options.repeatSuite, "options.repeatSuite", 1);
 			repeatSuite = options.repeatSuite;
-		} else if (this.#ttest) {
-			repeatSuite = MIN_REPEAT_FOR_TTEST;
 		}
 		this.#repeatSuite = repeatSuite;
 
@@ -231,6 +229,12 @@ class Suite {
 			throw new Error("There is already a baseline benchmark");
 		}
 
+		if (this.#ttest && (options.minSamples * options.repeatSuite < MIN_SAMPLES_FOR_TTEST)) {
+			process.emitWarning(
+				`The benchmark "${name}" may not have enough samples to run t-test analysis. Please set minSamples x repeatSuite >= ${MIN_SAMPLES_FOR_TTEST}`,
+			);
+		}
+
 		const benchmark = new Benchmark(
 			name,
 			fn,
diff --git a/lib/utils/analyze.js b/lib/utils/analyze.js
index a6b8b27..f1baa96 100644
--- a/lib/utils/analyze.js
+++ b/lib/utils/analyze.js
@@ -44,8 +44,9 @@ function analyze(results, sorted = true, options = {}) {
 			}
 
 			if (ttest) {
-				const resultSamples = result.opsSecPerRun;
-				const baselineSamplesForTest = baselineResult.opsSecPerRun;
+				const resultSamples = result.histogram?.sampleData ?? [];
+				const baselineSamplesForTest =
+					baselineResult.histogram?.sampleData ?? [];
 
 				if (
 					baselineSamplesForTest?.length >= 30 &&
diff --git a/test/ttest.js b/test/ttest.js
index 84f1ed0..ebcdcca 100644
--- a/test/ttest.js
+++ b/test/ttest.js
@@ -238,13 +238,13 @@ describe("T-Test Integration with analyze", () => {
 		assert.strictEqual(testResult.significanceTest, undefined);
 	});
 
-	it("should include significanceTest when ttest is true and opsSecPerRun >= 30", () => {
-		// Generate 30+ opsSecPerRun samples (from repeatSuite)
-		const baselineOpsSecPerRun = Array.from(
+	it("should include significanceTest when ttest is true and samples >= 30", () => {
+		// Generate 30+ samples
+		const baselineSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 100 + (i % 3) - 1,
 		);
-		const testOpsSecPerRun = Array.from(
+		const testSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 200 + (i % 3) - 1,
 		);
@@ -254,12 +254,12 @@ describe("T-Test Integration with analyze", () => {
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: baselineOpsSecPerRun,
+				histogram: { sampleData: baselineSampleData },
 			},
 			{
 				name: "test",
 				opsSec: 200,
-				opsSecPerRun: testOpsSecPerRun,
+				histogram: { sampleData: testSampleData },
 			},
 		];
 
@@ -272,7 +272,7 @@ describe("T-Test Integration with analyze", () => {
 		assert.ok(typeof testResult.significanceTest.confidence === "string");
 	});
 
-	it("should not include significanceTest without opsSecPerRun", () => {
+	it("should not include significanceTest without sufficient samples", () => {
 		const results = [
 			{
 				name: "baseline",
@@ -288,22 +288,22 @@ describe("T-Test Integration with analyze", () => {
 		const analyzed = analyze(results, true, { ttest: true });
 		const testResult = analyzed.find((r) => r.name === "test");
 
-		// Should not throw, and significanceTest should not be set (no opsSecPerRun)
+		// Should not throw, and significanceTest should not be set (no samples)
 		assert.strictEqual(testResult.significanceTest, undefined);
 	});
 
-	it("should not include significanceTest when opsSecPerRun < 30", () => {
+	it("should not include significanceTest when samples < 30", () => {
 		const results = [
 			{
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: Array.from({ length: 10 }, () => 100),
+				histogram: { sampleData: Array.from({ length: 10 }, () => 100) },
 			},
 			{
 				name: "test",
 				opsSec: 200,
-				opsSecPerRun: Array.from({ length: 10 }, () => 200),
+				histogram: { sampleData: Array.from({ length: 10 }, () => 200) },
 			},
 		];
 
@@ -315,12 +315,12 @@ describe("T-Test Integration with analyze", () => {
 	});
 
 	it("should detect significant difference between clearly different benchmarks", () => {
-		// Generate 30+ opsSecPerRun with clearly different means
-		const baselineOpsSecPerRun = Array.from(
+		// Generate 30+ samples with clearly different means
+		const baselineSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 100 + (i % 5) - 2,
 		);
-		const fastOpsSecPerRun = Array.from(
+		const fastSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 200 + (i % 5) - 2,
 		);
@@ -330,12 +330,12 @@ describe("T-Test Integration with analyze", () => {
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: baselineOpsSecPerRun,
+				histogram: { sampleData: baselineSampleData },
 			},
 			{
 				name: "fast",
 				opsSec: 200,
-				opsSecPerRun: fastOpsSecPerRun,
+				histogram: { sampleData: fastSampleData },
 			},
 		];
 
@@ -348,12 +348,12 @@ describe("T-Test Integration with analyze", () => {
 
 	it("should not mark as significant when differences are within noise", () => {
 		// Same benchmark run twice - should have similar results with high variance overlap
-		// Generate 30+ opsSecPerRun with overlapping distributions
-		const baselineOpsSecPerRun = Array.from(
+		// Generate 30+ samples with overlapping distributions
+		const baselineSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 100 + ((i % 5) - 2) * 2,
 		);
-		const similarOpsSecPerRun = Array.from(
+		const similarSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 101 + ((i % 5) - 2) * 2,
 		);
@@ -363,12 +363,12 @@ describe("T-Test Integration with analyze", () => {
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: baselineOpsSecPerRun,
+				histogram: { sampleData: baselineSampleData },
 			},
 			{
 				name: "similar",
 				opsSec: 101, // Very close to baseline
-				opsSecPerRun: similarOpsSecPerRun,
+				histogram: { sampleData: similarSampleData },
 			},
 		];
 
@@ -384,18 +384,18 @@ describe("Statistical significance requires repeatSuite >= 30", () => {
 	const { analyze } = require("../lib/utils/analyze");
 
 	it("should only compute significance when repeatSuite provides 30+ samples", () => {
-		// With 30+ opsSecPerRun, significance should be computed
+		// With 30+ samples, significance should be computed
 		const results = [
 			{
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: Array.from({ length: 30 }, () => 100),
+				histogram: { sampleData: Array.from({ length: 30 }, () => 100) },
 			},
 			{
 				name: "test",
 				opsSec: 200,
-				opsSecPerRun: Array.from({ length: 30 }, () => 200),
+				histogram: { sampleData: Array.from({ length: 30 }, () => 200) },
 			},
 		];
 
@@ -406,18 +406,18 @@ describe("Statistical significance requires repeatSuite >= 30", () => {
 	});
 
 	it("should not compute significance when repeatSuite < 30", () => {
-		// With fewer than 30 opsSecPerRun, significance should not be computed
+		// With fewer than 30 samples, significance should not be computed
 		const results = [
 			{
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: Array.from({ length: 10 }, () => 100),
+				histogram: { sampleData: Array.from({ length: 10 }, () => 100) },
 			},
 			{
 				name: "test",
 				opsSec: 200,
-				opsSecPerRun: Array.from({ length: 10 }, () => 200),
+				histogram: { sampleData: Array.from({ length: 10 }, () => 200) },
 			},
 		];
 

From a0b4160dc898c7e1a5a3ae3fa720582ce3c4d14c Mon Sep 17 00:00:00 2001
From: Jason Marshall <jdmarshall@users.noreply.github.com>
Date: Sun, 4 Jan 2026 20:12:36 -0800
Subject: [PATCH 2/2] feat: report significant: false when t-test == true but
 sample size is too small.

This will help me sort out inconclusive tests without missing misconfigured
ones.

This is necessitated by the changes in the previous commit that allow
for failure instead of forcing success.
---
 lib/utils/analyze.js |  4 ++++
 test/ttest.js        | 10 ++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/lib/utils/analyze.js b/lib/utils/analyze.js
index f1baa96..7a5cb2a 100644
--- a/lib/utils/analyze.js
+++ b/lib/utils/analyze.js
@@ -63,6 +63,10 @@ function analyze(results, sorted = true, options = {}) {
 						confidence: ttestResult.confidence,
 						stars: ttestResult.stars,
 					};
+				} else {
+					result.significanceTest = {
+						significant: false,
+					};
 				}
 			}
 		}
diff --git a/test/ttest.js b/test/ttest.js
index ebcdcca..835e0a3 100644
--- a/test/ttest.js
+++ b/test/ttest.js
@@ -272,7 +272,7 @@ describe("T-Test Integration with analyze", () => {
 		assert.ok(typeof testResult.significanceTest.confidence === "string");
 	});
 
-	it("should not include significanceTest without sufficient samples", () => {
+	it("should mark significanceTest as failed without samples", () => {
 		const results = [
 			{
 				name: "baseline",
@@ -288,8 +288,7 @@ describe("T-Test Integration with analyze", () => {
 		const analyzed = analyze(results, true, { ttest: true });
 		const testResult = analyzed.find((r) => r.name === "test");
 
-		// Should not throw, and significanceTest should not be set (no samples)
-		assert.strictEqual(testResult.significanceTest, undefined);
+		assert.deepEqual(testResult.significanceTest, { significant: false});
 	});
 
 	it("should not include significanceTest when samples < 30", () => {
@@ -310,8 +309,7 @@ describe("T-Test Integration with analyze", () => {
 		const analyzed = analyze(results, true, { ttest: true });
 		const testResult = analyzed.find((r) => r.name === "test");
 
-		// Should not throw, and significanceTest should not be set (not enough samples)
-		assert.strictEqual(testResult.significanceTest, undefined);
+		assert.deepEqual(testResult.significanceTest, { significant: false});
 	});
 
 	it("should detect significant difference between clearly different benchmarks", () => {
@@ -424,6 +422,6 @@ describe("Statistical significance requires repeatSuite >= 30", () => {
 		const analyzed = analyze(results, true, { ttest: true });
 		const testResult = analyzed.find((r) => r.name === "test");
 
-		assert.strictEqual(testResult.significanceTest, undefined);
+		assert.deepEqual(testResult.significanceTest, { significant: false});
 	});
 });