feat: use histogram samples for t-test analysis

jdmarshall · jdmarshall · commit 069cb5531bad · 2026-01-04T19:04:10.000-08:00
Justification:

Each sample in the histogram represents a durationPerOp
sample, calculated by dividing the cumulative time of a certain
number of iterations of the function under test by that number of
iterations. That is, each sample is the average execution time of
those executions. opsSec and opsSecPerRun are then an average of the
samples, which are themselves averages.

Therefore, using opsSecPerRun as the t-test input inaccurately applies
the calculation to an average of averages, when it is meant to be applied
to a set of averages totalling a minimum of 30 samples, with 40
preferable.

In other words, it's a histogram entry that represents a valid t-test
sample.
diff --git a/.DS_Store b/.DS_Store
diff --git a/README.md b/README.md
@@ -779,14 +779,15 @@ the two samples, which is common in benchmark scenarios.
 
 ### Enabling T-Test Mode
 
-Enable t-test mode with `ttest: true`. This automatically sets `repeatSuite=30` to collect enough
-independent samples for reliable statistical analysis (per the Central Limit Theorem):
+Enable t-test mode with `ttest: true`. The t-test requires at least 30 independent samples for reliable
+statistical analysis (per the Central Limit Theorem):
 
 ```js
 const { Suite } = require('bench-node');
 
 const suite = new Suite({
-  ttest: true,  // Enables t-test and auto-sets repeatSuite=30
+  ttest: true,  // Enables t-test, which requires 30, preferably 40 samples for statistical significance
+  minSamples: 40 // sample count is >= minSamples x repeatSuite 
 });
 
 suite
diff --git a/examples/.DS_Store b/examples/.DS_Store
diff --git a/examples/statistical-significance/README.md b/examples/statistical-significance/README.md
@@ -20,7 +20,8 @@ Enable t-test mode with `ttest: true`:
 const { Suite } = require('bench-node');
 
 const suite = new Suite({
-  ttest: true,  // Automatically sets repeatSuite=30
+  ttest: true,
+  minSamples: 30, // minSamples x repeatSuite must be >= 30
 });
 
 suite.add('baseline', { baseline: true }, () => {
diff --git a/examples/statistical-significance/node.js b/examples/statistical-significance/node.js
@@ -16,6 +16,7 @@ const { Suite } = require('../../lib');
 // Enable t-test mode - this automatically sets repeatSuite=30 for all benchmarks
 const suite = new Suite({
   ttest: true,
+  minSamples: 30
 });
 
 // Baseline: Simple array sum using for loop
diff --git a/lib/index.js b/lib/index.js
@@ -103,7 +103,7 @@ const defaultBenchOptions = {
 };
 
 // Minimum repeatSuite runs required for reliable t-test results
-const MIN_REPEAT_FOR_TTEST = 30;
+const MIN_SAMPLES_FOR_TTEST = 30;
 
 function throwIfNoNativesSyntax() {
 	if (process.execArgv.includes("--allow-natives-syntax") === false) {
@@ -180,8 +180,6 @@ class Suite {
 		if (options.repeatSuite !== undefined) {
 			validateNumber(options.repeatSuite, "options.repeatSuite", 1);
 			repeatSuite = options.repeatSuite;
-		} else if (this.#ttest) {
-			repeatSuite = MIN_REPEAT_FOR_TTEST;
 		}
 		this.#repeatSuite = repeatSuite;
 
@@ -231,6 +229,12 @@ class Suite {
 			throw new Error("There is already a baseline benchmark");
 		}
 
+		if (options.minSamples * options.repeatSuite < MIN_SAMPLES_FOR_TTEST) {
+			process.emitWarning(
+				`The benchmark "${name}" may not have enough samples to run t-test analysis. Please set minSamples x repeatSuite >= ${MIN_SAMPLES_FOR_TTEST}`,
+			);
+		}
+
 		const benchmark = new Benchmark(
 			name,
 			fn,
diff --git a/lib/utils/analyze.js b/lib/utils/analyze.js
@@ -44,8 +44,9 @@ function analyze(results, sorted = true, options = {}) {
 			}
 
 			if (ttest) {
-				const resultSamples = result.opsSecPerRun;
-				const baselineSamplesForTest = baselineResult.opsSecPerRun;
+				const resultSamples = result.histogram?.sampleData ?? [];
+				const baselineSamplesForTest =
+					baselineResult.histogram?.sampleData ?? [];
 
 				if (
 					baselineSamplesForTest?.length >= 30 &&
diff --git a/test/ttest.js b/test/ttest.js
@@ -238,13 +238,13 @@ describe("T-Test Integration with analyze", () => {
 		assert.strictEqual(testResult.significanceTest, undefined);
 	});
 
-	it("should include significanceTest when ttest is true and opsSecPerRun >= 30", () => {
-		// Generate 30+ opsSecPerRun samples (from repeatSuite)
-		const baselineOpsSecPerRun = Array.from(
+	it("should include significanceTest when ttest is true and samples >= 30", () => {
+		// Generate 30+ samples
+		const baselineSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 100 + (i % 3) - 1,
 		);
-		const testOpsSecPerRun = Array.from(
+		const testSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 200 + (i % 3) - 1,
 		);
@@ -254,12 +254,12 @@ describe("T-Test Integration with analyze", () => {
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: baselineOpsSecPerRun,
+				histogram: { sampleData: baselineSampleData },
 			},
 			{
 				name: "test",
 				opsSec: 200,
-				opsSecPerRun: testOpsSecPerRun,
+				histogram: { sampleData: testSampleData },
 			},
 		];
 
@@ -272,7 +272,7 @@ describe("T-Test Integration with analyze", () => {
 		assert.ok(typeof testResult.significanceTest.confidence === "string");
 	});
 
-	it("should not include significanceTest without opsSecPerRun", () => {
+	it("should not include significanceTest without sufficient samples", () => {
 		const results = [
 			{
 				name: "baseline",
@@ -288,22 +288,22 @@ describe("T-Test Integration with analyze", () => {
 		const analyzed = analyze(results, true, { ttest: true });
 		const testResult = analyzed.find((r) => r.name === "test");
 
-		// Should not throw, and significanceTest should not be set (no opsSecPerRun)
+		// Should not throw, and significanceTest should not be set (no samples)
 		assert.strictEqual(testResult.significanceTest, undefined);
 	});
 
-	it("should not include significanceTest when opsSecPerRun < 30", () => {
+	it("should not include significanceTest when samples < 30", () => {
 		const results = [
 			{
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: Array.from({ length: 10 }, () => 100),
+				histogram: { samples: Array.from({ length: 10 }, () => 100) },
 			},
 			{
 				name: "test",
 				opsSec: 200,
-				opsSecPerRun: Array.from({ length: 10 }, () => 200),
+				histogram: { samples: Array.from({ length: 10 }, () => 200) },
 			},
 		];
 
@@ -315,12 +315,12 @@ describe("T-Test Integration with analyze", () => {
 	});
 
 	it("should detect significant difference between clearly different benchmarks", () => {
-		// Generate 30+ opsSecPerRun with clearly different means
-		const baselineOpsSecPerRun = Array.from(
+		// Generate 30+ samples with clearly different means
+		const baselineSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 100 + (i % 5) - 2,
 		);
-		const fastOpsSecPerRun = Array.from(
+		const fastSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 200 + (i % 5) - 2,
 		);
@@ -330,12 +330,12 @@ describe("T-Test Integration with analyze", () => {
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: baselineOpsSecPerRun,
+				histogram: { sampleData: baselineSampleData },
 			},
 			{
 				name: "fast",
 				opsSec: 200,
-				opsSecPerRun: fastOpsSecPerRun,
+				histogram: { sampleData: fastSampleData },
 			},
 		];
 
@@ -348,12 +348,12 @@ describe("T-Test Integration with analyze", () => {
 
 	it("should not mark as significant when differences are within noise", () => {
 		// Same benchmark run twice - should have similar results with high variance overlap
-		// Generate 30+ opsSecPerRun with overlapping distributions
-		const baselineOpsSecPerRun = Array.from(
+		// Generate 30+ samples with overlapping distributions
+		const baselineSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 100 + ((i % 5) - 2) * 2,
 		);
-		const similarOpsSecPerRun = Array.from(
+		const similarSampleData = Array.from(
 			{ length: 30 },
 			(_, i) => 101 + ((i % 5) - 2) * 2,
 		);
@@ -363,12 +363,12 @@ describe("T-Test Integration with analyze", () => {
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: baselineOpsSecPerRun,
+				histogram: { sampleData: baselineSampleData },
 			},
 			{
 				name: "similar",
 				opsSec: 101, // Very close to baseline
-				opsSecPerRun: similarOpsSecPerRun,
+				histogram: { sampleData: similarSampleData },
 			},
 		];
 
@@ -384,18 +384,18 @@ describe("Statistical significance requires repeatSuite >= 30", () => {
 	const { analyze } = require("../lib/utils/analyze");
 
 	it("should only compute significance when repeatSuite provides 30+ samples", () => {
-		// With 30+ opsSecPerRun, significance should be computed
+		// With 30+ samples, significance should be computed
 		const results = [
 			{
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: Array.from({ length: 30 }, () => 100),
+				histogram: { sampleData: Array.from({ length: 30 }, () => 100) },
 			},
 			{
 				name: "test",
 				opsSec: 200,
-				opsSecPerRun: Array.from({ length: 30 }, () => 200),
+				histogram: { sampleData: Array.from({ length: 30 }, () => 200) },
 			},
 		];
 
@@ -406,18 +406,18 @@ describe("Statistical significance requires repeatSuite >= 30", () => {
 	});
 
 	it("should not compute significance when repeatSuite < 30", () => {
-		// With fewer than 30 opsSecPerRun, significance should not be computed
+		// With fewer than 30 samples, significance should not be computed
 		const results = [
 			{
 				name: "baseline",
 				opsSec: 100,
 				baseline: true,
-				opsSecPerRun: Array.from({ length: 10 }, () => 100),
+				histogram: { sampleData: Array.from({ length: 10 }, () => 100) },
 			},
 			{
 				name: "test",
 				opsSec: 200,
-				opsSecPerRun: Array.from({ length: 10 }, () => 200),
+				histogram: { sampleData: Array.from({ length: 10 }, () => 200) },
 			},
 		];