Skip to content

Commit a56e21a

Browse files
Authored by: paulz, carl, and Copilot
add proportions_ztest to test examples (#62)
Add [proportions_ztest](https://www.statsmodels.org/dev/generated/statsmodels.stats.proportion.proportions_ztest.html) and compare it with current StatisticalAnalysis ### Testing framework updates: * [`examples/team_recommender/tests/test_helpers.py`](diffhunk://#diff-976a531e145fe670a98681dd9a62a49f27706f59e8287f4fb60e6d6898ed2b3eL95-R103): Added new test cases and modified existing ones to improve the accuracy of success rate measurements and sample size calculations. [[1]](diffhunk://#diff-976a531e145fe670a98681dd9a62a49f27706f59e8287f4fb60e6d6898ed2b3eL95-R103) [[2]](diffhunk://#diff-976a531e145fe670a98681dd9a62a49f27706f59e8287f4fb60e6d6898ed2b3eL110-R125) [[3]](diffhunk://#diff-976a531e145fe670a98681dd9a62a49f27706f59e8287f4fb60e6d6898ed2b3eL148-R167) [[4]](diffhunk://#diff-976a531e145fe670a98681dd9a62a49f27706f59e8287f4fb60e6d6898ed2b3eR176-R195) * [`examples/team_recommender/tests/test_proportions_ztest.py`](diffhunk://#diff-ef60d571344b1948a6f2edcc6cb72c9875987bfe7d2d568e74e118cf37694e9cR1-R108): Added new tests for the `proportions_ztest` function to check for statistical significance and improvements in success rates. ### Contributors update: * [`CONTRIBUTORS.md`](diffhunk://#diff-c0f86987c556ec52d97b9acf0f35bb2ad0521f65c3113e1b15362ca76502eed2L4-R4): Added Carl Jackson to the list of contributors. ### Dependency update: * [`pyproject.toml`](diffhunk://#diff-50c86b7ed8ac2cf95bd48334961bf0530cdc77b5a56f852c5c61b89d735fd711R28): Added `statsmodels` to the list of test dependencies. --------- Signed-off-by: Paul Zabelin <paulzabelin@artium.ai> Co-authored-by: Carl Jackson <carl@realvr.ai> Co-authored-by: Paul Zabelin <paulz@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent e4ca299 commit a56e21a

File tree

8 files changed

+289
-15
lines changed

8 files changed

+289
-15
lines changed

CONTRIBUTORS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
## Contributors
22
- Ian McFarland [@imf](https://github.com/imf)
33
- Dave Schinkel [@dschinkel](https://github.com/dschinkel)
4-
4+
- Carl Jackson [@carl](https://github.com/carl)

examples/team_recommender/tests/test_helpers.py

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def test_assert_success_rate_pass(row):
5050
[
5151
"New Success rate 0.900 with 90% confidence exceeds expected: 0.7",
5252
"Broken Record:",
53-
"Expecting: 0.744 <= 0.700 <= 1.056",
53+
"Expecting: 0.744 <= 0.700 <= 1.000",
5454
"Got: expected=0.7 <= analysis.lower_interval=0.74",
5555
],
5656
),
@@ -61,7 +61,7 @@ def test_assert_success_rate_pass(row):
6161
[
6262
"New Success rate 0.999 with 90% confidence exceeds expected: 0.98",
6363
"Broken Record:",
64-
"Expecting: 0.997 <= 0.980 <= 1.001",
64+
"Expecting: 0.997 <= 0.980 <= 1.000",
6565
"Got: expected=0.98 <= analysis.lower_interval=0.997",
6666
],
6767
),
@@ -92,8 +92,15 @@ def test_beyond_expected_success_rate(assert_success_rate, row):
9292
(0.8, 14, 100, None),
9393
(0.97, 1, 8, None),
9494
(0.97, 0, 1, "after measuring 2x 100 runs and getting 3 failures"),
95-
(0.975, 0, 100, "97.5% success rate is within 100% success rate"),
96-
(0.9737, 0, 100, "97.37% success rate is within 100% success rate"),
95+
(
96+
0.97,
97+
1,
98+
133,
99+
"At 133 we can say that with 90% confidence 1 failure is within 97% success rate",
100+
),
101+
(0.98, 0, 100, "97.5% success rate is within 100% success rate"),
102+
(0.97999999999999999, 0, 100, "97.37% success rate is within 100% success rate"),
103+
(0.5, 1, 2, None),
97104
],
98105
)
99106
def test_is_within_expected(success_rate, failure_count, sample_size, message):
@@ -107,9 +114,15 @@ def test_is_within_expected(success_rate, failure_count, sample_size, message):
107114
"failure_count, sample_size, expected_rate, message",
108115
[
109116
(3, 5, 0.8, "40% success rate is below expected 80% success rate"),
110-
(1, 2, 0.97, "50% success rate is below expected 97% success rate"),
111117
(0, 100, 0.97, "100% success rate is not within 97% success rate"),
118+
(1, 50000, 0.9997, "99.99% success rate is below expected 97% success rate"),
112119
(0, 100, 0.9736, "97.36% success rate is not within 100% success rate"),
120+
(
121+
1,
122+
134,
123+
0.97,
124+
"At 134 we can say that with 90% confidence 1 failure is within 97% success rate",
125+
),
113126
],
114127
)
115128
def test_not_is_within_expected(failure_count, sample_size, expected_rate, message):
@@ -145,7 +158,13 @@ def test_next_success_rate():
145158

146159
@pytest.mark.parametrize(
147160
"success_rate, largest_sample_size",
148-
[(0.7, 12), (next_success_rate(12), 55), (next_success_rate(55), 248)],
161+
[
162+
(0.7, 10),
163+
(next_success_rate(10), 44),
164+
(next_success_rate(45), 184),
165+
(next_success_rate(185), 744),
166+
(next_success_rate(745), 2984),
167+
],
149168
)
150169
def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample_size):
151170
assert is_within_expected(success_rate, 1, largest_sample_size), "should be within expected"
@@ -154,6 +173,43 @@ def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample
154173
)
155174

156175

176+
def test_next_sample_size():
177+
## Next sample size should be larger than the current one by at least 4 times
178+
assert next_sample_size(10) == 45, (
179+
"passing 10 out of 10 should require 45 successful runs to be statistically significant"
180+
)
181+
assert next_sample_size(45) == 185, (
182+
"passing 45 out of 45 should require 185 successful runs to be statistically significant"
183+
)
184+
assert next_sample_size(185) == 745
185+
assert next_sample_size(745) == 2985
186+
assert next_sample_size(29) == 121
187+
assert next_sample_size(29) == next_sample_size_via_loop(29), "calculated via loop should match"
188+
189+
assert 28 / 29 == pytest.approx(0.96, rel=0.01)
190+
before = analyse_measure_from_test_sample(28, 29)
191+
assert before.proportion == pytest.approx(0.96, rel=0.01)
192+
assert before.confidence_interval_prop == pytest.approx((0.91, 1.00), 0.01)
193+
194+
analysis = analyse_measure_from_test_sample(120, 121)
195+
assert analysis.proportion == pytest.approx(0.99, rel=0.01)
196+
assert analysis.confidence_interval_prop == pytest.approx((0.98, 1.00), 0.01)
197+
198+
199+
def next_sample_size(current):
200+
## How many successful runs are needed to be statistically significant improvement
201+
# compared to the current sample size with 100% success rate
202+
return 4 * current + 5
203+
204+
205+
def next_sample_size_via_loop(sample_size: int) -> int:
206+
goal_success_rate = next_success_rate(sample_size)
207+
for i in range(sample_size, 5 * sample_size):
208+
if not is_within_expected(goal_success_rate, 1, i):
209+
return i
210+
return 0
211+
212+
157213
def test_success_rate():
158214
tiny_set_analysis = analyse_measure_from_test_sample(1, 2)
159215
assert tiny_set_analysis.proportion == 0.5
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import pytest
2+
from helpers import is_within_expected
3+
from statsmodels.stats.proportion import proportions_ztest
4+
from test_helpers import next_success_rate
5+
6+
7+
def test_proportions_ztest_improvement():
8+
successes = [70, 90]
9+
n_observations = [100, 100]
10+
11+
stat, p_value = proportions_ztest(successes, n_observations)
12+
assert p_value == pytest.approx(0.00040695, rel=0.001)
13+
assert p_value < 0.05, "statistically significant result"
14+
assert stat == pytest.approx(-3.5355, rel=0.001)
15+
16+
17+
def test_proportions_ztest_exact_match():
18+
stat, p_value = proportions_ztest(7, 10, 0.7)
19+
assert p_value == 1.0, "statistically insignificant result"
20+
assert stat == 0
21+
22+
23+
def test_proportions_ztest_significantly_better():
24+
stat, p_value = proportions_ztest(9, 10, 0.7)
25+
assert p_value < 0.05, "statistically significant improvement"
26+
assert proportions_ztest(9, 10, 0.7, alternative="larger")[1] < 0.05, (
27+
"statistically proportion is larger than expected value"
28+
)
29+
assert proportions_ztest(9, 10, 0.7, alternative="two-sided")[1] < 0.05, (
30+
"statistically proportion is larger or smaller than expected value"
31+
)
32+
33+
34+
def test_proportions_ztest_not_statistically_significantly():
35+
for count in range(4, 8):
36+
stat, p_value = proportions_ztest(count, 10, 0.7)
37+
assert p_value > 0.05, "NO statistically significant deviation"
38+
39+
40+
def test_proportions_ztest_significantly_worse():
41+
stat, p_value = proportions_ztest(3, 10, 0.7)
42+
assert p_value < 0.05, "statistically significant result"
43+
assert proportions_ztest(3, 10, 0.7, alternative="smaller")[1] < 0.05, (
44+
"statistically proportion is smaller than expected value"
45+
)
46+
assert proportions_ztest(3, 10, 0.7, alternative="two-sided")[1] < 0.05, (
47+
"statistically proportion is smaller than expected value"
48+
)
49+
50+
51+
def calculate_p_value(success, failure, sample_size) -> float:
52+
return calculate_ztest(success, failure, sample_size)[1]
53+
54+
55+
def calculate_ztest(success, failure, sample_size) -> tuple[float, float]:
56+
measurements = [int(success * sample_size), sample_size - failure]
57+
samples = [sample_size, sample_size]
58+
zstat, p_value = proportions_ztest(measurements, samples)
59+
return zstat, p_value
60+
61+
62+
def is_statistically_significant(success, failure, sample_size):
63+
return calculate_p_value(success, failure, sample_size) < 0.05
64+
65+
66+
def test_not_is_statistically_significant():
67+
assert not is_statistically_significant(0.7, 3, 10), "same proportion"
68+
assert not is_statistically_significant(0.9, 10, 100), "same proportion"
69+
assert not is_statistically_significant(0.7, 30, 100), "same proportion"
70+
assert not is_statistically_significant(0.7, 0, 10), "covers 100% success rate"
71+
72+
73+
def test_is_statistically_significant():
74+
assert is_statistically_significant(0.9, 0, 100), "0 out of 100 > 90% success rate"
75+
assert is_statistically_significant(0.7, 0, 11), "0 out of 11 > 70% success rate"
76+
assert is_statistically_significant(0.9, 0, 31), "0 out of 31 > 90% success rate"
77+
assert is_statistically_significant(0.909090, 0, 33), "0 out of 33 > 90.9% success rate"
78+
79+
80+
def test_is_statistically_significant_with_next_success_rate():
81+
sample_size = 10
82+
assert not is_statistically_significant(next_success_rate(sample_size), 0, sample_size)
83+
assert is_statistically_significant(next_success_rate(sample_size), 0, 34)
84+
assert is_statistically_significant(next_success_rate(35), 0, 109)
85+
86+
87+
def test_compare_is_within_expected_and_is_statistically_significant():
88+
assert is_within_expected(0.7, 3, 10), "not significant result for 3/10=70%"
89+
assert not is_statistically_significant(0.7, 3, 10), "not significant for 3/10=70%"
90+
91+
assert is_within_expected(0.7, 0, 3), "not significant result for 0 out of 3"
92+
assert not is_statistically_significant(0.7, 0, 3), "not significant result for 0 out of 3"
93+
94+
95+
def test_improvement_from_70_percent():
96+
assert is_within_expected(0.7, 0, 3), "no improvement detected at 3"
97+
assert not is_statistically_significant(0.7, 0, 10), "no improvement detected at 10"
98+
99+
assert not is_within_expected(0.7, 0, 4), "improvement detected at 4"
100+
assert is_statistically_significant(0.7, 0, 11), "improvement detected at 11"
101+
102+
103+
def test_improvement_from_97_percent():
104+
assert is_within_expected(0.97, 0, 66), "no improvement detected at 66"
105+
assert not is_statistically_significant(0.97, 0, 100), "no improvement detected at 100"
106+
107+
assert not is_within_expected(0.97, 0, 67), "significantly better at 67"
108+
assert is_statistically_significant(0.97, 0, 101), "significantly better at 101"

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ test = [
2525
"pytest-asyncio>=0.21.0,<0.22",
2626
"mypy>=1.8.0,<2",
2727
"pytest-snapshot>=0.9.0",
28+
"statsmodels>=0.14.4",
2829
]
2930
examples = ["openai>=1.63.2,<2", "python-dotenv>=1.0.1,<2"]
3031
dev = [

src/cat_ai/statistical_analysis.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ def analyse_measure_from_test_sample(measure: int, sample_size: int) -> Statisti
9090
me = z * se
9191

9292
# Calculate confidence interval bounds as proportions
93-
lower_bound_prop = p_hat - me
94-
upper_bound_prop = p_hat + me
93+
lower_bound_prop = max(0, p_hat - me)
94+
upper_bound_prop = min(1, p_hat + me)
9595

9696
# Convert proportion bounds to integer counts
9797
lower_bound_count: int = math.ceil(lower_bound_prop * sample_size)

tests/snapshots/test_statistical_analysis/test_failure_rate_bar_graph/failure_rate_results.csv

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
failure_count,sample_size,margin_of_error_count,confidence_lower,confidence_upper,proportion,standard_error,margin_of_error,confidence_proportion_lower,confidence_proportion_upper
2-
0,100,0,0,0,0.0,0.0,0.0,0.0,0.0
3-
1,100,1,0,2,0.01,0.0099498743710662,0.01636608694695973,-0.006366086946959731,0.02636608694695973
4-
2,100,2,0,4,0.02,0.014,0.023027950777320602,-0.0030279507773206017,0.043027950777320606
2+
0,100,0,0,0,0.0,0.0,0.0,0,0.0
3+
1,100,1,0,2,0.01,0.0099498743710662,0.01636608694695973,0,0.02636608694695973
4+
2,100,2,0,4,0.02,0.014,0.023027950777320602,0,0.043027950777320606
55
3,100,2,1,5,0.03,0.01705872210923198,0.02805910093252748,0.00194089906747252,0.058059100932527474
66
4,100,3,1,7,0.04,0.019595917942265423,0.0322324167007787,0.007767583299221302,0.0722324167007787
77
5,100,3,2,8,0.05,0.021794494717703367,0.03584875368398907,0.014151246316010932,0.08584875368398907
@@ -97,6 +97,6 @@ failure_count,sample_size,margin_of_error_count,confidence_lower,confidence_uppe
9797
95,100,3,92,98,0.95,0.021794494717703377,0.035848753683989085,0.9141512463160109,0.985848753683989
9898
96,100,3,93,99,0.96,0.019595917942265433,0.03223241670077871,0.9277675832992213,0.9922324167007787
9999
97,100,2,95,99,0.97,0.017058722109231986,0.02805910093252749,0.9419408990674725,0.9980591009325275
100-
98,100,2,96,100,0.98,0.014000000000000005,0.02302795077732061,0.9569720492226794,1.0030279507773205
101-
99,100,1,98,100,0.99,0.009949874371066205,0.016366086946959738,0.9736339130530403,1.0063660869469597
102-
100,100,0,100,100,1.0,0.0,0.0,1.0,1.0
100+
98,100,2,96,100,0.98,0.014000000000000005,0.02302795077732061,0.9569720492226794,1
101+
99,100,1,98,100,0.99,0.009949874371066205,0.016366086946959738,0.9736339130530403,1
102+
100,100,0,100,100,1.0,0.0,0.0,1.0,1
-213 Bytes
Loading

0 commit comments

Comments (0)