64 changes: 53 additions & 11 deletions examples/team_recommender/tests/test_helpers.py
@@ -142,12 +142,12 @@ def test_seventy_percent_confidence_ranges_from_fifty_to_ninety():


def next_success_rate(sample_size) -> float:
return 1 - 1 / (sample_size + 1)
return sample_size / (sample_size + 1)


def test_next_success_rate():
assert next_success_rate(1) == 0.5
assert next_success_rate(2) == 0.6666666666666667
assert next_success_rate(2) == pytest.approx(0.6667, rel=0.01)
assert next_success_rate(3) == 0.75
assert next_success_rate(4) == 0.8
assert next_success_rate(10) == 0.9090909090909091
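
Reviewer note: sample_size / (sample_size + 1) and 1 - 1 / (sample_size + 1) are algebraically identical, but as IEEE 754 doubles they can differ by one ULP (2 / 3 evaluates to 0.6666666666666666 while 1 - 1 / 3 evaluates to 0.6666666666666667), which is presumably why the sample_size=2 assertion was relaxed to pytest.approx. A minimal sanity check:

# The two formulations agree to within one ULP for small sample sizes;
# only the last bit of the sample_size=2 case differs between them.
for n in (1, 2, 3, 4, 10):
    assert abs(n / (n + 1) - (1 - 1 / (n + 1))) < 1e-15
print(2 / 3, 1 - 1 / 3)  # 0.6666666666666666 0.6666666666666667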
@@ -175,16 +175,18 @@ def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample

def test_next_sample_size():
## Next sample size should be at least 4 times the current one
assert next_sample_size(10) == 45, (
assert next_sample_size_with_1_failure(10) == 45, (
"passing 10 out of 10 should require 45 successful runs to be statistically significant"
)
assert next_sample_size(45) == 185, (
assert next_sample_size_with_1_failure(45) == 185, (
"passing 45 out of 45 should require 185 successful runs to be statistically significant"
)
assert next_sample_size(185) == 745
assert next_sample_size(745) == 2985
assert next_sample_size(29) == 121
assert next_sample_size(29) == next_sample_size_via_loop(29), "calculated via loop should match"
assert next_sample_size_with_1_failure(185) == 745
assert next_sample_size_with_1_failure(745) == 2985
assert next_sample_size_with_1_failure(29) == 121
assert next_sample_size_with_1_failure(29) == next_sample_size_via_loop_with_1_failure(29), (
"calculated via loop should match"
)

assert 28 / 29 == pytest.approx(0.96, rel=0.01)
before = analyse_measure_from_test_sample(28, 29)
@@ -196,20 +198,60 @@ def test_next_sample_size():
assert analysis.confidence_interval_prop == pytest.approx((0.98, 1.00), 0.01)


def next_sample_size(current):
def next_sample_size_with_1_failure(current):
## How many successful runs are needed for a statistically significant improvement
# compared to the current sample size with 100% success rate
# compared to the current sample size with 100% success rate at 90% confidence
return 4 * current + 5
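
For reference, this recurrence reproduces the values asserted in test_next_sample_size above: 4 * 10 + 5 = 45, 4 * 45 + 5 = 185, 4 * 185 + 5 = 745, 4 * 745 + 5 = 2985, and 4 * 29 + 5 = 121.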


def next_sample_size_via_loop(sample_size: int) -> int:
def next_sample_size_via_loop_with_1_failure(sample_size: int) -> int:
goal_success_rate = next_success_rate(sample_size)
for i in range(sample_size, 5 * sample_size):
if not is_within_expected(goal_success_rate, 1, i):
return i
return 0


def next_sample_size_via_loop_no_failure(sample_size: int) -> int:
goal_success_rate = next_success_rate(sample_size)
for i in range(sample_size, 5 * sample_size):
if not is_within_expected(goal_success_rate, 0, i):
return i
return 0


def next_sample_size_no_failure(sample_size: int) -> int:
return 2 * sample_size + 3
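
Compared with the 1-failure helper above, tolerating one failure in the new run roughly doubles the required sample: 4 * n + 5 versus 2 * n + 3 (45 versus 23 when the current sample is 10).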


@pytest.mark.parametrize(
"sample_size, expected",
[
(10, 45),
(45, 185),
(185, 745),
(745, 2985),
(29, 121),
],
)
def test_next_sample_size_via_loop(sample_size, expected):
assert next_sample_size_via_loop_with_1_failure(sample_size) == expected


@pytest.mark.parametrize(
"sample_size, expected",
[
(10, 23),
(23, 49),
(49, 101),
(101, 205),
(205, 413),
],
)
def test_next_no_failure_sample_size_via_loop(sample_size, expected):
assert next_sample_size_via_loop_no_failure(sample_size) == expected
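
The same table also pins down the closed form: each expected value equals 2 * sample_size + 3, so the no-failure pair could be cross-checked against each other the way the 1-failure pair is above. A sketch, assuming the module context of this file (the test name below is hypothetical):

@pytest.mark.parametrize("sample_size", [10, 23, 49, 101, 205])
def test_no_failure_closed_form_matches_loop(sample_size):
    # Hypothetical cross-check mirroring the existing 1-failure comparison:
    # the 2 * n + 3 closed form should agree with the brute-force search.
    assert next_sample_size_no_failure(sample_size) == (
        next_sample_size_via_loop_no_failure(sample_size)
    )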


def test_success_rate():
tiny_set_analysis = analyse_measure_from_test_sample(1, 2)
assert tiny_set_analysis.proportion == 0.5
56 changes: 45 additions & 11 deletions examples/team_recommender/tests/test_proportions_ztest.py
@@ -1,7 +1,14 @@
from math import isnan

import pytest
from helpers import is_within_expected
from statsmodels.stats.proportion import proportions_ztest
from test_helpers import next_success_rate
from test_helpers import (
next_sample_size_no_failure,
next_sample_size_via_loop_with_1_failure,
next_sample_size_with_1_failure,
next_success_rate,
)


def test_proportions_ztest_improvement():
@@ -19,6 +26,14 @@ def test_proportions_ztest_exact_match():
assert p_value == 1.0, "statistically insignificant result"
assert stat == 0

stat, p_value = proportions_ztest(7, 10, 0.7, prop_var=1)
assert isnan(p_value)
assert isnan(stat)

stat, p_value = proportions_ztest(1, 10, 0.7, prop_var=0.5)
assert p_value == pytest.approx(0.00014, rel=0.1)
assert stat == pytest.approx(-3.79, rel=0.01)
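
Reviewer note: my reading of the statsmodels one-sample z-test is z = (count / nobs - value) / sqrt(p_var * (1 - p_var) / nobs), where p_var is the sample proportion unless prop_var overrides it. That would account for both new cases: prop_var=1 zeroes the variance term and the statistic becomes 0 / 0, hence NaN, while prop_var=0.5 gives roughly -3.79. A hand check under that assumption:

from math import sqrt
from statistics import NormalDist

# prop_var=0.5 case: 1 success out of 10 tested against a 0.7 baseline.
z = (1 / 10 - 0.7) / sqrt(0.5 * (1 - 0.5) / 10)  # ~ -3.79
p_value = 2 * NormalDist().cdf(-abs(z))          # two-sided, ~ 0.00015
# prop_var=1 case: the variance term is 1 * (1 - 1) / 10 == 0 and the observed
# proportion equals the baseline, so the statistic is 0 / 0 and comes back NaN.
print(z, p_value)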


def test_proportions_ztest_significantly_better():
stat, p_value = proportions_ztest(9, 10, 0.7)
@@ -53,24 +68,22 @@ def calculate_p_value(success, failure, sample_size) -> float:


def calculate_ztest(success, failure, sample_size) -> tuple[float, float]:
measurements = [int(success * sample_size), sample_size - failure]
samples = [sample_size, sample_size]
zstat, p_value = proportions_ztest(measurements, samples)
zstat, p_value = proportions_ztest(sample_size - failure, sample_size, value=success)
return zstat, p_value
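
The rewrite turns this into a one-sample test: the observed success count (sample_size - failure) is tested directly against the baseline proportion passed as success, rather than against a second synthetic sample. A usage sketch:

# 0 failures out of 10 runs, tested against a 70% baseline; equivalent to
# proportions_ztest(10, 10, value=0.7).
stat, p_value = calculate_ztest(0.7, 0, 10)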


def is_statistically_significant(success, failure, sample_size):
return calculate_p_value(success, failure, sample_size) < 0.05
return calculate_p_value(success, failure, sample_size) <= 0.05


def test_not_is_statistically_significant():
assert not is_statistically_significant(0.7, 3, 10), "same proportion"
assert not is_statistically_significant(0.9, 10, 100), "same proportion"
assert not is_statistically_significant(0.7, 30, 100), "same proportion"
assert not is_statistically_significant(0.7, 0, 10), "covers 100% success rate"


def test_is_statistically_significant():
assert is_statistically_significant(0.7, 0, 10), "70% does not cover 100% success rate"
assert is_statistically_significant(0.9, 0, 100), "0 out of 100 > 90% success rate"
assert is_statistically_significant(0.7, 0, 11), "0 out of 11 > 70% success rate"
assert is_statistically_significant(0.9, 0, 31), "0 out of 31 > 90% success rate"
@@ -79,30 +92,51 @@ def test_is_statistically_significant():

def test_is_statistically_significant_with_next_success_rate():
sample_size = 10
assert not is_statistically_significant(next_success_rate(sample_size), 0, sample_size)
assert is_statistically_significant(next_success_rate(sample_size), 0, 34)
assert is_statistically_significant(next_success_rate(sample_size), 0, sample_size)
assert is_statistically_significant(
next_success_rate(sample_size), 0, next_sample_size_with_1_failure(sample_size)
)
assert is_statistically_significant(next_success_rate(35), 0, 109)


def test_example_on_wiki():
sample_size = 47
success_rate = 0.950
assert is_within_expected(success_rate, 1, sample_size)
assert not is_statistically_significant(success_rate, 1, sample_size)
next_rate = next_success_rate(sample_size)
next_size = next_sample_size_no_failure(sample_size)
assert next_sample_size_via_loop_with_1_failure(sample_size) == 193
assert next_size == 97
assert next_rate == pytest.approx(0.98, rel=0.01)

assert not is_within_expected(0.95, 1, next_size)
assert not is_within_expected(next_rate, 0, next_size)
assert is_within_expected(next_rate, 1, next_size)

assert is_statistically_significant(next_rate, 0, next_size)
assert not is_statistically_significant(next_rate, 1, next_size)


def test_compare_is_within_expected_and_is_statistically_significant():
assert is_within_expected(0.7, 3, 10), "not significant result for 3/10=70%"
assert not is_statistically_significant(0.7, 3, 10), "not significant for 3/10=70%"

assert is_within_expected(0.7, 0, 3), "not significant result for 0 out of 3"
assert not is_statistically_significant(0.7, 0, 3), "not significant result for 0 out of 3"
assert is_statistically_significant(0.7, 0, 1000), "significant result for 0 out of 1000"


def test_improvement_from_70_percent():
assert is_within_expected(0.7, 0, 3), "no improvement detected at 3"
assert not is_statistically_significant(0.7, 0, 10), "no improvement detected at 10"
assert is_statistically_significant(0.7, 0, 10), "improvement detected at 10"

assert not is_within_expected(0.7, 0, 4), "improvement detected at 4"
assert is_statistically_significant(0.7, 0, 11), "improvement detected at 11"


def test_improvement_from_97_percent():
assert is_within_expected(0.97, 0, 66), "no improvement detected at 66"
assert not is_statistically_significant(0.97, 0, 100), "no improvement detected at 100"
assert is_statistically_significant(0.97, 0, 100), "improvement detected at 100"

assert not is_within_expected(0.97, 0, 67), "significantly better at 67"
assert is_statistically_significant(0.97, 0, 101), "significantly better at 101"