Skip to content

Commit a56e21a

Browse files
Authored by: paulz, carl, and Copilot
add proportions_ztest to test examples (#62)
Add [proportions_ztest](https://www.statsmodels.org/dev/generated/statsmodels.stats.proportion.proportions_ztest.html) and compare it with current StatisticalAnalysis ### Testing framework updates: * [`examples/team_recommender/tests/test_helpers.py`](diffhunk://#diff-976a531e145fe670a98681dd9a62a49f27706f59e8287f4fb60e6d6898ed2b3eL95-R103): Added new test cases and modified existing ones to improve the accuracy of success rate measurements and sample size calculations. [[1]](diffhunk://#diff-976a531e145fe670a98681dd9a62a49f27706f59e8287f4fb60e6d6898ed2b3eL95-R103) [[2]](diffhunk://#diff-976a531e145fe670a98681dd9a62a49f27706f59e8287f4fb60e6d6898ed2b3eL110-R125) [[3]](diffhunk://#diff-976a531e145fe670a98681dd9a62a49f27706f59e8287f4fb60e6d6898ed2b3eL148-R167) [[4]](diffhunk://#diff-976a531e145fe670a98681dd9a62a49f27706f59e8287f4fb60e6d6898ed2b3eR176-R195) * [`examples/team_recommender/tests/test_proportions_ztest.py`](diffhunk://#diff-ef60d571344b1948a6f2edcc6cb72c9875987bfe7d2d568e74e118cf37694e9cR1-R108): Added new tests for the `proportions_ztest` function to check for statistical significance and improvements in success rates. ### Contributors update: * [`CONTRIBUTORS.md`](diffhunk://#diff-c0f86987c556ec52d97b9acf0f35bb2ad0521f65c3113e1b15362ca76502eed2L4-R4): Added Carl Jackson to the list of contributors. ### Dependency update: * [`pyproject.toml`](diffhunk://#diff-50c86b7ed8ac2cf95bd48334961bf0530cdc77b5a56f852c5c61b89d735fd711R28): Added `statsmodels` to the list of test dependencies. --------- Signed-off-by: Paul Zabelin <paulzabelin@artium.ai> Co-authored-by: Carl Jackson <carl@realvr.ai> Co-authored-by: Paul Zabelin <paulz@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent e4ca299 commit a56e21a

File tree

8 files changed

+289
-15
lines changed

8 files changed

+289
-15
lines changed

CONTRIBUTORS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
## Contributors
22
- Ian McFarland [@imf](https://github.com/imf)
33
- Dave Schinkel [@dschinkel](https://github.com/dschinkel)
4-
4+
- Carl Jackson [@carl](https://github.com/carl)

examples/team_recommender/tests/test_helpers.py

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def test_assert_success_rate_pass(row):
5050
[
5151
"New Success rate 0.900 with 90% confidence exceeds expected: 0.7",
5252
"Broken Record:",
53-
"Expecting: 0.744 <= 0.700 <= 1.056",
53+
"Expecting: 0.744 <= 0.700 <= 1.000",
5454
"Got: expected=0.7 <= analysis.lower_interval=0.74",
5555
],
5656
),
@@ -61,7 +61,7 @@ def test_assert_success_rate_pass(row):
6161
[
6262
"New Success rate 0.999 with 90% confidence exceeds expected: 0.98",
6363
"Broken Record:",
64-
"Expecting: 0.997 <= 0.980 <= 1.001",
64+
"Expecting: 0.997 <= 0.980 <= 1.000",
6565
"Got: expected=0.98 <= analysis.lower_interval=0.997",
6666
],
6767
),
@@ -92,8 +92,15 @@ def test_beyond_expected_success_rate(assert_success_rate, row):
9292
(0.8, 14, 100, None),
9393
(0.97, 1, 8, None),
9494
(0.97, 0, 1, "after measuring 2x 100 runs and getting 3 failures"),
95-
(0.975, 0, 100, "97.5% success rate is within 100% success rate"),
96-
(0.9737, 0, 100, "97.37% success rate is within 100% success rate"),
95+
(
96+
0.97,
97+
1,
98+
133,
99+
"At 133 we can say that with 90% confidence 1 failure is within 97% success rate",
100+
),
101+
(0.98, 0, 100, "97.5% success rate is within 100% success rate"),
102+
(0.97999999999999999, 0, 100, "97.37% success rate is within 100% success rate"),
103+
(0.5, 1, 2, None),
97104
],
98105
)
99106
def test_is_within_expected(success_rate, failure_count, sample_size, message):
@@ -107,9 +114,15 @@ def test_is_within_expected(success_rate, failure_count, sample_size, message):
107114
"failure_count, sample_size, expected_rate, message",
108115
[
109116
(3, 5, 0.8, "40% success rate is below expected 80% success rate"),
110-
(1, 2, 0.97, "50% success rate is below expected 97% success rate"),
111117
(0, 100, 0.97, "100% success rate is not within 97% success rate"),
118+
(1, 50000, 0.9997, "99.99% success rate is below expected 97% success rate"),
112119
(0, 100, 0.9736, "97.36% success rate is not within 100% success rate"),
120+
(
121+
1,
122+
134,
123+
0.97,
124+
"At 134 we can say that with 90% confidence 1 failure is within 97% success rate",
125+
),
113126
],
114127
)
115128
def test_not_is_within_expected(failure_count, sample_size, expected_rate, message):
@@ -145,7 +158,13 @@ def test_next_success_rate():
145158

146159
@pytest.mark.parametrize(
147160
"success_rate, largest_sample_size",
148-
[(0.7, 12), (next_success_rate(12), 55), (next_success_rate(55), 248)],
161+
[
162+
(0.7, 10),
163+
(next_success_rate(10), 44),
164+
(next_success_rate(45), 184),
165+
(next_success_rate(185), 744),
166+
(next_success_rate(745), 2984),
167+
],
149168
)
150169
def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample_size):
151170
assert is_within_expected(success_rate, 1, largest_sample_size), "should be within expected"
@@ -154,6 +173,43 @@ def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample
154173
)
155174

156175

176+
def test_next_sample_size():
177+
## Next sample size should be larger than the current one by at least 4 times
178+
assert next_sample_size(10) == 45, (
179+
"passing 10 out of 10 should require 45 successful runs to be statistically significant"
180+
)
181+
assert next_sample_size(45) == 185, (
182+
"passing 45 out of 45 should require 185 successful runs to be statistically significant"
183+
)
184+
assert next_sample_size(185) == 745
185+
assert next_sample_size(745) == 2985
186+
assert next_sample_size(29) == 121
187+
assert next_sample_size(29) == next_sample_size_via_loop(29), "calculated via loop should match"
188+
189+
assert 28 / 29 == pytest.approx(0.96, rel=0.01)
190+
before = analyse_measure_from_test_sample(28, 29)
191+
assert before.proportion == pytest.approx(0.96, rel=0.01)
192+
assert before.confidence_interval_prop == pytest.approx((0.91, 1.00), 0.01)
193+
194+
analysis = analyse_measure_from_test_sample(120, 121)
195+
assert analysis.proportion == pytest.approx(0.99, rel=0.01)
196+
assert analysis.confidence_interval_prop == pytest.approx((0.98, 1.00), 0.01)
197+
198+
199+
def next_sample_size(current):
200+
## How many successful runs are needed to be statistically significant improvement
201+
# compared to the current sample size with 100% success rate
202+
return 4 * current + 5
203+
204+
205+
def next_sample_size_via_loop(sample_size: int) -> int:
206+
goal_success_rate = next_success_rate(sample_size)
207+
for i in range(sample_size, 5 * sample_size):
208+
if not is_within_expected(goal_success_rate, 1, i):
209+
return i
210+
return 0
211+
212+
157213
def test_success_rate():
158214
tiny_set_analysis = analyse_measure_from_test_sample(1, 2)
159215
assert tiny_set_analysis.proportion == 0.5
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import pytest
2+
from helpers import is_within_expected
3+
from statsmodels.stats.proportion import proportions_ztest
4+
from test_helpers import next_success_rate
5+
6+
7+
def test_proportions_ztest_improvement():
8+
successes = [70, 90]
9+
n_observations = [100, 100]
10+
11+
stat, p_value = proportions_ztest(successes, n_observations)
12+
assert p_value == pytest.approx(0.00040695, rel=0.001)
13+
assert p_value < 0.05, "statistically significant result"
14+
assert stat == pytest.approx(-3.5355, rel=0.001)
15+
16+
17+
def test_proportions_ztest_exact_match():
18+
stat, p_value = proportions_ztest(7, 10, 0.7)
19+
assert p_value == 1.0, "statistically insignificant result"
20+
assert stat == 0
21+
22+
23+
def test_proportions_ztest_significantly_better():
24+
stat, p_value = proportions_ztest(9, 10, 0.7)
25+
assert p_value < 0.05, "statistically significant improvement"
26+
assert proportions_ztest(9, 10, 0.7, alternative="larger")[1] < 0.05, (
27+
"statistically proportion is larger than expected value"
28+
)
29+
assert proportions_ztest(9, 10, 0.7, alternative="two-sided")[1] < 0.05, (
30+
"statistically proportion is larger or smaller than expected value"
31+
)
32+
33+
34+
def test_proportions_ztest_not_statistically_significantly():
35+
for count in range(4, 8):
36+
stat, p_value = proportions_ztest(count, 10, 0.7)
37+
assert p_value > 0.05, "NO statistically significant deviation"
38+
39+
40+
def test_proportions_ztest_significantly_worse():
41+
stat, p_value = proportions_ztest(3, 10, 0.7)
42+
assert p_value < 0.05, "statistically significant result"
43+
assert proportions_ztest(3, 10, 0.7, alternative="smaller")[1] < 0.05, (
44+
"statistically proportion is smaller than expected value"
45+
)
46+
assert proportions_ztest(3, 10, 0.7, alternative="two-sided")[1] < 0.05, (
47+
"statistically proportion is smaller than expected value"
48+
)
49+
50+
51+
def calculate_p_value(success, failure, sample_size) -> float:
52+
return calculate_ztest(success, failure, sample_size)[1]
53+
54+
55+
def calculate_ztest(success, failure, sample_size) -> tuple[float, float]:
56+
measurements = [int(success * sample_size), sample_size - failure]
57+
samples = [sample_size, sample_size]
58+
zstat, p_value = proportions_ztest(measurements, samples)
59+
return zstat, p_value
60+
61+
62+
def is_statistically_significant(success, failure, sample_size):
63+
return calculate_p_value(success, failure, sample_size) < 0.05
64+
65+
66+
def test_not_is_statistically_significant():
67+
assert not is_statistically_significant(0.7, 3, 10), "same proportion"
68+
assert not is_statistically_significant(0.9, 10, 100), "same proportion"
69+
assert not is_statistically_significant(0.7, 30, 100), "same proportion"
70+
assert not is_statistically_significant(0.7, 0, 10), "covers 100% success rate"
71+
72+
73+
def test_is_statistically_significant():
74+
assert is_statistically_significant(0.9, 0, 100), "0 out of 100 > 90% success rate"
75+
assert is_statistically_significant(0.7, 0, 11), "0 out of 11 > 70% success rate"
76+
assert is_statistically_significant(0.9, 0, 31), "0 out of 31 > 90% success rate"
77+
assert is_statistically_significant(0.909090, 0, 33), "0 out of 33 > 90.9% success rate"
78+
79+
80+
def test_is_statistically_significant_with_next_success_rate():
81+
sample_size = 10
82+
assert not is_statistically_significant(next_success_rate(sample_size), 0, sample_size)
83+
assert is_statistically_significant(next_success_rate(sample_size), 0, 34)
84+
assert is_statistically_significant(next_success_rate(35), 0, 109)
85+
86+
87+
def test_compare_is_within_expected_and_is_statistically_significant():
88+
assert is_within_expected(0.7, 3, 10), "not significant result for 3/10=70%"
89+
assert not is_statistically_significant(0.7, 3, 10), "not significant for 3/10=70%"
90+
91+
assert is_within_expected(0.7, 0, 3), "not significant result for 0 out of 3"
92+
assert not is_statistically_significant(0.7, 0, 3), "not significant result for 0 out of 3"
93+
94+
95+
def test_improvement_from_70_percent():
96+
assert is_within_expected(0.7, 0, 3), "no improvement detected at 3"
97+
assert not is_statistically_significant(0.7, 0, 10), "no improvement detected at 10"
98+
99+
assert not is_within_expected(0.7, 0, 4), "improvement detected at 4"
100+
assert is_statistically_significant(0.7, 0, 11), "improvement detected at 11"
101+
102+
103+
def test_improvement_from_97_percent():
104+
assert is_within_expected(0.97, 0, 66), "no improvement detected at 66"
105+
assert not is_statistically_significant(0.97, 0, 100), "no improvement detected at 100"
106+
107+
assert not is_within_expected(0.97, 0, 67), "significantly better at 67"
108+
assert is_statistically_significant(0.97, 0, 101), "significantly better at 101"

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ test = [
2525
"pytest-asyncio>=0.21.0,<0.22",
2626
"mypy>=1.8.0,<2",
2727
"pytest-snapshot>=0.9.0",
28+
"statsmodels>=0.14.4",
2829
]
2930
examples = ["openai>=1.63.2,<2", "python-dotenv>=1.0.1,<2"]
3031
dev = [

src/cat_ai/statistical_analysis.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ def analyse_measure_from_test_sample(measure: int, sample_size: int) -> Statisti
9090
me = z * se
9191

9292
# Calculate confidence interval bounds as proportions
93-
lower_bound_prop = p_hat - me
94-
upper_bound_prop = p_hat + me
93+
lower_bound_prop = max(0, p_hat - me)
94+
upper_bound_prop = min(1, p_hat + me)
9595

9696
# Convert proportion bounds to integer counts
9797
lower_bound_count: int = math.ceil(lower_bound_prop * sample_size)

tests/snapshots/test_statistical_analysis/test_failure_rate_bar_graph/failure_rate_results.csv

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
failure_count,sample_size,margin_of_error_count,confidence_lower,confidence_upper,proportion,standard_error,margin_of_error,confidence_proportion_lower,confidence_proportion_upper
2-
0,100,0,0,0,0.0,0.0,0.0,0.0,0.0
3-
1,100,1,0,2,0.01,0.0099498743710662,0.01636608694695973,-0.006366086946959731,0.02636608694695973
4-
2,100,2,0,4,0.02,0.014,0.023027950777320602,-0.0030279507773206017,0.043027950777320606
2+
0,100,0,0,0,0.0,0.0,0.0,0,0.0
3+
1,100,1,0,2,0.01,0.0099498743710662,0.01636608694695973,0,0.02636608694695973
4+
2,100,2,0,4,0.02,0.014,0.023027950777320602,0,0.043027950777320606
55
3,100,2,1,5,0.03,0.01705872210923198,0.02805910093252748,0.00194089906747252,0.058059100932527474
66
4,100,3,1,7,0.04,0.019595917942265423,0.0322324167007787,0.007767583299221302,0.0722324167007787
77
5,100,3,2,8,0.05,0.021794494717703367,0.03584875368398907,0.014151246316010932,0.08584875368398907
@@ -97,6 +97,6 @@ failure_count,sample_size,margin_of_error_count,confidence_lower,confidence_uppe
9797
95,100,3,92,98,0.95,0.021794494717703377,0.035848753683989085,0.9141512463160109,0.985848753683989
9898
96,100,3,93,99,0.96,0.019595917942265433,0.03223241670077871,0.9277675832992213,0.9922324167007787
9999
97,100,2,95,99,0.97,0.017058722109231986,0.02805910093252749,0.9419408990674725,0.9980591009325275
100-
98,100,2,96,100,0.98,0.014000000000000005,0.02302795077732061,0.9569720492226794,1.0030279507773205
101-
99,100,1,98,100,0.99,0.009949874371066205,0.016366086946959738,0.9736339130530403,1.0063660869469597
102-
100,100,0,100,100,1.0,0.0,0.0,1.0,1.0
100+
98,100,2,96,100,0.98,0.014000000000000005,0.02302795077732061,0.9569720492226794,1
101+
99,100,1,98,100,0.99,0.009949874371066205,0.016366086946959738,0.9736339130530403,1
102+
100,100,0,100,100,1.0,0.0,0.0,1.0,1
-213 Bytes
Loading

0 commit comments

Comments (0)