2 changes: 1 addition & 1 deletion CONTRIBUTORS.md
@@ -1,4 +1,4 @@
## Contributors
- Ian McFarland [@imf](https://github.com/imf)
- Dave Schinkel [@dschinkel](https://github.com/dschinkel)

- Carl Jackson [@carl](https://github.com/carl)
68 changes: 62 additions & 6 deletions examples/team_recommender/tests/test_helpers.py
@@ -50,7 +50,7 @@ def test_assert_success_rate_pass(row):
[
"New Success rate 0.900 with 90% confidence exceeds expected: 0.7",
"Broken Record:",
"Expecting: 0.744 <= 0.700 <= 1.056",
"Expecting: 0.744 <= 0.700 <= 1.000",
"Got: expected=0.7 <= analysis.lower_interval=0.74",
],
),
@@ -61,7 +61,7 @@ def test_assert_success_rate_pass(row):
[
"New Success rate 0.999 with 90% confidence exceeds expected: 0.98",
"Broken Record:",
"Expecting: 0.997 <= 0.980 <= 1.001",
"Expecting: 0.997 <= 0.980 <= 1.000",
"Got: expected=0.98 <= analysis.lower_interval=0.997",
],
),
@@ -92,8 +92,15 @@ def test_beyond_expected_success_rate(assert_success_rate, row):
(0.8, 14, 100, None),
(0.97, 1, 8, None),
(0.97, 0, 1, "after measuring 2x 100 runs and getting 3 failures"),
(0.975, 0, 100, "97.5% success rate is within 100% success rate"),
(0.9737, 0, 100, "97.37% success rate is within 100% success rate"),
(
0.97,
1,
133,
"At 133 we can say that with 90% confidence 1 failure is within 97% success rate",
),
(0.98, 0, 100, "98% success rate is within 100% success rate"),
(0.97999999999999999, 0, 100, "just under 98% success rate is within 100% success rate"),
(0.5, 1, 2, None),
],
)
def test_is_within_expected(success_rate, failure_count, sample_size, message):
@@ -107,9 +114,15 @@ def test_is_within_expected(success_rate, failure_count, sample_size, message):
"failure_count, sample_size, expected_rate, message",
[
(3, 5, 0.8, "40% success rate is below expected 80% success rate"),
(1, 2, 0.97, "50% success rate is below expected 97% success rate"),
(0, 100, 0.97, "100% success rate is not within 97% success rate"),
(1, 50000, 0.9997, "99.998% success rate is not within expected 99.97% success rate"),
(0, 100, 0.9736, "100% success rate is not within 97.36% success rate"),
(
1,
134,
0.97,
"At 134 we can say that with 90% confidence 1 failure is within 97% success rate",
),
],
)
def test_not_is_within_expected(failure_count, sample_size, expected_rate, message):
@@ -145,7 +158,13 @@ def test_next_success_rate():

@pytest.mark.parametrize(
"success_rate, largest_sample_size",
[(0.7, 12), (next_success_rate(12), 55), (next_success_rate(55), 248)],
[
(0.7, 10),
(next_success_rate(10), 44),
(next_success_rate(45), 184),
(next_success_rate(185), 744),
(next_success_rate(745), 2984),
],
)
def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample_size):
assert is_within_expected(success_rate, 1, largest_sample_size), "should be within expected"
@@ -154,6 +173,43 @@ def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample_size):
)


def test_next_sample_size():
## The next sample size should be at least 4x larger than the current one
assert next_sample_size(10) == 45, (
"passing 10 out of 10 should require 45 successful runs to be statistically significant"
)
assert next_sample_size(45) == 185, (
"passing 45 out of 45 should require 185 successful runs to be statistically significant"
)
assert next_sample_size(185) == 745
assert next_sample_size(745) == 2985
assert next_sample_size(29) == 121
assert next_sample_size(29) == next_sample_size_via_loop(29), "calculated via loop should match"

assert 28 / 29 == pytest.approx(0.96, rel=0.01)
before = analyse_measure_from_test_sample(28, 29)
assert before.proportion == pytest.approx(0.96, rel=0.01)
assert before.confidence_interval_prop == pytest.approx((0.91, 1.00), 0.01)

analysis = analyse_measure_from_test_sample(120, 121)
assert analysis.proportion == pytest.approx(0.99, rel=0.01)
assert analysis.confidence_interval_prop == pytest.approx((0.98, 1.00), 0.01)


def next_sample_size(current):
## How many successful runs are needed for a statistically significant
# improvement over the current sample size, assuming a 100% success rate
return 4 * current + 5


def next_sample_size_via_loop(sample_size: int) -> int:
goal_success_rate = next_success_rate(sample_size)
for i in range(sample_size, 5 * sample_size):
if not is_within_expected(goal_success_rate, 1, i):
return i
return 0
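
As a sanity check, the closed form and the brute-force search should agree everywhere, not just at the values the test above asserts; a minimal sketch, assuming the helpers above are in scope:

```python
# The closed form 4*n + 5 should match the brute-force search at every
# starting size covered by the tests above, not just n == 29.
for n in (10, 29, 45, 185, 745):
    assert next_sample_size(n) == next_sample_size_via_loop(n), n
```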


def test_success_rate():
tiny_set_analysis = analyse_measure_from_test_sample(1, 2)
assert tiny_set_analysis.proportion == 0.5
108 changes: 108 additions & 0 deletions examples/team_recommender/tests/test_proportions_ztest.py
@@ -0,0 +1,108 @@
import pytest
from helpers import is_within_expected
from statsmodels.stats.proportion import proportions_ztest
from test_helpers import next_success_rate


def test_proportions_ztest_improvement():
successes = [70, 90]
n_observations = [100, 100]

stat, p_value = proportions_ztest(successes, n_observations)
assert p_value == pytest.approx(0.00040695, rel=0.001)
assert p_value < 0.05, "statistically significant result"
assert stat == pytest.approx(-3.5355, rel=0.001)
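
For reference, the asserted numbers match the textbook pooled two-proportion z-test. A minimal sketch of that computation (an assumption about what proportions_ztest does internally, which the values above happen to confirm):

```python
import math

def pooled_two_proportion_ztest(counts: list[int], nobs: list[int]) -> tuple[float, float]:
    # Pool the successes, build the standard error from the pooled rate,
    # then compare the two sample proportions.
    p1, p2 = counts[0] / nobs[0], counts[1] / nobs[1]
    p_pool = sum(counts) / sum(nobs)
    se = math.sqrt(p_pool * (1 - p_pool) * (1 / nobs[0] + 1 / nobs[1]))
    z = (p1 - p2) / se
    # Two-sided p-value from the standard normal CDF.
    p_value = 2.0 * (1.0 - 0.5 * (1.0 + math.erf(abs(z) / math.sqrt(2.0))))
    return z, p_value

# pooled_two_proportion_ztest([70, 90], [100, 100]) ≈ (-3.5355, 0.00040695)
```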


def test_proportions_ztest_exact_match():
stat, p_value = proportions_ztest(7, 10, 0.7)
assert p_value == 1.0, "statistically insignificant result"
assert stat == 0
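
The exact-match case (z == 0, p == 1.0 when the sample proportion equals the hypothesized value) pins down that the standard error is built from the sample proportion, not the hypothesized one. A minimal sketch of the one-sample test under that assumption:

```python
import math

def one_sample_ztest(count: int, nobs: int, value: float) -> tuple[float, float]:
    # Normal-approximation z-test for a single proportion. The standard
    # error uses the sample proportion, so this sketch does not guard
    # against p_hat of exactly 0 or 1 (se == 0).
    p_hat = count / nobs
    se = math.sqrt(p_hat * (1 - p_hat) / nobs)
    z = (p_hat - value) / se
    p_two_sided = 2.0 * (1.0 - 0.5 * (1.0 + math.erf(abs(z) / math.sqrt(2.0))))
    return z, p_two_sided

# one_sample_ztest(9, 10, 0.7) ≈ (2.108, 0.035) — the significant
# improvement asserted in the next test.
```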


def test_proportions_ztest_significantly_better():
stat, p_value = proportions_ztest(9, 10, 0.7)
assert p_value < 0.05, "statistically significant improvement"
assert proportions_ztest(9, 10, 0.7, alternative="larger")[1] < 0.05, (
"statistically proportion is larger than expected value"
)
assert proportions_ztest(9, 10, 0.7, alternative="two-sided")[1] < 0.05, (
"statistically proportion is larger or smaller than expected value"
)


def test_proportions_ztest_not_statistically_significant():
for count in range(4, 8):
stat, p_value = proportions_ztest(count, 10, 0.7)
assert p_value > 0.05, "NO statistically significant deviation"


def test_proportions_ztest_significantly_worse():
stat, p_value = proportions_ztest(3, 10, 0.7)
assert p_value < 0.05, "statistically significant result"
assert proportions_ztest(3, 10, 0.7, alternative="smaller")[1] < 0.05, (
"statistically proportion is smaller than expected value"
)
assert proportions_ztest(3, 10, 0.7, alternative="two-sided")[1] < 0.05, (
"statistically proportion is smaller than expected value"
)


def calculate_p_value(success, failure, sample_size) -> float:
return calculate_ztest(success, failure, sample_size)[1]


def calculate_ztest(success, failure, sample_size) -> tuple[float, float]:
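# Two-sample z-test: the expected number of successes (target rate times
# sample_size; note int() truncates) is compared against the observed
# successes across two equal-sized samples.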
measurements = [int(success * sample_size), sample_size - failure]
samples = [sample_size, sample_size]
zstat, p_value = proportions_ztest(measurements, samples)
return zstat, p_value


def is_statistically_significant(success, failure, sample_size):
return calculate_p_value(success, failure, sample_size) < 0.05


def test_not_is_statistically_significant():
assert not is_statistically_significant(0.7, 3, 10), "same proportion"
assert not is_statistically_significant(0.9, 10, 100), "same proportion"
assert not is_statistically_significant(0.7, 30, 100), "same proportion"
assert not is_statistically_significant(0.7, 0, 10), "covers 100% success rate"


def test_is_statistically_significant():
assert is_statistically_significant(0.9, 0, 100), "0 out of 100 > 90% success rate"
assert is_statistically_significant(0.7, 0, 11), "0 out of 11 > 70% success rate"
assert is_statistically_significant(0.9, 0, 31), "0 out of 31 > 90% success rate"
assert is_statistically_significant(0.909090, 0, 33), "0 out of 33 > 90.9% success rate"


def test_is_statistically_significant_with_next_success_rate():
sample_size = 10
assert not is_statistically_significant(next_success_rate(sample_size), 0, sample_size)
assert is_statistically_significant(next_success_rate(sample_size), 0, 34)
assert is_statistically_significant(next_success_rate(35), 0, 109)


def test_compare_is_within_expected_and_is_statistically_significant():
assert is_within_expected(0.7, 3, 10), "not significant: 3 failures in 10 = 70% success"
assert not is_statistically_significant(0.7, 3, 10), "not significant: 3 failures in 10 = 70% success"

assert is_within_expected(0.7, 0, 3), "not significant result for 0 out of 3"
assert not is_statistically_significant(0.7, 0, 3), "not significant result for 0 out of 3"


def test_improvement_from_70_percent():
assert is_within_expected(0.7, 0, 3), "no improvement detected at 3"
assert not is_statistically_significant(0.7, 0, 10), "no improvement detected at 10"

assert not is_within_expected(0.7, 0, 4), "improvement detected at 4"
assert is_statistically_significant(0.7, 0, 11), "improvement detected at 11"


def test_improvement_from_97_percent():
assert is_within_expected(0.97, 0, 66), "no improvement detected at 66"
assert not is_statistically_significant(0.97, 0, 100), "no improvement detected at 100"

assert not is_within_expected(0.97, 0, 67), "significantly better at 67"
assert is_statistically_significant(0.97, 0, 101), "significantly better at 101"
1 change: 1 addition & 0 deletions pyproject.toml
@@ -25,6 +25,7 @@ test = [
"pytest-asyncio>=0.21.0,<0.22",
"mypy>=1.8.0,<2",
"pytest-snapshot>=0.9.0",
"statsmodels>=0.14.4",
]
examples = ["openai>=1.63.2,<2", "python-dotenv>=1.0.1,<2"]
dev = [
4 changes: 2 additions & 2 deletions src/cat_ai/statistical_analysis.py
@@ -90,8 +90,8 @@ def analyse_measure_from_test_sample(measure: int, sample_size: int) -> Statisti
me = z * se

# Calculate confidence interval bounds as proportions
lower_bound_prop = p_hat - me
upper_bound_prop = p_hat + me
lower_bound_prop = max(0, p_hat - me)
upper_bound_prop = min(1, p_hat + me)

# Convert proportion bounds to integer counts
lower_bound_count: int = math.ceil(lower_bound_prop * sample_size)
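
The clamp only matters near the boundaries. A worked sketch for 99 successes out of 100 (assuming the 90% interval, z ≈ 1.645, that these tests use), which reproduces the 99-failure-free row of the regenerated CSV fixture below:

```python
import math

p_hat = 99 / 100
se = math.sqrt(p_hat * (1 - p_hat) / 100)  # ≈ 0.0099499
me = 1.6449 * se                           # ≈ 0.0163661
lower = max(0, p_hat - me)                 # ≈ 0.9736339
upper = min(1, p_hat + me)                 # 1.0063661, clamped to 1.0
```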
@@ -1,7 +1,7 @@
failure_count,sample_size,margin_of_error_count,confidence_lower,confidence_upper,proportion,standard_error,margin_of_error,confidence_proportion_lower,confidence_proportion_upper
0,100,0,0,0,0.0,0.0,0.0,0.0,0.0
1,100,1,0,2,0.01,0.0099498743710662,0.01636608694695973,-0.006366086946959731,0.02636608694695973
2,100,2,0,4,0.02,0.014,0.023027950777320602,-0.0030279507773206017,0.043027950777320606
0,100,0,0,0,0.0,0.0,0.0,0,0.0
1,100,1,0,2,0.01,0.0099498743710662,0.01636608694695973,0,0.02636608694695973
2,100,2,0,4,0.02,0.014,0.023027950777320602,0,0.043027950777320606
3,100,2,1,5,0.03,0.01705872210923198,0.02805910093252748,0.00194089906747252,0.058059100932527474
4,100,3,1,7,0.04,0.019595917942265423,0.0322324167007787,0.007767583299221302,0.0722324167007787
5,100,3,2,8,0.05,0.021794494717703367,0.03584875368398907,0.014151246316010932,0.08584875368398907
@@ -97,6 +97,6 @@ failure_count,sample_size,margin_of_error_count,confidence_lower,confidence_upper
95,100,3,92,98,0.95,0.021794494717703377,0.035848753683989085,0.9141512463160109,0.985848753683989
96,100,3,93,99,0.96,0.019595917942265433,0.03223241670077871,0.9277675832992213,0.9922324167007787
97,100,2,95,99,0.97,0.017058722109231986,0.02805910093252749,0.9419408990674725,0.9980591009325275
98,100,2,96,100,0.98,0.014000000000000005,0.02302795077732061,0.9569720492226794,1.0030279507773205
99,100,1,98,100,0.99,0.009949874371066205,0.016366086946959738,0.9736339130530403,1.0063660869469597
100,100,0,100,100,1.0,0.0,0.0,1.0,1.0
98,100,2,96,100,0.98,0.014000000000000005,0.02302795077732061,0.9569720492226794,1
99,100,1,98,100,0.99,0.009949874371066205,0.016366086946959738,0.9736339130530403,1
100,100,0,100,100,1.0,0.0,0.0,1.0,1