64 changes: 53 additions & 11 deletions examples/team_recommender/tests/test_helpers.py
@@ -142,12 +142,12 @@ def test_seventy_percent_confidence_ranges_from_fifty_to_ninety():


def next_success_rate(sample_size) -> float:
return 1 - 1 / (sample_size + 1)
return sample_size / (sample_size + 1)


def test_next_success_rate():
assert next_success_rate(1) == 0.5
assert next_success_rate(2) == 0.6666666666666667
assert next_success_rate(2) == pytest.approx(0.6667, rel=0.01)
assert next_success_rate(3) == 0.75
assert next_success_rate(4) == 0.8
assert next_success_rate(10) == 0.9090909090909091
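
Reviewer note: sample_size / (sample_size + 1) and 1 - 1 / (sample_size + 1) are algebraically identical, but as IEEE 754 doubles they can differ by one ULP (2 / 3 evaluates to 0.6666666666666666 while 1 - 1 / 3 evaluates to 0.6666666666666667), which is presumably why the sample_size=2 assertion was relaxed to pytest.approx. A minimal sanity check:

# The two formulations agree to within one ULP for small sample sizes;
# only the last bit of the sample_size=2 case differs between them.
for n in (1, 2, 3, 4, 10):
    assert abs(n / (n + 1) - (1 - 1 / (n + 1))) < 1e-15
print(2 / 3, 1 - 1 / 3)  # 0.6666666666666666 0.6666666666666667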
@@ -175,16 +175,18 @@ def test_largest_sample_size_for_given_success_rate(success_rate, largest_sample

def test_next_sample_size():
## Next sample size should be at least 4 times the current one
assert next_sample_size(10) == 45, (
assert next_sample_size_with_1_failure(10) == 45, (
"passing 10 out of 10 should require 45 successful runs to be statistically significant"
)
assert next_sample_size(45) == 185, (
assert next_sample_size_with_1_failure(45) == 185, (
"passing 45 out of 45 should require 185 successful runs to be statistically significant"
)
assert next_sample_size(185) == 745
assert next_sample_size(745) == 2985
assert next_sample_size(29) == 121
assert next_sample_size(29) == next_sample_size_via_loop(29), "calculated via loop should match"
assert next_sample_size_with_1_failure(185) == 745
assert next_sample_size_with_1_failure(745) == 2985
assert next_sample_size_with_1_failure(29) == 121
assert next_sample_size_with_1_failure(29) == next_sample_size_via_loop_with_1_failure(29), (
"calculated via loop should match"
)

assert 28 / 29 == pytest.approx(0.96, rel=0.01)
before = analyse_measure_from_test_sample(28, 29)
@@ -196,20 +198,60 @@ def test_next_sample_size():
assert analysis.confidence_interval_prop == pytest.approx((0.98, 1.00), 0.01)


def next_sample_size(current):
def next_sample_size_with_1_failure(current):
## How many successful runs are needed for a statistically significant improvement
# compared to the current sample size with 100% success rate
# compared to the current sample size with 100% success rate at 90% confidence
return 4 * current + 5
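
For reference, this recurrence reproduces the values asserted in test_next_sample_size above: 4 * 10 + 5 = 45, 4 * 45 + 5 = 185, 4 * 185 + 5 = 745, 4 * 745 + 5 = 2985, and 4 * 29 + 5 = 121.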


def next_sample_size_via_loop(sample_size: int) -> int:
def next_sample_size_via_loop_with_1_failure(sample_size: int) -> int:
goal_success_rate = next_success_rate(sample_size)
for i in range(sample_size, 5 * sample_size):
if not is_within_expected(goal_success_rate, 1, i):
return i
return 0


def next_sample_size_via_loop_no_failure(sample_size: int) -> int:
goal_success_rate = next_success_rate(sample_size)
for i in range(sample_size, 5 * sample_size):
if not is_within_expected(goal_success_rate, 0, i):
return i
return 0


def next_sample_size_no_failure(sample_size: int) -> int:
return 2 * sample_size + 3
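
Compared with the 1-failure helper above, tolerating one failure in the new run roughly doubles the required sample: 4 * n + 5 versus 2 * n + 3 (45 versus 23 when the current sample is 10).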


@pytest.mark.parametrize(
"sample_size, expected",
[
(10, 45),
(45, 185),
(185, 745),
(745, 2985),
(29, 121),
],
)
def test_next_sample_size_via_loop(sample_size, expected):
assert next_sample_size_via_loop_with_1_failure(sample_size) == expected


@pytest.mark.parametrize(
"sample_size, expected",
[
(10, 23),
(23, 49),
(49, 101),
(101, 205),
(205, 413),
],
)
def test_next_no_failure_sample_size_via_loop(sample_size, expected):
assert next_sample_size_via_loop_no_failure(sample_size) == expected
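
The same table also pins down the closed form: each expected value equals 2 * sample_size + 3, so the no-failure pair could be cross-checked against each other the way the 1-failure pair is above. A sketch, assuming the module context of this file (the test name below is hypothetical):

@pytest.mark.parametrize("sample_size", [10, 23, 49, 101, 205])
def test_no_failure_closed_form_matches_loop(sample_size):
    # Hypothetical cross-check mirroring the existing 1-failure comparison:
    # the 2 * n + 3 closed form should agree with the brute-force search.
    assert next_sample_size_no_failure(sample_size) == (
        next_sample_size_via_loop_no_failure(sample_size)
    )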


def test_success_rate():
tiny_set_analysis = analyse_measure_from_test_sample(1, 2)
assert tiny_set_analysis.proportion == 0.5
56 changes: 45 additions & 11 deletions examples/team_recommender/tests/test_proportions_ztest.py
@@ -1,7 +1,14 @@
from math import isnan

import pytest
from helpers import is_within_expected
from statsmodels.stats.proportion import proportions_ztest
from test_helpers import next_success_rate
from test_helpers import (
next_sample_size_no_failure,
next_sample_size_via_loop_with_1_failure,
next_sample_size_with_1_failure,
next_success_rate,
)


def test_proportions_ztest_improvement():
@@ -19,6 +26,14 @@ def test_proportions_ztest_exact_match():
assert p_value == 1.0, "statistically insignificant result"
assert stat == 0

stat, p_value = proportions_ztest(7, 10, 0.7, prop_var=1)
assert isnan(p_value)
assert isnan(stat)

stat, p_value = proportions_ztest(1, 10, 0.7, prop_var=0.5)
assert p_value == pytest.approx(0.00014, rel=0.1)
assert stat == pytest.approx(-3.79, rel=0.01)
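
Reviewer note: my reading of the statsmodels one-sample z-test is z = (count / nobs - value) / sqrt(p_var * (1 - p_var) / nobs), where p_var is the sample proportion unless prop_var overrides it. That would account for both new cases: prop_var=1 zeroes the variance term and the statistic becomes 0 / 0, hence NaN, while prop_var=0.5 gives roughly -3.79. A hand check under that assumption:

from math import sqrt
from statistics import NormalDist

# prop_var=0.5 case: 1 success out of 10 tested against a 0.7 baseline.
z = (1 / 10 - 0.7) / sqrt(0.5 * (1 - 0.5) / 10)  # ~ -3.79
p_value = 2 * NormalDist().cdf(-abs(z))          # two-sided, ~ 0.00015
# prop_var=1 case: the variance term is 1 * (1 - 1) / 10 == 0 and the observed
# proportion equals the baseline, so the statistic is 0 / 0 and comes back NaN.
print(z, p_value)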


def test_proportions_ztest_significantly_better():
stat, p_value = proportions_ztest(9, 10, 0.7)
@@ -53,24 +68,22 @@ def calculate_p_value(success, failure, sample_size) -> float:


def calculate_ztest(success, failure, sample_size) -> tuple[float, float]:
measurements = [int(success * sample_size), sample_size - failure]
samples = [sample_size, sample_size]
zstat, p_value = proportions_ztest(measurements, samples)
zstat, p_value = proportions_ztest(sample_size - failure, sample_size, value=success)
return zstat, p_value
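
The rewrite turns this into a one-sample test: the observed success count (sample_size - failure) is tested directly against the baseline proportion passed as success, rather than against a second synthetic sample. A usage sketch:

# 0 failures out of 10 runs, tested against a 70% baseline; equivalent to
# proportions_ztest(10, 10, value=0.7).
stat, p_value = calculate_ztest(0.7, 0, 10)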


def is_statistically_significant(success, failure, sample_size):
return calculate_p_value(success, failure, sample_size) < 0.05
return calculate_p_value(success, failure, sample_size) <= 0.05


def test_not_is_statistically_significant():
assert not is_statistically_significant(0.7, 3, 10), "same proportion"
assert not is_statistically_significant(0.9, 10, 100), "same proportion"
assert not is_statistically_significant(0.7, 30, 100), "same proportion"
assert not is_statistically_significant(0.7, 0, 10), "covers 100% success rate"


def test_is_statistically_significant():
assert is_statistically_significant(0.7, 0, 10), "70% does not cover 100% success rate"
assert is_statistically_significant(0.9, 0, 100), "0 out of 100 > 90% success rate"
assert is_statistically_significant(0.7, 0, 11), "0 out of 11 > 70% success rate"
assert is_statistically_significant(0.9, 0, 31), "0 out of 31 > 90% success rate"
@@ -79,30 +92,51 @@ def test_is_statistically_significant():

def test_is_statistically_significant_with_next_success_rate():
sample_size = 10
assert not is_statistically_significant(next_success_rate(sample_size), 0, sample_size)
assert is_statistically_significant(next_success_rate(sample_size), 0, 34)
assert is_statistically_significant(next_success_rate(sample_size), 0, sample_size)
assert is_statistically_significant(
next_success_rate(sample_size), 0, next_sample_size_with_1_failure(sample_size)
)
assert is_statistically_significant(next_success_rate(35), 0, 109)


def test_example_on_wiki():
sample_size = 47
success_rate = 0.950
assert is_within_expected(success_rate, 1, sample_size)
assert not is_statistically_significant(success_rate, 1, sample_size)
next_rate = next_success_rate(sample_size)
next_size = next_sample_size_no_failure(sample_size)
assert next_sample_size_via_loop_with_1_failure(sample_size) == 193
assert next_size == 97
assert next_rate == pytest.approx(0.98, rel=0.01)

assert not is_within_expected(0.95, 1, next_size)
assert not is_within_expected(next_rate, 0, next_size)
assert is_within_expected(next_rate, 1, next_size)

assert is_statistically_significant(next_rate, 0, next_size)
assert not is_statistically_significant(next_rate, 1, next_size)


def test_compare_is_within_expected_and_is_statistically_significant():
assert is_within_expected(0.7, 3, 10), "not significant result for 3/10=70%"
assert not is_statistically_significant(0.7, 3, 10), "not significant for 3/10=70%"

assert is_within_expected(0.7, 0, 3), "not significant result for 0 out of 3"
assert not is_statistically_significant(0.7, 0, 3), "not significant result for 0 out of 3"
assert is_statistically_significant(0.7, 0, 1000), "significant result for 0 out of 1000"


def test_improvement_from_70_percent():
assert is_within_expected(0.7, 0, 3), "no improvement detected at 3"
assert not is_statistically_significant(0.7, 0, 10), "no improvement detected at 10"
assert is_statistically_significant(0.7, 0, 10), "improvement detected at 10"

assert not is_within_expected(0.7, 0, 4), "improvement detected at 4"
assert is_statistically_significant(0.7, 0, 11), "improvement detected at 11"


def test_improvement_from_97_percent():
assert is_within_expected(0.97, 0, 66), "no improvement detected at 66"
assert not is_statistically_significant(0.97, 0, 100), "no improvement detected at 100"
assert is_statistically_significant(0.97, 0, 100), "improvement detected at 100"

assert not is_within_expected(0.97, 0, 67), "significantly better at 67"
assert is_statistically_significant(0.97, 0, 101), "significantly better at 101"