Skip to content

Commit 5d28592

Browse files
authored
Merge pull request #32 from PySATL/feat/p-value-calc
feat: add P-value calculator
2 parents 2939868 + e4051a2 commit 5d28592

File tree

9 files changed

+500
-70
lines changed

9 files changed

+500
-70
lines changed

pysatl_criterion/cv_calculator/cv_calculator/cv_calculator.py

Lines changed: 19 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
CriticalValueQuery,
66
ILimitDistributionStorage,
77
)
8+
from pysatl_criterion.statistics.models import HypothesisType
89

910

1011
class CVCalculator:
@@ -17,13 +18,20 @@ class CVCalculator:
1718
def __init__(self, limit_distribution_storage: ILimitDistributionStorage):
1819
self.limit_distribution_storage = limit_distribution_storage
1920

20-
def calculate_critical_value(self, criterion_code: str, sample_size: int, sl: float) -> float:
21+
def calculate_critical_value(
22+
self,
23+
criterion_code: str,
24+
sample_size: int,
25+
sl: float,
26+
alternative: HypothesisType = HypothesisType.RIGHT,
27+
) -> float | tuple[float, float]:
2128
"""
2229
Calculate critical value for given criterion.
2330
2431
:param criterion_code: criterion code.
2532
:param sample_size: sample size.
2633
:param sl: significance level.
34+
:param alternative: test alternative
2735
2836
:return: critical value.
2937
"""
@@ -40,36 +48,13 @@ def calculate_critical_value(self, criterion_code: str, sample_size: int, sl: fl
4048

4149
ecdf = scipy_stats.ecdf(statistics_values)
4250

43-
critical_value = float(np.quantile(ecdf.cdf.quantiles, q=1 - sl))
44-
45-
return critical_value
46-
47-
def calculate_two_tailed_critical_values(
48-
self, criterion_code: str, sample_size: int, sl: float
49-
) -> tuple[float, float]:
50-
"""
51-
Calculate critical values for two-tailed criterion.
52-
53-
:param criterion_code: criterion code.
54-
:param sample_size: sample size.
55-
:param sl: significance level.
56-
57-
:return: critical values.
58-
"""
59-
60-
query = CriticalValueQuery(criterion_code=criterion_code, sample_size=sample_size)
61-
62-
limit_distribution_from_db = self.limit_distribution_storage.get_data_for_cv(query)
63-
if limit_distribution_from_db is None:
64-
raise ValueError(
65-
"Limit distribution for given criterion and sample size does not exist."
66-
)
67-
68-
statistics_values = limit_distribution_from_db.results_statistics
69-
70-
ecdf = scipy_stats.ecdf(statistics_values)
71-
72-
critical_value_left = float(np.quantile(ecdf.cdf.quantiles, q=sl / 2))
73-
critical_value_right = float(np.quantile(ecdf.cdf.quantiles, q=1 - sl / 2))
74-
75-
return critical_value_left, critical_value_right
51+
if alternative == HypothesisType.RIGHT:
52+
return float(np.quantile(ecdf.cdf.quantiles, q=1 - sl))
53+
elif alternative == HypothesisType.LEFT:
54+
return float(np.quantile(ecdf.cdf.quantiles, q=sl))
55+
elif alternative == HypothesisType.TWO_TAILED:
56+
left = float(np.quantile(ecdf.cdf.quantiles, q=sl / 2))
57+
right = float(np.quantile(ecdf.cdf.quantiles, q=1 - sl / 2))
58+
return left, right
59+
else:
60+
raise ValueError("Unknown alternative.")

pysatl_criterion/p_value_calculator/__init__.py

Whitespace-only changes.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import scipy.stats as scipy_stats
2+
3+
from pysatl_criterion.persistence.limit_distribution.sqlite.sqlite import (
4+
SQLiteLimitDistributionStorage,
5+
)
6+
from pysatl_criterion.persistence.model.limit_distribution.limit_distribution import (
7+
CriticalValueQuery,
8+
)
9+
from pysatl_criterion.statistics.models import HypothesisType
10+
11+
12+
class PValueCalculator:
    """
    Empirical p-value calculator.

    P-values are derived from the simulated statistics kept in the limit
    distribution storage.

    :param limit_distribution_storage: limit distribution storage
    """

    def __init__(self, limit_distribution_storage: SQLiteLimitDistributionStorage):
        self.limit_distribution_storage = limit_distribution_storage

    def calculate_p_value(
        self,
        criterion_code: str,
        sample_size: int,
        statistics_value: float,
        alternative: HypothesisType = HypothesisType.RIGHT,
    ) -> float:
        """
        Calculate the p-value of an observed statistic.

        The stored statistics for the criterion and sample size are turned
        into an empirical CDF, which is evaluated at ``statistics_value``.

        :param criterion_code: criterion code
        :param sample_size: sample size
        :param statistics_value: observed statistics value
        :param alternative: test alternative

        :return: the CDF value for ``LEFT``, its complement for ``RIGHT``,
            twice the smaller of the two for ``TWO_TAILED``
        :raises ValueError: if no limit distribution is stored for the given
            criterion and sample size, or if the alternative is unknown
        """
        stored = self.limit_distribution_storage.get_data_for_cv(
            CriticalValueQuery(criterion_code=criterion_code, sample_size=sample_size)
        )
        if stored is None:
            raise ValueError(
                "Limit distribution for given criterion and sample size does not exist."
            )

        empirical_cdf = scipy_stats.ecdf(stored.results_statistics).cdf
        lower_tail = empirical_cdf.evaluate(statistics_value)

        # Guard-clause dispatch on the alternative; anything else is rejected.
        if alternative == HypothesisType.LEFT:
            return lower_tail
        if alternative == HypothesisType.RIGHT:
            return 1.0 - lower_tail
        if alternative == HypothesisType.TWO_TAILED:
            return 2.0 * min(lower_tail, 1.0 - lower_tail)
        raise ValueError("Unknown alternative")

pysatl_criterion/statistics/models.py

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,43 @@
11
from abc import ABC, abstractmethod
2+
from enum import Enum, auto
23

34
from numpy import float64
45

56

6-
class AbstractStatistic(ABC):
7-
two_tailed: bool = False
7+
class HypothesisType(Enum):
8+
"""
9+
Alternatives for hypotheses.
10+
"""
11+
12+
RIGHT = auto()
13+
LEFT = auto()
14+
TWO_TAILED = auto()
15+
16+
def check_hypothesis(
17+
self,
18+
statistic_value,
19+
cv: float | tuple[float, float],
20+
) -> bool:
21+
"""
22+
Compares the value of a statistic to a critical value
23+
param statistic_value: statistic value
24+
param cv: critical value
25+
26+
return: True if hypothesis is valid, False otherwise
27+
28+
"""
29+
if self == HypothesisType.RIGHT:
30+
return statistic_value <= cv
31+
if self == HypothesisType.LEFT:
32+
return statistic_value >= cv
33+
if self == HypothesisType.TWO_TAILED:
34+
if not isinstance(cv, tuple):
35+
raise TypeError("For a TWO_SIDED hypothesis, 'cv' must be a tuple of two floats.")
36+
left_cv, right_cv = cv
37+
return left_cv <= statistic_value <= right_cv
38+
839

40+
class AbstractStatistic(ABC):
941
@staticmethod
1042
@abstractmethod
1143
def code() -> str:
@@ -22,19 +54,3 @@ def execute_statistic(self, rvs, **kwargs) -> float | float64:
2254
:param kwargs: arguments for statistic calculation
2355
"""
2456
raise NotImplementedError("Method is not implemented")
25-
26-
def calculate_critical_value(self, rvs_size, sl) -> float | float64 | None:
27-
"""
28-
Calculate critical value for test statistics
29-
:param rvs_size: rvs size
30-
:param sl: significance level
31-
"""
32-
return None
33-
34-
def calculate_two_tailed_critical_values(self, rvs_size: int, sl) -> tuple[float, float] | None:
35-
"""
36-
Calculate two-tailed critical values for test statistics
37-
:param rvs_size: rvs size
38-
:param sl: significance level
39-
"""
40-
return None
Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
from pysatl_criterion.cv_calculator.cv_calculator.cv_calculator import CVCalculator
2+
from pysatl_criterion.p_value_calculator.p_value_calculator.p_value_calculator import (
3+
PValueCalculator,
4+
)
25
from pysatl_criterion.persistence.limit_distribution.sqlite.sqlite import (
36
SQLiteLimitDistributionStorage,
47
)
58
from pysatl_criterion.statistics.goodness_of_fit import AbstractGoodnessOfFitStatistic
9+
from pysatl_criterion.statistics.models import HypothesisType
610

711

812
class GoodnessOfFitTest:
@@ -11,17 +15,24 @@ class GoodnessOfFitTest:
1115
1216
:param statistics: statistics.
1317
:param significance_level: significance level.
18+
:param test_method: test method either 'critical_value' or 'p_value'.
19+
:param alternative: test alternative.
20+
1421
"""
1522

1623
def __init__(
1724
self,
1825
statistics: AbstractGoodnessOfFitStatistic,
1926
significance_level: float,
2027
db_connection_string: str = "sqlite:///limit_distributions.sqlite",
28+
test_method: str = "critical_value",
29+
alternative: HypothesisType = HypothesisType.RIGHT,
2130
):
2231
self.statistics = statistics
2332
self.significance_level = significance_level
2433
self.db_connection_string = db_connection_string
34+
self.test_method = test_method
35+
self.alternative = alternative
2536

2637
def test(self, data: list[float]) -> bool:
2738
"""
@@ -35,26 +46,29 @@ def test(self, data: list[float]) -> bool:
3546
limit_distribution_storage = SQLiteLimitDistributionStorage(self.db_connection_string)
3647
limit_distribution_storage.init()
3748

38-
cv_calculator = CVCalculator(limit_distribution_storage)
39-
4049
data_size = len(data)
4150
criterion_code = self.statistics.code()
4251
statistics_value = self.statistics.execute_statistic(data)
43-
if self.statistics.two_tailed:
44-
critical_value_left, critical_value_right = (
45-
cv_calculator.calculate_two_tailed_critical_values(
46-
criterion_code, data_size, self.significance_level
47-
)
52+
53+
if self.test_method == "critical_value":
54+
cv_calculator = CVCalculator(limit_distribution_storage)
55+
56+
critical_values = cv_calculator.calculate_critical_value(
57+
criterion_code,
58+
data_size,
59+
self.significance_level,
60+
self.alternative,
4861
)
49-
if critical_value_left <= statistics_value <= critical_value_right:
50-
return True
51-
else:
52-
return False
53-
else:
54-
critical_value = cv_calculator.calculate_critical_value(
55-
criterion_code, data_size, self.significance_level
62+
return self.alternative.check_hypothesis(statistics_value, critical_values)
63+
64+
elif self.test_method == "p_value":
65+
p_value_calculator = PValueCalculator(limit_distribution_storage)
66+
p_value = p_value_calculator.calculate_p_value(
67+
criterion_code,
68+
data_size,
69+
statistics_value,
70+
self.alternative,
5671
)
57-
if statistics_value <= critical_value:
58-
return True
59-
else:
60-
return False
72+
return p_value >= self.significance_level
73+
else:
74+
raise ValueError("Invalid test method.")

tests/calc/test_cv.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
from unittest.mock import MagicMock
2+
3+
import numpy as np
4+
import pytest
5+
6+
from pysatl_criterion.cv_calculator.cv_calculator.cv_calculator import CVCalculator
7+
from pysatl_criterion.statistics.models import HypothesisType
8+
9+
10+
def test_calc_critical_value_right_tailed():
    """Right-tailed critical value is the (1 - sl) quantile of the stored statistics."""
    # Storage stub that serves a limit distribution with statistics 0..99.
    distribution = MagicMock()
    distribution.results_statistics = np.array(range(100))
    storage = MagicMock()
    storage.get_data_for_cv.return_value = distribution

    result = CVCalculator(limit_distribution_storage=storage).calculate_critical_value(
        criterion_code="any_code",
        sample_size=100,
        sl=0.05,
        alternative=HypothesisType.RIGHT,
    )

    assert result == pytest.approx(94.05)
23+
24+
25+
def test_calc_critical_value_left_tailed():
    """Left-tailed critical value is the sl quantile of the stored statistics."""
    # Storage stub that serves a limit distribution with statistics 0..99.
    distribution = MagicMock()
    distribution.results_statistics = np.array(range(100))
    storage = MagicMock()
    storage.get_data_for_cv.return_value = distribution

    result = CVCalculator(limit_distribution_storage=storage).calculate_critical_value(
        criterion_code="any_code",
        sample_size=100,
        sl=0.05,
        alternative=HypothesisType.LEFT,
    )

    assert result == pytest.approx(4.95)
38+
39+
40+
def test_calc_critical_value_two_tailed():
    """Two-tailed critical values are the sl / 2 and 1 - sl / 2 quantiles."""
    # Storage stub that serves a limit distribution with statistics 0..99.
    distribution = MagicMock()
    distribution.results_statistics = np.array(range(100))
    storage = MagicMock()
    storage.get_data_for_cv.return_value = distribution

    result = CVCalculator(limit_distribution_storage=storage).calculate_critical_value(
        criterion_code="any_code",
        sample_size=100,
        sl=0.05,
        alternative=HypothesisType.TWO_TAILED,
    )

    expected_bounds = (2.475, 96.525)
    assert result == pytest.approx(expected_bounds)
53+
54+
55+
def test_calc_critical_value_raises_limit_distribution_error():
    """A missing limit distribution is reported as a ValueError."""
    # The storage stub finds nothing for the query.
    storage = MagicMock()
    storage.get_data_for_cv.return_value = None
    calculator = CVCalculator(limit_distribution_storage=storage)

    expected_message = "Limit distribution for given criterion and sample size does not exist."
    with pytest.raises(ValueError, match=expected_message):
        calculator.calculate_critical_value(criterion_code="any_code", sample_size=100, sl=0.05)

    # The storage must have been consulted exactly once before failing.
    storage.get_data_for_cv.assert_called_once()
66+
67+
68+
def test_calc_critical_value_raises_unknown_alternative():
    """An alternative that is not a known HypothesisType is rejected with ValueError."""
    # Storage stub that serves a limit distribution with statistics 0..99.
    distribution = MagicMock()
    distribution.results_statistics = np.array(range(100))
    storage = MagicMock()
    storage.get_data_for_cv.return_value = distribution
    calculator = CVCalculator(limit_distribution_storage=storage)

    with pytest.raises(ValueError, match="Unknown alternative"):
        calculator.calculate_critical_value(
            criterion_code="any_code",
            sample_size=100,
            sl=0.05,
            alternative="not_valid",
        )

    # The distribution is loaded before the alternative is inspected.
    storage.get_data_for_cv.assert_called_once()

0 commit comments

Comments
 (0)