
Commit 92a01b7

adam2392 authored and bloebp committed
Fix build docs
Signed-off-by: Adam Li <[email protected]>
1 parent b2f32cc commit 92a01b7

File tree

4 files changed (+49, -48 lines)


doc/api.rst

Lines changed: 1 addition & 1 deletion
@@ -63,4 +63,4 @@ of many data analysis procedures.
 
    fisherz
    kci
-   categorical
+   power_divergence

doc/conditional_independence.rst

Lines changed: 3 additions & 3 deletions
@@ -92,8 +92,8 @@ which may be unrealistic in certain datasets.
 
    fisherz
 
-:mod:`pywhy_stats.categorical` Discrete, Categorical and Binary Data
---------------------------------------------------------------------
+:mod:`pywhy_stats.power_divergence` Discrete, Categorical and Binary Data
+-------------------------------------------------------------------------
 If one has discrete data, then the test to use is based on Chi-square tests. The :math:`G^2`
 class of tests will construct a contingency table based on the number of levels across
 each discrete variable. An exponential amount of data is needed for increasing levels
@@ -102,7 +102,7 @@ for a discrete variable.
 .. autosummary::
    :toctree: generated/
 
-   categorical
+   power_divergence
 
 Kernel-Approaches
 -----------------
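
For context, the renamed module can be exercised as in the tests further down. The snippet below is a usage sketch only: the toy DataFrame and the noisy x/y relationship are made up, standing in for the adult dataset used in tests/test_power_divergence.py, and the `from pywhy_stats import power_divergence` import path is assumed from how the tests reference the module.

# Usage sketch mirroring the calls in tests/test_power_divergence.py below.
# The toy DataFrame and the relationship between x and y are made up.
import numpy as np
import pandas as pd

from pywhy_stats import power_divergence  # assumed import path for the renamed module

rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        "x": rng.integers(0, 2, size=1000),  # binary variable
        "z": rng.integers(0, 3, size=1000),  # independent 3-level variable
    }
)
# y follows x 90% of the time, so x and y are dependent.
df["y"] = np.where(rng.random(1000) < 0.9, df["x"], 1 - df["x"])

# Marginal test using the Pearson chi-square variant.
result = power_divergence.ind(X=df["x"], Y=df["y"], method="pearson")
print(result.statistic, result.pvalue, result.additional_information["dof"])

# Conditional test: x vs. y given z.
result = power_divergence.condind(
    X=df["x"], Y=df["y"], condition_on=df[["z"]], method="pearson"
)
print(result.pvalue)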

pywhy_stats/power_divergence.py

Lines changed: 24 additions & 23 deletions
@@ -1,6 +1,6 @@
 """Independence test among categorical variables using power-divergence tests.
 
-Works on categorical random variables. Based on the ``lambda_`` parameter, one
+Works on categorical random variables. Based on the ``method`` parameter, one
 can compute a wide variety of different categorical hypothesis tests.
 
 Categorical data is a type of data that can be divided into discrete groups.
@@ -54,14 +54,14 @@ def ind(
         The second node variable.
     method : float or string
         The lambda parameter for the power_divergence statistic. Some values of
-        ``lambda_`` results in other well known tests:
+        ``method`` results in other well known tests:
+
         "pearson"             1     "Chi-squared test"
         "log-likelihood"      0     "G-test or log-likelihood"
-        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
+        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
         "mod-log-likelihood"  -1    "Modified Log-likelihood"
         "neyman"              -2    "Neyman's statistic"
-        "cressie-read"        2/3   "The value recommended in the paper
-                                     :footcite:`cressieread1984`"
+        "cressie-read"        2/3   "The value recommended in the paper :footcite:`cressieread1984`"
     num_categories_allowed : int
         The maximum number of categories allowed in the input variables. Default
         of 10 is chosen to error out on large number of categories.
@@ -79,7 +79,7 @@ def ind(
     """
     X, Y, _ = _preprocess_inputs(X=X, Y=Y, Z=None)
     return _power_divergence(
-        X=X, Y=Y, Z=None, lambda_=method, num_categories_allowed=num_categories_allowed
+        X=X, Y=Y, Z=None, method=method, num_categories_allowed=num_categories_allowed
     )
 
 
@@ -106,14 +106,14 @@ def condind(
         The conditioning set.
     method : float or string
         The lambda parameter for the power_divergence statistic. Some values of
-        lambda_ results in other well known tests:
+        method results in other well known tests:
+
         "pearson"             1     "Chi-squared test"
         "log-likelihood"      0     "G-test or log-likelihood"
-        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
+        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
         "mod-log-likelihood"  -1    "Modified Log-likelihood"
         "neyman"              -2    "Neyman's statistic"
-        "cressie-read"        2/3   "The value recommended in the paper
-                                     :footcite:`cressieread1984`"
+        "cressie-read"        2/3   "The value recommended in the paper :footcite:`cressieread1984`"
     num_categories_allowed : int
         The maximum number of categories allowed in the input variables. Default
         of 10 is chosen to error out on large number of categories.
@@ -127,7 +127,7 @@ def condind(
     """
     X, Y, condition_on = _preprocess_inputs(X=X, Y=Y, Z=condition_on)
     return _power_divergence(
-        X=X, Y=Y, Z=condition_on, lambda_=method, num_categories_allowed=num_categories_allowed
+        X=X, Y=Y, Z=condition_on, method=method, num_categories_allowed=num_categories_allowed
     )
 
 
@@ -205,7 +205,7 @@ def _power_divergence(
     X: ArrayLike,
     Y: ArrayLike,
     Z: Optional[ArrayLike],
-    lambda_: str = "cressie-read",
+    method: str = "cressie-read",
     num_categories_allowed: int = 10,
 ) -> PValueResult:
     """Compute the Cressie-Read power divergence statistic.
@@ -218,16 +218,17 @@ def _power_divergence(
         The second node variable.
     Z : optional, ArrayLike of shape (n_samples, n_variables) of type np.int
        The conditioning set. If not defined, is `None`.
-    lambda_: float or string
+    method : float or string
         The lambda parameter for the power_divergence statistic. Some values of
-        lambda_ results in other well known tests:
-        "pearson"             1     "Chi-squared test"
-        "log-likelihood"      0     "G-test or log-likelihood"
-        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
-        "mod-log-likelihood"  -1    "Modified Log-likelihood"
-        "neyman"              -2    "Neyman's statistic"
-        "cressie-read"        2/3   "The value recommended in the paper
-                                     :footcite:`cressieread1984`"
+        method results in other well known tests:
+
+        "pearson"             1     "Chi-squared test"
+        "log-likelihood"      0     "G-test or log-likelihood"
+        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
+        "mod-log-likelihood"  -1    "Modified Log-likelihood"
+        "neyman"              -2    "Neyman's statistic"
+        "cressie-read"        2/3   "The value recommended in the paper
+                                     :footcite:`cressieread1984`"
     num_categories_allowed : int
         The maximum number of categories allowed in the input variables.
 
@@ -264,7 +265,7 @@ def _power_divergence(
     if Z is None:
         # Compute the contingency table
        observed_xy, _, _ = np.histogram2d(X, Y, bins=(np.unique(X).size, np.unique(Y).size))
-        chi, p_value, dof, expected = stats.chi2_contingency(observed_xy, lambda_=lambda_)
+        chi, p_value, dof, expected = stats.chi2_contingency(observed_xy, method=method)
 
     # Step 2: If there are conditionals variables, iterate over unique states and do
     # the contingency test.
@@ -309,7 +310,7 @@ def _power_divergence(
             sub_table_z = (
                 df.groupby(X_columns + Y_columns).size().unstack(Y_columns, fill_value=1e-7)
             )
-            c, _, d, _ = stats.chi2_contingency(sub_table_z, lambda_=lambda_)
+            c, _, d, _ = stats.chi2_contingency(sub_table_z, method=method)
             chi += c
             dof += d
         except ValueError:
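
For readers unfamiliar with the power-divergence family that the ``method`` strings above refer to, the following standalone sketch shows the same idea using scipy directly: build a contingency table, then evaluate the Cressie-Read statistic for a chosen family member. This is not the module's implementation, the toy data is made up, and note that scipy's own keyword for the family parameter is ``lambda_``.

# Standalone illustration of the method strings documented above, using scipy
# directly (scipy's keyword for the Cressie-Read family parameter is ``lambda_``).
# The toy data is made up for illustration.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = rng.integers(0, 3, size=500)  # 3-level categorical variable
y = rng.integers(0, 2, size=500)  # independent binary variable

# Contingency table of observed counts, as in the unconditional branch above.
observed_xy, _, _ = np.histogram2d(x, y, bins=(np.unique(x).size, np.unique(y).size))

for name in ["pearson", "log-likelihood", "freeman-tukey",
             "mod-log-likelihood", "neyman", "cressie-read"]:
    chi, p_value, dof, _ = stats.chi2_contingency(observed_xy, lambda_=name)
    print(f"{name:>20s}: statistic={chi:.2f}, dof={dof}, p={p_value:.3f}")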

tests/test_power_divergence.py

Lines changed: 21 additions & 21 deletions
@@ -36,29 +36,29 @@ def test_chisquare_marginal_independence_adult_dataset():
     Uses the test data from dagitty.
     """
     # Comparision values taken from dagitty (DAGitty)
-    lambda_ = "pearson"
+    method = "pearson"
     X = df_adult["Age"]
     Y = df_adult["Immigrant"]
-    result = power_divergence.ind(X=X, Y=Y, method=lambda_)
+    result = power_divergence.ind(X=X, Y=Y, method=method)
     assert_almost_equal(result.statistic, 57.75, decimal=1)
     assert_almost_equal(np.log(result.pvalue), -25.47, decimal=1)
     assert result.additional_information["dof"] == 4
 
     Y = df_adult["Race"]
-    result = power_divergence.ind(X=X, Y=Y, method=lambda_)
+    result = power_divergence.ind(X=X, Y=Y, method=method)
     assert_almost_equal(result.statistic, 56.25, decimal=1)
     assert_almost_equal(np.log(result.pvalue), -24.75, decimal=1)
     assert result.additional_information["dof"] == 4
 
     Y = df_adult["Sex"]
-    result = power_divergence.ind(X=X, Y=Y, method=lambda_)
+    result = power_divergence.ind(X=X, Y=Y, method=method)
     assert_almost_equal(result.statistic, 289.62, decimal=1)
     assert_almost_equal(np.log(result.pvalue), -139.82, decimal=1)
     assert result.additional_information["dof"] == 4
 
     X = df_adult["Immigrant"]
     Y = df_adult["Sex"]
-    result = power_divergence.ind(X=X, Y=Y, method=lambda_)
+    result = power_divergence.ind(X=X, Y=Y, method=method)
     assert_almost_equal(result.statistic, 0.2724, decimal=1)
     assert_almost_equal(np.log(result.pvalue), -0.50, decimal=1)
     assert result.additional_information["dof"] == 1
@@ -69,18 +69,18 @@ def test_chisquare_conditional_independence_adult_dataset():
 
     Uses the test data from dagitty.
     """
-    lambda_ = "pearson"
+    method = "pearson"
     X = df_adult["Education"]
     Y = df_adult["HoursPerWeek"]
     condition_on = df_adult[["Age", "Immigrant", "Race", "Sex"]]
-    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=lambda_)
+    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=method)
     assert_almost_equal(result.statistic, 1460.11, decimal=1)
     assert_almost_equal(result.pvalue, 0, decimal=1)
     assert result.additional_information["dof"] == 316
 
     Y = df_adult["MaritalStatus"]
     condition_on = df_adult[["Age", "Sex"]]
-    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=lambda_)
+    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=method)
     assert_almost_equal(result.statistic, 481.96, decimal=1)
     assert_almost_equal(result.pvalue, 0, decimal=1)
     assert result.additional_information["dof"] == 58
@@ -90,7 +90,7 @@ def test_chisquare_conditional_independence_adult_dataset():
     X = df_adult["Income"]
     Y = df_adult["Race"]
     condition_on = df_adult[["Age", "Education", "HoursPerWeek", "MaritalStatus"]]
-    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=lambda_)
+    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=method)
 
     assert_almost_equal(result.statistic, 66.39, decimal=1)
     assert_almost_equal(result.pvalue, 0.99, decimal=1)
@@ -99,14 +99,14 @@ def test_chisquare_conditional_independence_adult_dataset():
     X = df_adult["Immigrant"]
     Y = df_adult["Income"]
     condition_on = df_adult[["Age", "Education", "HoursPerWeek", "MaritalStatus"]]
-    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=lambda_)
+    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=method)
     assert_almost_equal(result.statistic, 65.59, decimal=1)
     assert_almost_equal(result.pvalue, 0.999, decimal=2)
     assert result.additional_information["dof"] == 131
 
 
 @pytest.mark.parametrize(
-    "lambda_",
+    "method",
     [
         "pearson",  # chi-square
         "log-likelihood",  # G^2
@@ -116,17 +116,17 @@ def test_chisquare_conditional_independence_adult_dataset():
         "cressie-read",  # Cressie-read
     ],
 )
-def test_chisquare_when_dependent_given_different_lambda_on_testdata(lambda_):
+def test_chisquare_when_dependent_given_different_methodon_testdata(method):
     assert (
-        power_divergence.ind(X=df_adult["Age"], Y=df_adult["Immigrant"], method=lambda_).pvalue
+        power_divergence.ind(X=df_adult["Age"], Y=df_adult["Immigrant"], method=method).pvalue
         < 0.05
     )
 
-    assert power_divergence.ind(X=df_adult["Age"], Y=df_adult["Race"], method=lambda_).pvalue < 0.05
+    assert power_divergence.ind(X=df_adult["Age"], Y=df_adult["Race"], method=method).pvalue < 0.05
 
-    assert power_divergence.ind(X=df_adult["Age"], Y=df_adult["Sex"], method=lambda_).pvalue < 0.05
+    assert power_divergence.ind(X=df_adult["Age"], Y=df_adult["Sex"], method=method).pvalue < 0.05
     assert (
-        power_divergence.ind(X=df_adult["Immigrant"], Y=df_adult["Sex"], method=lambda_).pvalue
+        power_divergence.ind(X=df_adult["Immigrant"], Y=df_adult["Sex"], method=method).pvalue
         >= 0.05
     )
 
@@ -135,7 +135,7 @@ def test_chisquare_when_dependent_given_different_lambda_on_testdata(lambda_):
             X=df_adult["Education"],
             Y=df_adult["HoursPerWeek"],
             condition_on=df_adult[["Age", "Immigrant", "Race", "Sex"]],
-            method=lambda_,
+            method=method,
         ).pvalue
         < 0.05
     )
@@ -144,14 +144,14 @@ def test_chisquare_when_dependent_given_different_lambda_on_testdata(lambda_):
             X=df_adult["Education"],
             Y=df_adult["MaritalStatus"],
             condition_on=df_adult[["Age", "Sex"]],
-            method=lambda_,
+            method=method,
        ).pvalue
         < 0.05
     )
 
 
 @pytest.mark.parametrize(
-    "lambda_",
+    "method",
     [
         "pearson",  # chi-square
         "log-likelihood",  # G^2
@@ -161,12 +161,12 @@ def test_chisquare_when_dependent_given_different_lambda_on_testdata(lambda_):
         "cressie-read",  # Cressie-read
     ],
 )
-def test_chisquare_when_exactly_dependent_given_different_lambda_(lambda_):
+def test_chisquare_when_exactly_dependent_given_different_method(method):
     x = np.random.choice([0, 1], size=1000)
     y = x.copy()
     df = pd.DataFrame({"x": x, "y": y})
 
-    result = power_divergence.ind(X=df["x"], Y=df["y"], method=lambda_)
+    result = power_divergence.ind(X=df["x"], Y=df["y"], method=method)
     assert result.additional_information["dof"] == 1
     assert_almost_equal(result.pvalue, 0, decimal=5)
 