
Commit 92a01b7

adam2392 authored and bloebp committed
Fix build docs
Signed-off-by: Adam Li <[email protected]>
1 parent b2f32cc commit 92a01b7

File tree

4 files changed (+49, -48 lines)


doc/api.rst

Lines changed: 1 addition & 1 deletion
@@ -63,4 +63,4 @@ of many data analysis procedures.
 
    fisherz
    kci
-   categorical
+   power_divergence

doc/conditional_independence.rst

Lines changed: 3 additions & 3 deletions
@@ -92,8 +92,8 @@ which may be unrealistic in certain datasets.
 
    fisherz
 
-:mod:`pywhy_stats.categorical` Discrete, Categorical and Binary Data
---------------------------------------------------------------------
+:mod:`pywhy_stats.power_divergence` Discrete, Categorical and Binary Data
+-------------------------------------------------------------------------
 If one has discrete data, then the test to use is based on Chi-square tests. The :math:`G^2`
 class of tests will construct a contingency table based on the number of levels across
 each discrete variable. An exponential amount of data is needed for increasing levels
@@ -102,7 +102,7 @@ for a discrete variable.
 .. autosummary::
    :toctree: generated/
 
-   categorical
+   power_divergence
 
 Kernel-Approaches
 -----------------
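
For context, the renamed module can be exercised as in the tests further down. The snippet below is a usage sketch only: the toy DataFrame and the noisy x/y relationship are made up, standing in for the adult dataset used in tests/test_power_divergence.py, and the `from pywhy_stats import power_divergence` import path is assumed from how the tests reference the module.

# Usage sketch mirroring the calls in tests/test_power_divergence.py below.
# The toy DataFrame and the relationship between x and y are made up.
import numpy as np
import pandas as pd

from pywhy_stats import power_divergence  # assumed import path for the renamed module

rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        "x": rng.integers(0, 2, size=1000),  # binary variable
        "z": rng.integers(0, 3, size=1000),  # independent 3-level variable
    }
)
# y follows x 90% of the time, so x and y are dependent.
df["y"] = np.where(rng.random(1000) < 0.9, df["x"], 1 - df["x"])

# Marginal test using the Pearson chi-square variant.
result = power_divergence.ind(X=df["x"], Y=df["y"], method="pearson")
print(result.statistic, result.pvalue, result.additional_information["dof"])

# Conditional test: x vs. y given z.
result = power_divergence.condind(
    X=df["x"], Y=df["y"], condition_on=df[["z"]], method="pearson"
)
print(result.pvalue)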

pywhy_stats/power_divergence.py

Lines changed: 24 additions & 23 deletions
@@ -1,6 +1,6 @@
 """Independence test among categorical variables using power-divergence tests.
 
-Works on categorical random variables. Based on the ``lambda_`` parameter, one
+Works on categorical random variables. Based on the ``method`` parameter, one
 can compute a wide variety of different categorical hypothesis tests.
 
 Categorical data is a type of data that can be divided into discrete groups.
@@ -54,14 +54,14 @@ def ind(
         The second node variable.
     method : float or string
         The lambda parameter for the power_divergence statistic. Some values of
-        ``lambda_`` results in other well known tests:
+        ``method`` results in other well known tests:
+
         "pearson"             1     "Chi-squared test"
         "log-likelihood"      0     "G-test or log-likelihood"
-        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
+        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
         "mod-log-likelihood"  -1    "Modified Log-likelihood"
         "neyman"              -2    "Neyman's statistic"
-        "cressie-read"        2/3   "The value recommended in the paper
-                                     :footcite:`cressieread1984`"
+        "cressie-read"        2/3   "The value recommended in the paper :footcite:`cressieread1984`"
     num_categories_allowed : int
         The maximum number of categories allowed in the input variables. Default
         of 10 is chosen to error out on large number of categories.
@@ -79,7 +79,7 @@ def ind(
     """
     X, Y, _ = _preprocess_inputs(X=X, Y=Y, Z=None)
     return _power_divergence(
-        X=X, Y=Y, Z=None, lambda_=method, num_categories_allowed=num_categories_allowed
+        X=X, Y=Y, Z=None, method=method, num_categories_allowed=num_categories_allowed
     )
 
 
@@ -106,14 +106,14 @@ def condind(
         The conditioning set.
     method : float or string
         The lambda parameter for the power_divergence statistic. Some values of
-        lambda_ results in other well known tests:
+        method results in other well known tests:
+
         "pearson"             1     "Chi-squared test"
         "log-likelihood"      0     "G-test or log-likelihood"
-        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
+        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
         "mod-log-likelihood"  -1    "Modified Log-likelihood"
         "neyman"              -2    "Neyman's statistic"
-        "cressie-read"        2/3   "The value recommended in the paper
-                                     :footcite:`cressieread1984`"
+        "cressie-read"        2/3   "The value recommended in the paper :footcite:`cressieread1984`"
     num_categories_allowed : int
         The maximum number of categories allowed in the input variables. Default
         of 10 is chosen to error out on large number of categories.
@@ -127,7 +127,7 @@ def condind(
     """
     X, Y, condition_on = _preprocess_inputs(X=X, Y=Y, Z=condition_on)
     return _power_divergence(
-        X=X, Y=Y, Z=condition_on, lambda_=method, num_categories_allowed=num_categories_allowed
+        X=X, Y=Y, Z=condition_on, method=method, num_categories_allowed=num_categories_allowed
     )
 
 
@@ -205,7 +205,7 @@ def _power_divergence(
     X: ArrayLike,
     Y: ArrayLike,
     Z: Optional[ArrayLike],
-    lambda_: str = "cressie-read",
+    method: str = "cressie-read",
     num_categories_allowed: int = 10,
 ) -> PValueResult:
     """Compute the Cressie-Read power divergence statistic.
@@ -218,16 +218,17 @@ def _power_divergence(
         The second node variable.
     Z : optional, ArrayLike of shape (n_samples, n_variables) of type np.int
        The conditioning set. If not defined, is `None`.
-    lambda_: float or string
+    method : float or string
         The lambda parameter for the power_divergence statistic. Some values of
-        lambda_ results in other well known tests:
-        "pearson"             1     "Chi-squared test"
-        "log-likelihood"      0     "G-test or log-likelihood"
-        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
-        "mod-log-likelihood"  -1    "Modified Log-likelihood"
-        "neyman"              -2    "Neyman's statistic"
-        "cressie-read"        2/3   "The value recommended in the paper
-                                     :footcite:`cressieread1984`"
+        method results in other well known tests:
+
+        "pearson"             1     "Chi-squared test"
+        "log-likelihood"      0     "G-test or log-likelihood"
+        "freeman-tukey"       -1/2  "freeman-tukey Statistic"
+        "mod-log-likelihood"  -1    "Modified Log-likelihood"
+        "neyman"              -2    "Neyman's statistic"
+        "cressie-read"        2/3   "The value recommended in the paper
+                                     :footcite:`cressieread1984`"
     num_categories_allowed : int
         The maximum number of categories allowed in the input variables.
 
@@ -264,7 +265,7 @@ def _power_divergence(
     if Z is None:
         # Compute the contingency table
        observed_xy, _, _ = np.histogram2d(X, Y, bins=(np.unique(X).size, np.unique(Y).size))
-        chi, p_value, dof, expected = stats.chi2_contingency(observed_xy, lambda_=lambda_)
+        chi, p_value, dof, expected = stats.chi2_contingency(observed_xy, method=method)
 
     # Step 2: If there are conditionals variables, iterate over unique states and do
     # the contingency test.
@@ -309,7 +310,7 @@ def _power_divergence(
             sub_table_z = (
                 df.groupby(X_columns + Y_columns).size().unstack(Y_columns, fill_value=1e-7)
             )
-            c, _, d, _ = stats.chi2_contingency(sub_table_z, lambda_=lambda_)
+            c, _, d, _ = stats.chi2_contingency(sub_table_z, method=method)
             chi += c
             dof += d
         except ValueError:
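
For readers unfamiliar with the power-divergence family that the ``method`` strings above refer to, the following standalone sketch shows the same idea using scipy directly: build a contingency table, then evaluate the Cressie-Read statistic for a chosen family member. This is not the module's implementation, the toy data is made up, and note that scipy's own keyword for the family parameter is ``lambda_``.

# Standalone illustration of the method strings documented above, using scipy
# directly (scipy's keyword for the Cressie-Read family parameter is ``lambda_``).
# The toy data is made up for illustration.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = rng.integers(0, 3, size=500)  # 3-level categorical variable
y = rng.integers(0, 2, size=500)  # independent binary variable

# Contingency table of observed counts, as in the unconditional branch above.
observed_xy, _, _ = np.histogram2d(x, y, bins=(np.unique(x).size, np.unique(y).size))

for name in ["pearson", "log-likelihood", "freeman-tukey",
             "mod-log-likelihood", "neyman", "cressie-read"]:
    chi, p_value, dof, _ = stats.chi2_contingency(observed_xy, lambda_=name)
    print(f"{name:>20s}: statistic={chi:.2f}, dof={dof}, p={p_value:.3f}")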

tests/test_power_divergence.py

Lines changed: 21 additions & 21 deletions
@@ -36,29 +36,29 @@ def test_chisquare_marginal_independence_adult_dataset():
     Uses the test data from dagitty.
     """
     # Comparision values taken from dagitty (DAGitty)
-    lambda_ = "pearson"
+    method = "pearson"
     X = df_adult["Age"]
     Y = df_adult["Immigrant"]
-    result = power_divergence.ind(X=X, Y=Y, method=lambda_)
+    result = power_divergence.ind(X=X, Y=Y, method=method)
     assert_almost_equal(result.statistic, 57.75, decimal=1)
     assert_almost_equal(np.log(result.pvalue), -25.47, decimal=1)
     assert result.additional_information["dof"] == 4
 
     Y = df_adult["Race"]
-    result = power_divergence.ind(X=X, Y=Y, method=lambda_)
+    result = power_divergence.ind(X=X, Y=Y, method=method)
     assert_almost_equal(result.statistic, 56.25, decimal=1)
     assert_almost_equal(np.log(result.pvalue), -24.75, decimal=1)
     assert result.additional_information["dof"] == 4
 
     Y = df_adult["Sex"]
-    result = power_divergence.ind(X=X, Y=Y, method=lambda_)
+    result = power_divergence.ind(X=X, Y=Y, method=method)
     assert_almost_equal(result.statistic, 289.62, decimal=1)
     assert_almost_equal(np.log(result.pvalue), -139.82, decimal=1)
     assert result.additional_information["dof"] == 4
 
     X = df_adult["Immigrant"]
     Y = df_adult["Sex"]
-    result = power_divergence.ind(X=X, Y=Y, method=lambda_)
+    result = power_divergence.ind(X=X, Y=Y, method=method)
     assert_almost_equal(result.statistic, 0.2724, decimal=1)
     assert_almost_equal(np.log(result.pvalue), -0.50, decimal=1)
     assert result.additional_information["dof"] == 1
@@ -69,18 +69,18 @@ def test_chisquare_conditional_independence_adult_dataset():
 
     Uses the test data from dagitty.
     """
-    lambda_ = "pearson"
+    method = "pearson"
     X = df_adult["Education"]
     Y = df_adult["HoursPerWeek"]
     condition_on = df_adult[["Age", "Immigrant", "Race", "Sex"]]
-    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=lambda_)
+    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=method)
     assert_almost_equal(result.statistic, 1460.11, decimal=1)
     assert_almost_equal(result.pvalue, 0, decimal=1)
     assert result.additional_information["dof"] == 316
 
     Y = df_adult["MaritalStatus"]
     condition_on = df_adult[["Age", "Sex"]]
-    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=lambda_)
+    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=method)
     assert_almost_equal(result.statistic, 481.96, decimal=1)
     assert_almost_equal(result.pvalue, 0, decimal=1)
     assert result.additional_information["dof"] == 58
@@ -90,7 +90,7 @@ def test_chisquare_conditional_independence_adult_dataset():
     X = df_adult["Income"]
     Y = df_adult["Race"]
     condition_on = df_adult[["Age", "Education", "HoursPerWeek", "MaritalStatus"]]
-    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=lambda_)
+    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=method)
 
     assert_almost_equal(result.statistic, 66.39, decimal=1)
     assert_almost_equal(result.pvalue, 0.99, decimal=1)
@@ -99,14 +99,14 @@ def test_chisquare_conditional_independence_adult_dataset():
     X = df_adult["Immigrant"]
     Y = df_adult["Income"]
     condition_on = df_adult[["Age", "Education", "HoursPerWeek", "MaritalStatus"]]
-    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=lambda_)
+    result = power_divergence.condind(X=X, Y=Y, condition_on=condition_on, method=method)
     assert_almost_equal(result.statistic, 65.59, decimal=1)
     assert_almost_equal(result.pvalue, 0.999, decimal=2)
     assert result.additional_information["dof"] == 131
 
 
 @pytest.mark.parametrize(
-    "lambda_",
+    "method",
     [
         "pearson",  # chi-square
         "log-likelihood",  # G^2
@@ -116,17 +116,17 @@ def test_chisquare_conditional_independence_adult_dataset():
         "cressie-read",  # Cressie-read
     ],
 )
-def test_chisquare_when_dependent_given_different_lambda_on_testdata(lambda_):
+def test_chisquare_when_dependent_given_different_methodon_testdata(method):
     assert (
-        power_divergence.ind(X=df_adult["Age"], Y=df_adult["Immigrant"], method=lambda_).pvalue
+        power_divergence.ind(X=df_adult["Age"], Y=df_adult["Immigrant"], method=method).pvalue
         < 0.05
     )
 
-    assert power_divergence.ind(X=df_adult["Age"], Y=df_adult["Race"], method=lambda_).pvalue < 0.05
+    assert power_divergence.ind(X=df_adult["Age"], Y=df_adult["Race"], method=method).pvalue < 0.05
 
-    assert power_divergence.ind(X=df_adult["Age"], Y=df_adult["Sex"], method=lambda_).pvalue < 0.05
+    assert power_divergence.ind(X=df_adult["Age"], Y=df_adult["Sex"], method=method).pvalue < 0.05
     assert (
-        power_divergence.ind(X=df_adult["Immigrant"], Y=df_adult["Sex"], method=lambda_).pvalue
+        power_divergence.ind(X=df_adult["Immigrant"], Y=df_adult["Sex"], method=method).pvalue
         >= 0.05
     )
 
@@ -135,7 +135,7 @@ def test_chisquare_when_dependent_given_different_lambda_on_testdata(lambda_):
             X=df_adult["Education"],
             Y=df_adult["HoursPerWeek"],
             condition_on=df_adult[["Age", "Immigrant", "Race", "Sex"]],
-            method=lambda_,
+            method=method,
         ).pvalue
         < 0.05
     )
@@ -144,14 +144,14 @@ def test_chisquare_when_dependent_given_different_lambda_on_testdata(lambda_):
             X=df_adult["Education"],
             Y=df_adult["MaritalStatus"],
             condition_on=df_adult[["Age", "Sex"]],
-            method=lambda_,
+            method=method,
        ).pvalue
         < 0.05
     )
 
 
 @pytest.mark.parametrize(
-    "lambda_",
+    "method",
     [
         "pearson",  # chi-square
         "log-likelihood",  # G^2
@@ -161,12 +161,12 @@ def test_chisquare_when_dependent_given_different_lambda_on_testdata(lambda_):
         "cressie-read",  # Cressie-read
     ],
 )
-def test_chisquare_when_exactly_dependent_given_different_lambda_(lambda_):
+def test_chisquare_when_exactly_dependent_given_different_method(method):
     x = np.random.choice([0, 1], size=1000)
     y = x.copy()
     df = pd.DataFrame({"x": x, "y": y})
 
-    result = power_divergence.ind(X=df["x"], Y=df["y"], method=lambda_)
+    result = power_divergence.ind(X=df["x"], Y=df["y"], method=method)
     assert result.additional_information["dof"] == 1
     assert_almost_equal(result.pvalue, 0, decimal=5)
 