added support for ordered categoricals in kendall and spearman correlation

Michele Pau · Michele Pau · commit 6eefa20fb7a2 · 2024-12-04T13:47:05.000Z
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -56,6 +56,7 @@ Other enhancements
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
+- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`)
 - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
 - :meth:`str.get_dummies` now accepts a  ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -11034,6 +11034,10 @@ def corr(
         data = self._get_numeric_data() if numeric_only else self
         cols = data.columns
         idx = cols.copy()
+
+        if method in ("spearman", "kendall"):
+            data = data._convert_ordered_cat_to_code()
+
         mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
 
         if method == "pearson":
@@ -11321,6 +11325,8 @@ def corrwith(
             correl = num / dom
 
         elif method in ["kendall", "spearman"] or callable(method):
+            left = left._convert_ordered_cat_to_code()
+            right = right._convert_ordered_cat_to_code()
 
             def c(x):
                 return nanops.nancorr(x[0], x[1], method=method)
@@ -11352,6 +11358,24 @@ def c(x):
 
         return correl
 
+    def _convert_ordered_cat_to_code(self) -> DataFrame:
+        """
+        Return a frame with each ordered categorical column replaced by its
+        codes (code -1 becomes NaN); returns ``self`` when none qualify.
+        """
+        categ = self.select_dtypes("category")
+        if len(categ.columns) == 0:
+            return self
+
+        cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns
+        if len(cols_convert) == 0:
+            return self  # only unordered categoricals: nothing to convert
+        data = self.copy(deep=False)
+        data[cols_convert] = data[cols_convert].transform(
+            lambda x: x.cat.codes.replace(-1, np.nan)
+        )
+        return data
+
     # ----------------------------------------------------------------------
     # ndarray-like stats methods
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2680,6 +2680,12 @@ def corr(
         if len(this) == 0:
             return np.nan
 
+        if method in ("spearman", "kendall"):
+            if this.dtype == "category" and this.cat.ordered:
+                this = this.cat.codes.replace(-1, np.nan)
+            if other.dtype == "category" and other.cat.ordered:
+                other = other.cat.codes.replace(-1, np.nan)
+
         this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
         other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)
 
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -7,6 +7,7 @@
 
 import pandas as pd
 from pandas import (
+    Categorical,
     DataFrame,
     Index,
     Series,
@@ -16,6 +17,19 @@
 import pandas._testing as tm
 
 
+@pytest.fixture
+def categorical_frame():
+    """Frame of two ordered categorical columns, each holding some NaNs."""
+    col_a = Categorical(list("abcde") * 6, list("bacde"), ordered=True)
+    col_b = Categorical(list("123") * 10, list("321"), ordered=True)
+    df = DataFrame({"A": col_a, "B": col_b})
+    # knock out overlapping row ranges so both columns carry missing values
+    idx = df.index
+    df.loc[idx[:5], "A"] = np.nan
+    df.loc[idx[3:6], "B"] = np.nan
+    return df
+
+
 class TestDataFrameCov:
     def test_cov(self, float_frame, float_string_frame):
         # min_periods no NAs (corner case)
@@ -116,6 +130,13 @@ def test_corr_scipy_method(self, float_frame, method):
         expected = float_frame["A"].corr(float_frame["C"], method=method)
         tm.assert_almost_equal(correls["A"]["C"], expected)
 
+    @pytest.mark.parametrize("method", ["kendall", "spearman"])
+    def test_corr_scipy_method_category(self, method, categorical_frame):
+        pytest.importorskip("scipy")
+        col_a, col_b = categorical_frame["A"], categorical_frame["B"]
+        matrix = categorical_frame.corr(method=method)
+        tm.assert_almost_equal(matrix["A"]["B"], col_a.corr(col_b, method=method))
+
     # ---------------------------------------------------------------------
 
     def test_corr_non_numeric(self, float_string_frame):
@@ -303,6 +324,14 @@ def test_corrwith(self, datetime_frame, dtype):
         dropped = a.corrwith(b, axis=1, drop=True)
         assert a.index[-1] not in dropped.index
 
+    @pytest.mark.parametrize("method", ["spearman", "kendall"])
+    def test_corrwith_categorical(self, categorical_frame, method):
+        target = categorical_frame["B"]
+        actual = categorical_frame.corrwith(target, method=method)
+        expect = categorical_frame.agg(lambda col: col.corr(target, method=method))
+        for name in ("A", "B"):
+            tm.assert_almost_equal(actual[name], expect[name])
+
     def test_corrwith_non_timeseries_data(self):
         index = ["a", "b", "c", "d", "e"]
         columns = ["one", "two", "three", "four"]
diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py
@@ -5,13 +5,43 @@
 
 import pandas as pd
 from pandas import (
+    Categorical,
     Series,
     date_range,
     isna,
 )
 import pandas._testing as tm
 
 
+@pytest.fixture
+def A():
+    """Daily float series whose values run 0..4 twice."""
+    half = np.arange(5, dtype=np.float64)
+    values = np.concatenate([half, half])
+    dates = date_range("2020-01-01", periods=10)
+    return Series(values, index=dates, name="ts")
+
+
+@pytest.fixture
+def B():
+    """Daily float series holding the values 0..9 in increasing order."""
+    dates = date_range("2020-01-01", periods=10)
+    values = np.arange(10, dtype=np.float64)
+
+    return Series(values, index=dates, name="ts")
+
+
+@pytest.fixture
+def C():
+    """Ordered categorical series (categories "5" < ... < "1") with one NaN."""
+    cat = Categorical(list("12345") * 2, categories=list("54321"), ordered=True)
+    dates = date_range("2020-01-01", periods=10)
+    ser = Series(data=cat, index=dates, name="categorical")
+    # blank out one entry so missing-value handling is exercised
+    ser["2020-01-03"] = np.nan
+    return ser
+
+
 class TestSeriesCov:
     def test_cov(self, datetime_series):
         # full overlap
@@ -56,7 +86,7 @@ def test_cov_ddof(self, test_ddof, dtype):
 
 
 class TestSeriesCorr:
-    def test_corr(self, datetime_series, any_float_dtype):
+    def test_corr(self, B, datetime_series, any_float_dtype):
         stats = pytest.importorskip("scipy.stats")
 
         datetime_series = datetime_series.astype(any_float_dtype)
@@ -81,29 +111,14 @@ def test_corr(self, datetime_series, any_float_dtype):
         cp[:] = np.nan
         assert isna(cp.corr(cp))
 
-        A = Series(
-            np.arange(10, dtype=np.float64),
-            index=date_range("2020-01-01", periods=10),
-            name="ts",
-        )
-        result = A.corr(A)
-        expected, _ = stats.pearsonr(A, A)
+        result = B.corr(B)
+        expected, _ = stats.pearsonr(B, B)
         tm.assert_almost_equal(result, expected)
 
-    def test_corr_rank(self):
+    def test_corr_rank(self, A, B):
         stats = pytest.importorskip("scipy.stats")
 
         # kendall and spearman
-        B = Series(
-            np.arange(10, dtype=np.float64),
-            index=date_range("2020-01-01", periods=10),
-            name="ts",
-        )
-        A = Series(
-            np.concatenate([np.arange(5, dtype=np.float64)] * 2),
-            index=date_range("2020-01-01", periods=10),
-            name="ts",
-        )
         result = A.corr(B, method="kendall")
         expected = stats.kendalltau(A, B)[0]
         tm.assert_almost_equal(result, expected)
@@ -146,6 +161,29 @@ def test_corr_rank(self):
         tm.assert_almost_equal(A.corr(B, method="kendall"), kexp)
         tm.assert_almost_equal(A.corr(B, method="spearman"), sexp)
 
+    def test_corr_category(self, A, C):
+        stats = pytest.importorskip("scipy.stats")
+
+        def get_codes(s: Series) -> Series:
+            return s.cat.codes.replace(-1, np.nan)  # use the argument, not C
+
+        result = A.corr(C, method="pearson")
+        expected = stats.pearsonr(A[C.notna()], C.dropna().astype("float"))[0]
+        tm.assert_almost_equal(result, expected)
+        tm.assert_almost_equal(result, 1)
+
+        result = A.corr(C, method="spearman")
+        expected = stats.spearmanr(A, get_codes(C), nan_policy="omit")[0]
+        expected_pearson = stats.pearsonr(A[C.notna()], get_codes(C).dropna())[0]
+        # codes run opposite to A's values, hence a rank correlation of -1
+        tm.assert_almost_equal(result, expected)
+        tm.assert_almost_equal(result, expected_pearson)
+        tm.assert_almost_equal(result, -1)
+
+        result = A.corr(C, method="kendall")
+        expected = stats.kendalltau(A, get_codes(C), nan_policy="omit")[0]
+        tm.assert_almost_equal(result, expected)
+
     def test_corr_invalid_method(self):
         # GH PR #22298
         s1 = Series(np.random.default_rng(2).standard_normal(10))