Merge branch 'main' into sainivas-pandas

sainivas-99 · web-flow · commit 6ec1b331e7c8 · 2025-04-28T14:48:41.000-05:00
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -153,7 +153,7 @@ jobs:
         run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.23.2
+        uses: pypa/cibuildwheel@v2.23.3
         with:
          package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
         env:
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -806,6 +806,7 @@ Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 - Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`)
 - Bug in :meth:`.DataFrameGroupBy.any` that returned True for groups where all Timedelta values are NaT. (:issue:`59712`)
+- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` would fail when the groups were :class:`Categorical` with an NA value (:issue:`61356`)
 - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`)
 - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`)
 - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3568,6 +3568,7 @@ def _wrap(x, alt_format_):
         elif formatters is None and float_format is not None:
             formatters_ = partial(_wrap, alt_format_=lambda v: v)
         format_index_ = [index_format_, column_format_]
+        format_index_names_ = [index_format_, column_format_]
 
         # Deal with hiding indexes and relabelling column names
         hide_: list[dict] = []
@@ -3616,6 +3617,7 @@ def _wrap(x, alt_format_):
             relabel_index=relabel_index_,
             format={"formatter": formatters_, **base_format_},
             format_index=format_index_,
+            format_index_names=format_index_names_,
             render_kwargs=render_kwargs_,
         )
 
@@ -3628,6 +3630,7 @@ def _to_latex_via_styler(
         relabel_index: dict | list[dict] | None = None,
         format: dict | list[dict] | None = None,
         format_index: dict | list[dict] | None = None,
+        format_index_names: dict | list[dict] | None = None,
         render_kwargs: dict | None = None,
     ):
         """
@@ -3672,7 +3675,13 @@ def _to_latex_via_styler(
         self = cast("DataFrame", self)
         styler = Styler(self, uuid="")
 
-        for kw_name in ["hide", "relabel_index", "format", "format_index"]:
+        for kw_name in [
+            "hide",
+            "relabel_index",
+            "format",
+            "format_index",
+            "format_index_names",
+        ]:
             kw = vars()[kw_name]
             if isinstance(kw, dict):
                 getattr(styler, kw_name)(**kw)
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -12,11 +12,16 @@
 
 import numpy as np
 
+from pandas._libs import (
+    algos as libalgos,
+)
 from pandas._libs.tslibs import OutOfBoundsDatetime
 from pandas.errors import InvalidIndexError
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.common import (
+    ensure_int64,
+    ensure_platform_int,
     is_list_like,
     is_scalar,
 )
@@ -38,7 +43,10 @@
 )
 from pandas.core.series import Series
 
-from pandas.io.formats.printing import pprint_thing
+from pandas.io.formats.printing import (
+    PrettyDict,
+    pprint_thing,
+)
 
 if TYPE_CHECKING:
     from collections.abc import (
@@ -668,8 +676,14 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
     def groups(self) -> dict[Hashable, Index]:
         codes, uniques = self._codes_and_uniques
         uniques = Index._with_infer(uniques, name=self.name)
-        cats = Categorical.from_codes(codes, uniques, validate=False)
-        return self._index.groupby(cats)
+
+        r, counts = libalgos.groupsort_indexer(ensure_platform_int(codes), len(uniques))
+        counts = ensure_int64(counts).cumsum()
+        _result = (r[start:end] for start, end in zip(counts, counts[1:]))
+        # map to the label
+        result = {k: self._index.take(v) for k, v in zip(uniques, _result)}
+
+        return PrettyDict(result)
 
     @property
     def observed_grouping(self) -> Grouping:
diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
@@ -60,13 +60,15 @@ def get_dummies(
     data : array-like, Series, or DataFrame
         Data of which to get dummy indicators.
     prefix : str, list of str, or dict of str, default None
-        String to append DataFrame column names.
+        A string to be prepended to DataFrame column names.
         Pass a list with length equal to the number of columns
         when calling get_dummies on a DataFrame. Alternatively, `prefix`
         can be a dictionary mapping column names to prefixes.
-    prefix_sep : str, default '_'
-        If appending prefix, separator/delimiter to use. Or pass a
-        list or dictionary as with `prefix`.
+    prefix_sep : str, list of str, or dict of str, default '_'
+        Should you choose to prepend DataFrame column names with a prefix, this
+        is the separator/delimiter to use between the two. Alternatively,
+        `prefix_sep` can be a list with length equal to the number of columns,
+        or a dictionary mapping column names to separators.
     dummy_na : bool, default False
         If True, a NaN indicator column will be added even if no NaN values are present.
         If False, NA values are encoded as all zero.
diff --git a/pandas/core/sample.py b/pandas/core/sample.py
@@ -123,7 +123,7 @@ def sample(
     random_state: np.random.RandomState | np.random.Generator,
 ) -> np.ndarray:
     """
-    Randomly sample `size` indices in `np.arange(obj_len)`
+    Randomly sample `size` indices in `np.arange(obj_len)`.
 
     Parameters
     ----------
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -506,6 +506,23 @@ def test_observed_groups(observed):
     tm.assert_dict_equal(result, expected)
 
 
+def test_groups_na_category(dropna, observed):
+    # https://github.com/pandas-dev/pandas/issues/61356
+    df = DataFrame(
+        {"cat": Categorical(["a", np.nan, "a"], categories=list("adb"))},
+        index=list("xyz"),
+    )
+    g = df.groupby("cat", observed=observed, dropna=dropna)
+
+    result = g.groups
+    expected = {"a": Index(["x", "z"])}
+    if not dropna:
+        expected |= {np.nan: Index(["y"])}
+    if not observed:
+        expected |= {"b": Index([]), "d": Index([])}
+    tm.assert_dict_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "keys, expected_values, expected_index_levels",
     [
diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
@@ -824,6 +824,46 @@ def test_to_latex_escape_special_chars(self):
         )
         assert result == expected
 
+    def test_to_latex_escape_special_chars_in_index_names(self):
+        # https://github.com/pandas-dev/pandas/issues/61309
+        # https://github.com/pandas-dev/pandas/issues/57362
+        index = "&%$#_{}}~^\\"
+        df = DataFrame({index: [1, 2, 3]}).set_index(index)
+        result = df.to_latex(escape=True)
+        expected = _dedent(
+            r"""
+            \begin{tabular}{l}
+            \toprule
+            \&\%\$\#\_\{\}\}\textasciitilde \textasciicircum \textbackslash  \\
+            \midrule
+            1 \\
+            2 \\
+            3 \\
+            \bottomrule
+            \end{tabular}
+            """
+        )
+        assert result == expected
+
+    def test_to_latex_escape_special_chars_in_column_name(self):
+        df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})
+        df.columns.name = "_^~"
+        result = df.to_latex(escape=True)
+        expected = _dedent(
+            r"""
+            \begin{tabular}{lrl}
+            \toprule
+            \_\textasciicircum \textasciitilde  & A & B \\
+            \midrule
+            0 & 1 & a \\
+            1 & 2 & b \\
+            2 & 3 & c \\
+            \bottomrule
+            \end{tabular}
+            """
+        )
+        assert result == expected
+
     def test_to_latex_specified_header_special_chars_without_escape(self):
         # GH 7124
         df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
diff --git a/scripts/cibw_before_build_windows.sh b/scripts/cibw_before_build_windows.sh
@@ -8,8 +8,6 @@ done
 FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")"
 if [[ $FREE_THREADED_BUILD == "True" ]]; then
     python -m pip install -U pip
-    # python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
-    # TODO: Remove below and uncomment above once https://github.com/cython/cython/pull/6717 no longer breaks tests
-    python -m pip install git+https://github.com/cython/cython.git@3276b588720a053c78488e5de788605950f4b136
+    python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
     python -m pip install ninja meson-python versioneer[toml] numpy
 fi

-Original file line number
+Diff line change
         run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
       - name: Build wheels
 -        uses: pypa/cibuildwheel@v2.23.2
 +        uses: pypa/cibuildwheel@v2.23.3
         with:
          package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
         env: