From f7c4694f01735b71a6638f3828ab02f8d17c321a Mon Sep 17 00:00:00 2001 From: Florian Jetter Date: Wed, 30 Jul 2025 13:23:13 +0200 Subject: [PATCH 1/6] Avoid copying categoricals --- pandas/core/arrays/categorical.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e4420d07675ba..d57856115d276 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -575,7 +575,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: # GH 10696/18593/18630 dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self - result = self._set_dtype(dtype) + result = self._set_dtype(dtype, copy=False) elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) @@ -945,7 +945,7 @@ def _set_categories(self, categories, fastpath: bool = False) -> None: super().__init__(self._ndarray, new_dtype) - def _set_dtype(self, dtype: CategoricalDtype) -> Self: + def _set_dtype(self, dtype: CategoricalDtype, copy: bool = True) -> Self: """ Internal method for directly updating the CategoricalDtype @@ -958,7 +958,9 @@ def _set_dtype(self, dtype: CategoricalDtype) -> Self: We don't do any validation here. It's assumed that the dtype is a (valid) instance of `CategoricalDtype`. """ - codes = recode_for_categories(self.codes, self.categories, dtype.categories) + codes = recode_for_categories( + self.codes, self.categories, dtype.categories, copy + ) return type(self)._simple_new(codes, dtype=dtype) def set_ordered(self, value: bool) -> Self: From 9192881b6eb21ca886d90891cf9522c83caaa2d4 Mon Sep 17 00:00:00 2001 From: Florian Jetter Date: Thu, 31 Jul 2025 09:56:15 +0200 Subject: [PATCH 2/6] Add a test --- pandas/tests/arrays/categorical/test_astype.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 00999d491b242..1c7d1a2eec8b7 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -130,6 +130,16 @@ def test_astype_category(self, dtype_ordered, ordered): expected = cat tm.assert_categorical_equal(result, expected) + def test_astype_category_copy_false_nocopy_codes(self): + cat = Categorical([3, 2, 4, 1]) + # ensure that the categories are not copied + assert cat.codes.base is not None + + new = cat.astype("category", copy=False) + assert new.codes.base is cat.codes.base or new.codes is cat.codes + new = cat.astype("category", copy=True) + assert not (new.codes.base is cat.codes.base or new.codes is cat.codes) + def test_astype_object_datetime_categories(self): # GH#40754 cat = Categorical(to_datetime(["2021-03-27", NaT])) From b43e5ba1a885c50c64d343abcb99662230237867 Mon Sep 17 00:00:00 2001 From: Florian Jetter Date: Thu, 31 Jul 2025 09:58:36 +0200 Subject: [PATCH 3/6] add whatsnew entry --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f7b64b03a52fd..3191c077d3c36 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -687,6 +687,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`) +- Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`) - Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`) - Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`) - From b3bc7a40b2be230501334153bb1f2e2073db89f5 Mon Sep 17 00:00:00 2001 From: Florian Jetter Date: Thu, 31 Jul 2025 09:59:50 +0200 Subject: [PATCH 4/6] Add issue as comment --- pandas/tests/arrays/categorical/test_astype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 1c7d1a2eec8b7..e7612d3960c43 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -131,8 +131,8 @@ def test_astype_category(self, dtype_ordered, ordered): tm.assert_categorical_equal(result, expected) def test_astype_category_copy_false_nocopy_codes(self): + # GH#62000 cat = Categorical([3, 2, 4, 1]) - # ensure that the categories are not copied assert cat.codes.base is not None new = cat.astype("category", copy=False) From aff3cd44f24995c71b3a1416f2b10dad45ef70ce Mon Sep 17 00:00:00 2001 From: Florian Jetter Date: Thu, 31 Jul 2025 10:01:56 +0200 Subject: [PATCH 5/6] remove unnecessary assert --- pandas/tests/arrays/categorical/test_astype.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index e7612d3960c43..4c3ae0b8f83c6 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -133,8 +133,6 @@ def test_astype_category(self, dtype_ordered, ordered): def test_astype_category_copy_false_nocopy_codes(self): # GH#62000 cat = Categorical([3, 2, 4, 1]) - assert cat.codes.base is not None - new = cat.astype("category", copy=False) assert new.codes.base is cat.codes.base or new.codes is cat.codes new = cat.astype("category", copy=True) From b9b1d259cbafb1a78757552d76dd515b2ec52922 Mon Sep 17 00:00:00 2001 From: Florian Jetter Date: Fri, 1 Aug 2025 09:29:42 +0200 Subject: [PATCH 6/6] use tm.shares_memory --- pandas/tests/arrays/categorical/test_astype.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 4c3ae0b8f83c6..42edb1f511391 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -134,9 +134,9 @@ def test_astype_category_copy_false_nocopy_codes(self): # GH#62000 cat = Categorical([3, 2, 4, 1]) new = cat.astype("category", copy=False) - assert new.codes.base is cat.codes.base or new.codes is cat.codes + assert tm.shares_memory(new.codes, cat.codes) new = cat.astype("category", copy=True) - assert not (new.codes.base is cat.codes.base or new.codes is cat.codes) + assert not tm.shares_memory(new.codes, cat.codes) def test_astype_object_datetime_categories(self): # GH#40754