From c4e1c186d7fa0cb2800d6a3ab430ab916a7a1310 Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Tue, 24 Jun 2025 14:10:47 -0700 Subject: [PATCH 1/3] slack link update --- doc/source/development/community.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst index 1c698d130ea6c..e139ea0376771 100644 --- a/doc/source/development/community.rst +++ b/doc/source/development/community.rst @@ -114,7 +114,7 @@ people who are hesitant to bring up their questions or ideas on a large public mailing list or GitHub. If this sounds like the right place for you, you are welcome to join using -`this link `_! +`this link `_! Please remember to follow our `Code of Conduct `_, and be aware that our admins are monitoring for irrelevant messages and will remove folks who use our From 921695444d0d53cff1517d8edf904474b6f1c246 Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Fri, 8 Aug 2025 20:06:29 -0700 Subject: [PATCH 2/3] object --- doc/source/user_guide/categorical.rst | 29 ++++++++++++++++++++++ pandas/core/arrays/categorical.py | 13 +++++++--- pandas/tests/extension/test_categorical.py | 25 +++++++++++++++++++ 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 1e7d66dfeb142..51d6fd4a9e3ad 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -1178,3 +1178,32 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categorica This also happens in some cases when you supply a NumPy array instead of a ``Categorical``: using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behavior, while using a string array (e.g. ``np.array(["a","b","c","a"])``) will not. + +.. 
note:: + + When constructing a :class:`pandas.Categorical` from a pandas :class:`Series` or + :class:`Index` with ``dtype='object'``, the dtype of the categories will be + preserved as ``object``. When constructing from a NumPy array + with ``dtype='object'`` or a raw Python sequence, pandas will infer the most + specific dtype for the categories (for example, ``str`` if all elements are strings). + +.. ipython:: python + + pd.options.future.infer_string = True + ser = pd.Series(["foo", "bar", "baz"], dtype="object") + idx = pd.Index(["foo", "bar", "baz"], dtype="object") + arr = np.array(["foo", "bar", "baz"], dtype="object") + pylist = ["foo", "bar", "baz"] + + cat_from_ser = pd.Categorical(ser) + cat_from_idx = pd.Categorical(idx) + cat_from_arr = pd.Categorical(arr) + cat_from_list = pd.Categorical(pylist) + + # Series/Index with object dtype: preserve object dtype + assert cat_from_ser.categories.dtype == "object" + assert cat_from_idx.categories.dtype == "object" + + # Numpy array or list: infer string dtype + assert cat_from_arr.categories.dtype == "str" + assert cat_from_list.categories.dtype == "str" diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d57856115d276..fa550a7f46617 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -457,6 +457,11 @@ def __init__( codes = arr.indices.to_numpy() dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) else: + # Check for pandas Series/Index with object dtype + preserve_object_dtpe = False + if isinstance(values, (ABCSeries, ABCIndex)): + if getattr(values.dtype, "name", None) == "object": + preserve_object_dtpe = True if not isinstance(values, ABCIndex): # in particular RangeIndex xref test_index_equal_range_categories values = sanitize_array(values, None) @@ -465,15 +470,17 @@ def __init__( except TypeError as err: codes, categories = factorize(values, sort=False) if dtype.ordered: - # raise, as we don't have a 
sortable data structure and so - # the user should give us one by specifying categories raise TypeError( "'values' is not ordered, please " "explicitly specify the categories order " "by passing in a categories argument." ) from err - # we're inferring from values + # If we should preserve object dtype, force categories to object dtype + if preserve_object_dtpe: + from pandas import Index + + categories = Index(categories, dtype=object, copy=False) dtype = CategoricalDtype(categories, dtype.ordered) elif isinstance(values.dtype, CategoricalDtype): diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 8f8af607585df..5a519a261b029 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -180,6 +180,31 @@ def test_array_repr(self, data, size): def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) + def test_categorical_preserve_object_dtype_from_pandas(self): + import numpy as np + + import pandas as pd + + pd.options.future.infer_string = True + + ser = pd.Series(["foo", "bar", "baz"], dtype="object") + idx = pd.Index(["foo", "bar", "baz"], dtype="object") + arr = np.array(["foo", "bar", "baz"], dtype="object") + pylist = ["foo", "bar", "baz"] + + cat_from_ser = Categorical(ser) + cat_from_idx = Categorical(idx) + cat_from_arr = Categorical(arr) + cat_from_list = Categorical(pylist) + + # Series/Index with object dtype: preserve object dtype + assert cat_from_ser.categories.dtype == "object" + assert cat_from_idx.categories.dtype == "object" + + # Numpy array or list: infer string dtype + assert cat_from_arr.categories.dtype == "str" + assert cat_from_list.categories.dtype == "str" + class Test2DCompat(base.NDArrayBacked2DTests): def test_repr_2d(self, data): From 8f460acc845be6726582995ca7714c73f764d77f Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Fri, 8 Aug 2025 20:13:29 -0700 
Subject: [PATCH 3/3] whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3191c077d3c36..5501d3fa8b08e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -690,7 +690,7 @@ Categorical - Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`) - Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`) - Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`) -- +- Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``; now the dtype is preserved as ``object`` for these cases, while numpy arrays and Python sequences with ``dtype='object'`` continue to infer the most specific dtype (for example, ``str`` if all elements are strings). Datetimelike ^^^^^^^^^^^^