Skip to content

Commit 9216954

Browse files
committed
object
1 parent 0fb42cc commit 9216954

File tree

3 files changed

+64
-3
lines changed

3 files changed

+64
-3
lines changed

doc/source/user_guide/categorical.rst

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,3 +1178,32 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categorica
11781178
This also happens in some cases when you supply a NumPy array instead of a ``Categorical``:
11791179
using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behavior, while using
11801180
a string array (e.g. ``np.array(["a","b","c","a"])``) will not.
1181+
1182+
.. note::
1183+
1184+
When constructing a :class:`pandas.Categorical` from a pandas :class:`Series` or
1185+
:class:`Index` with ``dtype='object'``, the dtype of the categories will be
1186+
preserved as ``object``. When constructing from a NumPy array
1187+
with ``dtype='object'`` or a raw Python sequence, pandas will infer the most
1188+
specific dtype for the categories (for example, ``str`` if all elements are strings).
1189+
1190+
.. ipython:: python
1191+
1192+
pd.options.future.infer_string = True
1193+
ser = pd.Series(["foo", "bar", "baz"], dtype="object")
1194+
idx = pd.Index(["foo", "bar", "baz"], dtype="object")
1195+
arr = np.array(["foo", "bar", "baz"], dtype="object")
1196+
pylist = ["foo", "bar", "baz"]
1197+
1198+
cat_from_ser = pd.Categorical(ser)
1199+
cat_from_idx = pd.Categorical(idx)
1200+
cat_from_arr = pd.Categorical(arr)
1201+
cat_from_list = pd.Categorical(pylist)
1202+
1203+
# Series/Index with object dtype: preserve object dtype
1204+
assert cat_from_ser.categories.dtype == "object"
1205+
assert cat_from_idx.categories.dtype == "object"
1206+
1207+
# NumPy array or list: infer string dtype
1208+
assert cat_from_arr.categories.dtype == "str"
1209+
assert cat_from_list.categories.dtype == "str"

pandas/core/arrays/categorical.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,11 @@ def __init__(
457457
codes = arr.indices.to_numpy()
458458
dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
459459
else:
460+
# Check for pandas Series/Index with object dtype
461+
preserve_object_dtpe = False
462+
if isinstance(values, (ABCSeries, ABCIndex)):
463+
if getattr(values.dtype, "name", None) == "object":
464+
preserve_object_dtpe = True
460465
if not isinstance(values, ABCIndex):
461466
# in particular RangeIndex xref test_index_equal_range_categories
462467
values = sanitize_array(values, None)
@@ -465,15 +470,17 @@ def __init__(
465470
except TypeError as err:
466471
codes, categories = factorize(values, sort=False)
467472
if dtype.ordered:
468-
# raise, as we don't have a sortable data structure and so
469-
# the user should give us one by specifying categories
470473
raise TypeError(
471474
"'values' is not ordered, please "
472475
"explicitly specify the categories order "
473476
"by passing in a categories argument."
474477
) from err
475478

476-
# we're inferring from values
479+
# If we should preserve object dtype, force categories to object dtype
480+
if preserve_object_dtpe:
481+
from pandas import Index
482+
483+
categories = Index(categories, dtype=object, copy=False)
477484
dtype = CategoricalDtype(categories, dtype.ordered)
478485

479486
elif isinstance(values.dtype, CategoricalDtype):

pandas/tests/extension/test_categorical.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,31 @@ def test_array_repr(self, data, size):
180180
def test_groupby_extension_agg(self, as_index, data_for_grouping):
181181
super().test_groupby_extension_agg(as_index, data_for_grouping)
182182

183+
def test_categorical_preserve_object_dtype_from_pandas(self):
184+
import numpy as np
185+
186+
import pandas as pd
187+
188+
pd.options.future.infer_string = True
189+
190+
ser = pd.Series(["foo", "bar", "baz"], dtype="object")
191+
idx = pd.Index(["foo", "bar", "baz"], dtype="object")
192+
arr = np.array(["foo", "bar", "baz"], dtype="object")
193+
pylist = ["foo", "bar", "baz"]
194+
195+
cat_from_ser = Categorical(ser)
196+
cat_from_idx = Categorical(idx)
197+
cat_from_arr = Categorical(arr)
198+
cat_from_list = Categorical(pylist)
199+
200+
# Series/Index with object dtype: preserve object dtype
201+
assert cat_from_ser.categories.dtype == "object"
202+
assert cat_from_idx.categories.dtype == "object"
203+
204+
# NumPy array or list: infer string dtype
205+
assert cat_from_arr.categories.dtype == "str"
206+
assert cat_from_list.categories.dtype == "str"
207+
183208

184209
class Test2DCompat(base.NDArrayBacked2DTests):
185210
def test_repr_2d(self, data):

0 commit comments

Comments
 (0)