-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
BUG: creating Categorical from pandas Index/Series with "object" dtype infers string #62080
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 9 commits
c4e1c18
e1a893d
cfa767f
c0ae870
5188b81
b63a723
0fb42cc
9216954
8f460ac
87a54fe
cddc574
e83e4f9
5ed039a
9b4b2d9
4855994
1b81162
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -457,6 +457,11 @@ def __init__( | |
codes = arr.indices.to_numpy() | ||
dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) | ||
else: | ||
# Check for pandas Series/Index with object dtype | ||
preserve_object_dtpe = False | ||
if isinstance(values, (ABCSeries, ABCIndex)): | ||
if getattr(values.dtype, "name", None) == "object": | ||
preserve_object_dtpe = True | ||
if not isinstance(values, ABCIndex): | ||
# in particular RangeIndex xref test_index_equal_range_categories | ||
values = sanitize_array(values, None) | ||
|
@@ -465,15 +470,17 @@ def __init__( | |
except TypeError as err: | ||
codes, categories = factorize(values, sort=False) | ||
if dtype.ordered: | ||
# raise, as we don't have a sortable data structure and so | ||
# the user should give us one by specifying categories | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this removed? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I felt the comments were redundant, as the TypeError message already explains it clearly; also, new logic is added to detect whether the input values are a pandas Series or Index with "object" dtype and then force the categories to use object dtype. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Although I do not have any strong preference, I am happy to add it back. |
||
raise TypeError( | ||
"'values' is not ordered, please " | ||
"explicitly specify the categories order " | ||
"by passing in a categories argument." | ||
) from err | ||
|
||
# we're inferring from values | ||
# If we should preserve object dtype, force categories to object dtype | ||
|
||
if preserve_object_dtpe: | ||
from pandas import Index | ||
|
||
categories = Index(categories, dtype=object, copy=False) | ||
dtype = CategoricalDtype(categories, dtype.ordered) | ||
|
||
elif isinstance(values.dtype, CategoricalDtype): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -180,6 +180,31 @@ def test_array_repr(self, data, size): | |
def test_groupby_extension_agg(self, as_index, data_for_grouping): | ||
super().test_groupby_extension_agg(as_index, data_for_grouping) | ||
|
||
def test_categorical_preserve_object_dtype_from_pandas(self): | ||
|
||
import numpy as np | ||
|
||
import pandas as pd | ||
|
||
|
||
pd.options.future.infer_string = True | ||
|
||
|
||
ser = pd.Series(["foo", "bar", "baz"], dtype="object") | ||
idx = pd.Index(["foo", "bar", "baz"], dtype="object") | ||
arr = np.array(["foo", "bar", "baz"], dtype="object") | ||
pylist = ["foo", "bar", "baz"] | ||
|
||
cat_from_ser = Categorical(ser) | ||
cat_from_idx = Categorical(idx) | ||
cat_from_arr = Categorical(arr) | ||
cat_from_list = Categorical(pylist) | ||
|
||
# Series/Index with object dtype: preserve object dtype | ||
assert cat_from_ser.categories.dtype == "object" | ||
assert cat_from_idx.categories.dtype == "object" | ||
|
||
# Numpy array or list: infer string dtype | ||
assert cat_from_arr.categories.dtype == "str" | ||
assert cat_from_list.categories.dtype == "str" | ||
|
||
|
||
class Test2DCompat(base.NDArrayBacked2DTests): | ||
def test_repr_2d(self, data): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you can just check values.dtype == object