diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b94d82f3c9783..a388c80861ca9 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -621,6 +621,7 @@ Other Deprecations - Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`) - Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`) - Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`) +- Deprecated the ``.str`` accessor for ``object`` dtype :class:`Series`; explicitly cast to ``"str"`` dtype before using the accessor instead (:issue:`29710`) - Deprecated the ``arg`` parameter of ``Series.map``; pass the added ``func`` argument instead. (:issue:`61260`) - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 47cf0452bc32c..e3801c298c24d 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -41,7 +41,10 @@ def _dir_additions(self) -> set[str]: """ Add additional __dir__ for this object. """ - return {accessor for accessor in self._accessors if hasattr(self, accessor)} + with warnings.catch_warnings(): + # Don't issue warning about .str accessor on object dtype + warnings.filterwarnings("ignore") + return {accessor for accessor in self._accessors if hasattr(self, accessor)} def __dir__(self) -> list[str]: """ diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 21e6e2efbe778..6dc058428ec21 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -49,6 +49,7 @@ from pandas.core.dtypes.missing import isna from pandas.core.arrays import ExtensionArray +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import NoNewAttributesMixin from pandas.core.construction import extract_array @@ -203,8 +204,6 @@ class StringMethods(NoNewAttributesMixin): # * extractall def __init__(self, data) -> None: - from pandas.core.arrays.string_ import StringDtype - self._inferred_dtype = self._validate(data) self._is_categorical = isinstance(data.dtype, CategoricalDtype) self._is_string = isinstance(data.dtype, StringDtype) @@ -255,6 +254,14 @@ def _validate(data): data = extract_array(data) values = getattr(data, "categories", data) # categorical / normal + if data.dtype == object and get_option("future.infer_string"): + warnings.warn( + # GH#29710 + ".str accessor on object dtype is deprecated. Explicitly cast " + "to 'str' dtype instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) inferred_dtype = lib.infer_dtype(values, skipna=True) @@ -3875,7 +3882,6 @@ def _result_dtype(arr): # workaround #27953 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. - from pandas.core.arrays.string_ import StringDtype if isinstance(arr.dtype, (ArrowDtype, StringDtype)): return arr.dtype diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index cfd5b3ac1f33f..79465240f13e2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5278,11 +5278,16 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd """ # encode if needed if len(data): - data = ( - Series(data.ravel(), copy=False, dtype="object") - .str.encode(encoding, errors) - ._values.reshape(data.shape) + # We can _almost_ do ser.astype("str").str.encode(encoding, errors) + # But the conversion to "str" can fail in e.g. test_to_hdf_errors + ser = Series(data.ravel(), copy=False, dtype="object") + arr = np.asarray(ser) + func = lambda x: x.encode(encoding, errors=errors) + mask = isna(arr) + result = lib.map_infer_mask( + arr, func, mask.view(np.uint8), convert=not np.all(mask) ) + data = result.reshape(data.shape) # create the sized dtype ensured = ensure_object(data.ravel()) @@ -5319,9 +5324,13 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - ser = Series(data, copy=False).str.decode( - encoding, errors=errors, dtype="object" - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", ".str accessor on object dtype is deprecated" + ) + ser = Series(data, copy=False).str.decode( + encoding, errors=errors, dtype="object" + ) data = ser.to_numpy() data.flags.writeable = True else: diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 792af5ff713a3..a216d385ab717 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -19,6 +19,7 @@ from datetime import datetime import sys from typing import TYPE_CHECKING +import warnings import numpy as np @@ -717,7 +718,11 @@ def _chunk_to_dataframe(self) -> DataFrame: elif self._column_types[j] == b"s": rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) if self.convert_text and (self.encoding is not None): - rslt[name] = self._decode_string(rslt[name].str) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", ".str accessor on object dtype is deprecated" + ) + rslt[name] = self._decode_string(rslt[name].str) if infer_string: rslt[name] = rslt[name].astype("str") diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 5b102a567f409..37adc6b0006b8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2744,7 +2744,7 @@ def _encode_strings(self) -> None: types cannot be exported and must first be converted to one of the supported types.""" ) - encoded = self.data[col].str.encode(self._encoding) + encoded = self.data[col].astype("str").str.encode(self._encoding) # If larger than _max_string_length do nothing if ( max_len_string_array(ensure_object(self.data[col]._values)) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b83a09e7f2e18..bf3ff5d272280 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -25,6 +25,7 @@ Generic, TypeVar, ) +import warnings import numpy as np import pytest @@ -134,58 +135,61 @@ def shape(self): # collect all objects to be tested for list-like-ness; use tuples of objects, # whether they are list-like or not (special casing for sets), and their ID -ll_params = [ - ([1], True, "list"), - ([], True, "list-empty"), - ((1,), True, "tuple"), - ((), True, "tuple-empty"), - ({"a": 1}, True, "dict"), - ({}, True, "dict-empty"), - ({"a", 1}, "set", "set"), - (set(), "set", "set-empty"), - (frozenset({"a", 1}), "set", "frozenset"), - (frozenset(), "set", "frozenset-empty"), - (iter([1, 2]), True, "iterator"), - (iter([]), True, "iterator-empty"), - ((x for x in [1, 2]), True, "generator"), - ((_ for _ in []), True, "generator-empty"), - (Series([1]), True, "Series"), - (Series([], dtype=object), True, "Series-empty"), - # Series.str will still raise a TypeError if iterated - (Series(["a"]).str, True, "StringMethods"), - (Series([], dtype="O").str, True, "StringMethods-empty"), - (Index([1]), True, "Index"), - (Index([]), True, "Index-empty"), - (DataFrame([[1]]), True, "DataFrame"), - (DataFrame(), True, "DataFrame-empty"), - (np.ndarray((2,) * 1), True, "ndarray-1d"), - (np.array([]), True, "ndarray-1d-empty"), - (np.ndarray((2,) * 2), True, "ndarray-2d"), - (np.array([[]]), True, "ndarray-2d-empty"), - (np.ndarray((2,) * 3), True, "ndarray-3d"), - (np.array([[[]]]), True, "ndarray-3d-empty"), - (np.ndarray((2,) * 4), True, "ndarray-4d"), - (np.array([[[[]]]]), True, "ndarray-4d-empty"), - (np.array(2), False, "ndarray-0d"), - (MockNumpyLikeArray(np.ndarray((2,) * 1)), True, "duck-ndarray-1d"), - (MockNumpyLikeArray(np.array([])), True, "duck-ndarray-1d-empty"), - (MockNumpyLikeArray(np.ndarray((2,) * 2)), True, "duck-ndarray-2d"), - (MockNumpyLikeArray(np.array([[]])), True, "duck-ndarray-2d-empty"), - (MockNumpyLikeArray(np.ndarray((2,) * 3)), True, "duck-ndarray-3d"), - (MockNumpyLikeArray(np.array([[[]]])), True, "duck-ndarray-3d-empty"), - (MockNumpyLikeArray(np.ndarray((2,) * 4)), True, "duck-ndarray-4d"), - (MockNumpyLikeArray(np.array([[[[]]]])), True, "duck-ndarray-4d-empty"), - (MockNumpyLikeArray(np.array(2)), False, "duck-ndarray-0d"), - (1, False, "int"), - (b"123", False, "bytes"), - (b"", False, "bytes-empty"), - ("123", False, "string"), - ("", False, "string-empty"), - (str, False, "string-type"), - (object(), False, "object"), - (np.nan, False, "NaN"), - (None, False, "None"), -] +with warnings.catch_warnings(): + # suppress warning on "StringMethods-empty" with object dtype + warnings.filterwarnings("ignore", ".str accessor on object dtype is deprecated") + ll_params = [ + ([1], True, "list"), + ([], True, "list-empty"), + ((1,), True, "tuple"), + ((), True, "tuple-empty"), + ({"a": 1}, True, "dict"), + ({}, True, "dict-empty"), + ({"a", 1}, "set", "set"), + (set(), "set", "set-empty"), + (frozenset({"a", 1}), "set", "frozenset"), + (frozenset(), "set", "frozenset-empty"), + (iter([1, 2]), True, "iterator"), + (iter([]), True, "iterator-empty"), + ((x for x in [1, 2]), True, "generator"), + ((_ for _ in []), True, "generator-empty"), + (Series([1]), True, "Series"), + (Series([], dtype=object), True, "Series-empty"), + # Series.str will still raise a TypeError if iterated + (Series(["a"]).str, True, "StringMethods"), + (Series([], dtype="O").str, True, "StringMethods-empty"), + (Index([1]), True, "Index"), + (Index([]), True, "Index-empty"), + (DataFrame([[1]]), True, "DataFrame"), + (DataFrame(), True, "DataFrame-empty"), + (np.ndarray((2,) * 1), True, "ndarray-1d"), + (np.array([]), True, "ndarray-1d-empty"), + (np.ndarray((2,) * 2), True, "ndarray-2d"), + (np.array([[]]), True, "ndarray-2d-empty"), + (np.ndarray((2,) * 3), True, "ndarray-3d"), + (np.array([[[]]]), True, "ndarray-3d-empty"), + (np.ndarray((2,) * 4), True, "ndarray-4d"), + (np.array([[[[]]]]), True, "ndarray-4d-empty"), + (np.array(2), False, "ndarray-0d"), + (MockNumpyLikeArray(np.ndarray((2,) * 1)), True, "duck-ndarray-1d"), + (MockNumpyLikeArray(np.array([])), True, "duck-ndarray-1d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 2)), True, "duck-ndarray-2d"), + (MockNumpyLikeArray(np.array([[]])), True, "duck-ndarray-2d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 3)), True, "duck-ndarray-3d"), + (MockNumpyLikeArray(np.array([[[]]])), True, "duck-ndarray-3d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 4)), True, "duck-ndarray-4d"), + (MockNumpyLikeArray(np.array([[[[]]]])), True, "duck-ndarray-4d-empty"), + (MockNumpyLikeArray(np.array(2)), False, "duck-ndarray-0d"), + (1, False, "int"), + (b"123", False, "bytes"), + (b"", False, "bytes-empty"), + ("123", False, "string"), + ("", False, "string-empty"), + (str, False, "string-type"), + (object(), False, "object"), + (np.nan, False, "NaN"), + (None, False, "None"), + ] objs, expected, ids = zip(*ll_params) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 1050f8154572a..8373d8b6c58fb 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -347,8 +347,12 @@ def test_against_frame_and_seriesgroupby( expected.name = name if as_index: index_frame = expected.index.to_frame(index=False) - index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) - index_frame["education"] = index_frame["both"].str.split("-").str.get(1) + index_frame["gender"] = ( + index_frame["both"].astype(str).str.split("-").map(lambda x: x[0]) + ) + index_frame["education"] = ( + index_frame["both"].astype(str).str.split("-").map(lambda x: x[1]) + ) del index_frame["both"] index_frame2 = index_frame.rename({0: None}, axis=1) expected.index = MultiIndex.from_frame(index_frame2) @@ -360,8 +364,16 @@ def test_against_frame_and_seriesgroupby( expected.index.names = [None] + expected.index.names[1:] tm.assert_series_equal(result, expected) else: - expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) - expected.insert(2, "education", expected["both"].str.split("-").str.get(1)) + expected.insert( + 1, + "gender", + expected["both"].astype(str).str.split("-").map(lambda x: x[0]), + ) + expected.insert( + 2, + "education", + expected["both"].astype(str).str.split("-").map(lambda x: x[1]), + ) if using_infer_string: expected = expected.astype({"gender": "str", "education": "str"}) del expected["both"] diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index a17cd27f8284e..4c8c0fe84d1ea 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -109,6 +109,9 @@ def test_iterator_read_too_much(self, dirpath): tm.assert_frame_equal(d1, d2) +@pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype is deprecated:FutureWarning" +) def test_encoding_options(datapath): fname = datapath("io", "sas", "data", "test1.sas7bdat") df1 = pd.read_sas(fname) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 4b369bb0bc869..c18e542764756 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -160,6 +160,7 @@ def test_attrs(self): result = s + 1 assert result.attrs == {"version": 1} + @pytest.mark.filterwarnings("ignore:.str accessor:FutureWarning") def test_inspect_getmembers(self): # GH38782 ser = Series(dtype=object) diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 4a1b97606db2b..0d95ef85da2d9 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -14,6 +14,10 @@ ) from pandas.core.strings.accessor import StringMethods +pytestmark = pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype:FutureWarning" +) + # subset of the full set from pandas/conftest.py _any_allowed_skipna_inferred_dtype = [ ("string", ["a", np.nan, "c"]), @@ -214,3 +218,11 @@ def test_api_for_categorical(any_string_method, any_string_dtype): else: # str.cat(others=None) returns string, for example assert result == expected + + +def test_object_str_deprecated(): + # GH#29710 + ser = Series(["a", "b", "c"], dtype=object) + msg = ".str accessor on object dtype" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index 819556f961fa3..0ce8f201a832e 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -9,6 +9,10 @@ _testing as tm, ) +pytestmark = pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype:FutureWarning" +) + def test_title(any_string_dtype): s = Series(["FOO", "BAR", np.nan, "Blah", "blurg"], dtype=any_string_dtype) diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index 68ca807bde145..ddc935e79adbf 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -15,6 +15,10 @@ option_context, ) +pytestmark = pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype:FutureWarning" +) + @pytest.fixture def index_or_series2(index_or_series): diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 1dc7dbabe85b0..104c876e9ea2c 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -14,6 +14,10 @@ _testing as tm, ) +pytestmark = pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype:FutureWarning" +) + def test_extract_expand_kwarg_wrong_type_raises(any_string_dtype): # TODO: should this raise TypeError diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index cce96f38d216a..e7aa66a9df191 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -16,6 +16,10 @@ is_object_or_nan_string_dtype, ) +pytestmark = pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype:FutureWarning" +) + # -------------------------------------------------------------------------------------- # str.contains # -------------------------------------------------------------------------------------- @@ -1095,6 +1099,9 @@ def test_translate_mixed_object(): # -------------------------------------------------------------------------------------- +@pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype is deprecated:FutureWarning" +) def test_flags_kwarg(any_string_dtype): data = { "Dave": "dave@google.com", @@ -1121,7 +1128,8 @@ def test_flags_kwarg(any_string_dtype): result = data.str.count(pat, flags=re.IGNORECASE) assert result.iloc[0] == 1 + data_str = data.str msg = "has match groups" with tm.assert_produces_warning(UserWarning, match=msg): - result = data.str.contains(pat, flags=re.IGNORECASE) + result = data_str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0] diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 16e10c6fcdccd..3f2d48418a959 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -11,6 +11,10 @@ _testing as tm, ) +pytestmark = pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype:FutureWarning" +) + def test_get_dummies(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 4fab6e7778002..6daf92efa0afc 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -17,6 +17,10 @@ is_object_or_nan_string_dtype, ) +pytestmark = pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype:FutureWarning" +) + @pytest.mark.parametrize("method", ["split", "rsplit"]) def test_split(any_string_dtype, method): diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index c5414022e664b..82aac7192c923 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -11,6 +11,10 @@ option_context, ) +pytestmark = pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype:FutureWarning" +) + def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index fb3a3b8d60b6b..28056a4b643a1 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -18,6 +18,10 @@ from pandas.core.strings.accessor import StringMethods from pandas.tests.strings import is_object_or_nan_string_dtype +pytestmark = pytest.mark.filterwarnings( + "ignore:.str accessor on object dtype:FutureWarning" +) + @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) def test_startswith_endswith_non_str_patterns(pattern):