From 56ae25251f34add594e5c722747b5226256795b2 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 4 Apr 2023 09:26:42 -0600 Subject: [PATCH 01/52] WIP: preliminary support for stringdtype --- asv_bench/asv.conf.json | 41 ++++++++++++++++----------- asv_bench/benchmarks/strings.py | 23 ++++++++++++--- pandas/_libs/lib.pyx | 11 +++++-- pandas/_testing/__init__.py | 4 +-- pandas/core/common.py | 3 +- pandas/core/construction.py | 3 +- pandas/core/dtypes/astype.py | 5 ++-- pandas/core/dtypes/common.py | 41 ++++++++++++++++++++++++++- pandas/core/dtypes/dtypes.py | 1 - pandas/core/dtypes/missing.py | 6 ++++ pandas/core/indexes/base.py | 3 +- pandas/core/internals/blocks.py | 33 +++++++++++++++------ pandas/core/internals/construction.py | 3 +- pandas/core/internals/managers.py | 3 +- pandas/core/strings/accessor.py | 20 +++++++------ pandas/core/strings/object_array.py | 29 +++++++++++++++++-- 16 files changed, 175 insertions(+), 54 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 810764754b7e1..8c6b0faa6d523 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -41,23 +41,30 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). "matrix": { - "Cython": ["0.29.33"], - "matplotlib": [], - "sqlalchemy": [], - "scipy": [], - "numba": [], - "numexpr": [], - "pytables": [null, ""], // platform dependent, see excludes below - "pyarrow": [], - "tables": [null, ""], - "openpyxl": [], - "xlsxwriter": [], - "xlrd": [], - "odfpy": [], - "jinja2": [], - "meson": [], - "meson-python": [], - "python-build": [], + "req": { + "pip+/home/nathan/Documents/numpy": [], + "Cython": ["0.29.33"], + "matplotlib": [], + "sqlalchemy": [], + "scipy": [], + "numba": [], + "numexpr": [], + "pytables": [null, ""], // platform dependent, see excludes below + "pyarrow": [], + "tables": [null, ""], + "openpyxl": [], + "xlsxwriter": [], + "xlrd": [], + "odfpy": [], + "jinja2": [], + "meson": [], + "meson-python": [], + "python-build": [], + "pip+/home/nathan/Documents/numpy-user-dtypes/stringdtype": [] + }, + "env": { + "NUMPY_EXPERIMENTAL_DTYPE_API": "1" + } }, "conda_channels": ["conda-forge"], // Combinations of libraries/python versions can be excluded/included diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 9f1aeb7670628..2770c5060039e 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,6 +1,7 @@ import warnings import numpy as np +from stringdtype import StringDType from pandas import ( NA, @@ -14,12 +15,21 @@ class Dtypes: - params = ["str", "string[python]", "string[pyarrow]"] + params = ["str", "string[python]", "string[pyarrow]", StringDType()] param_names = ["dtype"] + dtype_mapping = { + "str": "str", + "string[python]": object, + "string[pyarrow]": object, + StringDType(): StringDType(), + } def setup(self, dtype): try: - self.s = Series(tm.makeStringIndex(10**5), dtype=dtype) + self.s = Series( + tm.makeStringIndex(10**5, dtype=self.dtype_mapping[dtype]), + dtype=dtype, + ) except ImportError: raise NotImplementedError @@ -27,11 +37,16 @@ def setup(self, dtype): class Construction: params = ( ["series", "frame", "categorical_series"], - ["str", "string[python]", "string[pyarrow]"], + ["str", "string[python]", "string[pyarrow]", StringDType()], ) param_names = ["pd_type", "dtype"] pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series} - dtype_mapping = {"str": "str", "string[python]": object, 
"string[pyarrow]": object} + dtype_mapping = { + "str": "str", + "string[python]": object, + "string[pyarrow]": object, + StringDType(): StringDType(), + } def setup(self, pd_type, dtype): series_arr = tm.rands_array( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc2886e5b531c..a1f36b011494f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1391,9 +1391,9 @@ cdef object _try_infer_map(object dtype): return None -def infer_dtype(value: object, skipna: bool = True) -> str: +def infer_dtype(value: object, skipna: bool = True) -> object: """ - Return a string label of the type of a scalar or list-like of values. + Return the type of a scalar or list-like of values. Parameters ---------- @@ -1403,7 +1403,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: Returns ------- - str + str or dtype object Describing the common type of the input data. Results can include: @@ -1427,6 +1427,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str: - mixed - unknown-array + Returns a dtype object for non-legacy numpy dtypes + Raises ------ TypeError @@ -1529,6 +1531,9 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if inferred is not None: # Anything other than object-dtype should return here. return inferred + elif not getattr(type(values.dtype), "_legacy", True): + if issubclass(values.dtype.type, str): + return values.dtype if values.descr.type_num != NPY_OBJECT: # i.e. values.dtype != np.object_ diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 7908c9df60df8..e0821cef78103 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -356,8 +356,8 @@ def getCols(k) -> str: # make index -def makeStringIndex(k: int = 10, name=None) -> Index: - return Index(rands_array(nchars=10, size=k), name=name) +def makeStringIndex(k: int = 10, name=None, dtype: NpDtype = "O") -> Index: + return Index(rands_array(nchars=10, size=k, dtype=dtype), name=name) def makeCategoricalIndex( diff --git a/pandas/core/common.py b/pandas/core/common.py index ee8fe220698b5..a0d7c78772eb8 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -35,6 +35,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_integer, + is_legacy_string_dtype, ) from pandas.core.dtypes.generic import ( ABCExtensionArray, @@ -243,7 +244,7 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi # has incompatible type "Iterable[Any]"; expected "Sized" return construct_1d_object_array_from_listlike(values) # type: ignore[arg-type] - if issubclass(result.dtype.type, str): + if is_legacy_string_dtype(result.dtype): result = np.asarray(values, dtype=object) if result.ndim == 2: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 9b4d67a20a7cd..6e8bd7858729a 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -43,6 +43,7 @@ maybe_promote, ) from pandas.core.dtypes.common import ( + is_legacy_string_dtype, is_list_like, is_object_dtype, pandas_dtype, @@ -708,7 +709,7 @@ def _sanitize_str_dtypes( # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. 
- if issubclass(result.dtype.type, str): + if is_legacy_string_dtype(result.dtype): # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, result has already the result diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 64df3827d7a3d..5eee410b1061c 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -18,6 +18,7 @@ from pandas.errors import IntCastingNaNError from pandas.core.dtypes.common import ( + is_legacy_string_dtype, is_object_dtype, is_string_dtype, pandas_dtype, @@ -89,7 +90,7 @@ def _astype_nansafe( res = arr.astype(dtype, copy=copy) return np.asarray(res) - if issubclass(dtype.type, str): + if issubclass(dtype.type, str) and is_legacy_string_dtype(dtype): shape = arr.shape if arr.ndim > 1: arr = arr.ravel() @@ -183,7 +184,7 @@ def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> Arra values = _astype_nansafe(values, dtype, copy=copy) # in pandas we don't store numpy str dtypes, so convert to object - if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): + if isinstance(dtype, np.dtype) and is_legacy_string_dtype(values.dtype): values = np.array(values, dtype=object) return values diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2c426187c83e8..b44c23ba6f778 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -515,7 +515,7 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: """ Faster alternative to is_string_dtype, assumes we have a np.dtype object. """ - return dtype == object or dtype.kind in "SU" + return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str) def is_string_dtype(arr_or_dtype) -> bool: @@ -1662,6 +1662,44 @@ def is_all_strings(value: ArrayLike) -> bool: return dtype == "string" +def is_legacy_string_dtype(arr_or_dtype, include_bytes=False) -> bool: + """Check if the dtype is a numpy legacy string dtype + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype to check + + include_bytes : boolean + whether or not to include bytestring dtypes + + Returns + ------- + boolean + True for legacy numpy dtypes that represent python strings, + False otherwise. If include_bytes is True, also true for + legacy bytes dtypes. + + """ + if arr_or_dtype is None: + return False + + dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) + + if not isinstance(dtype, np.dtype): + return False + + # the _legacy attribute was added in Numpy 1.25. If the attribute isn't + # defined on the dtype class, Numpy isn't sufficiently new, so we have to be + # dealing with a legacy dtype. 
+ is_legacy = getattr(type(dtype), "_legacy", True) + if not is_legacy: + return False + if include_bytes: + return issubclass(dtype.type, (str, bytes)) + return issubclass(dtype.type, str) + + __all__ = [ "classes", "DT64NS_DTYPE", @@ -1696,6 +1734,7 @@ def is_all_strings(value: ArrayLike) -> bool: "is_interval", "is_interval_dtype", "is_iterator", + "is_legacy_string_dtype", "is_named_tuple", "is_nested_list_like", "is_number", diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 7fff0f0d2d805..0bf7dd82c5e36 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -275,7 +275,6 @@ def _from_values_or_dtype( >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2) CategoricalDtype(categories=['x', 'y'], ordered=False, categories_dtype=object) """ - if dtype is not None: # The dtype argument takes precedence over values.dtype (if any) if isinstance(dtype, str): diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 65bbdb0e5df92..36bd89edafa88 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -11,6 +11,7 @@ ) import numpy as np +from stringdtype import StringDType from pandas._config import get_option @@ -305,6 +306,11 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo if dtype.kind in ("S", "U"): result = np.zeros(values.shape, dtype=bool) + elif type(dtype) is StringDType: + if inf_as_na: + result = ~np.isfinite(values) + else: + result = np.isnan(values) else: if values.ndim in {1, 2}: result = libmissing.isnaobj(values, inf_as_na=inf_as_na) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7a52630296c27..e649f667fa6a9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -21,6 +21,7 @@ import warnings import numpy as np +from stringdtype import StringDType from pandas._config import get_option @@ -506,7 +507,7 @@ def __new__( if isinstance(data, ABCMultiIndex): data = data._values - if data.dtype.kind not in "iufcbmM": + if data.dtype.kind not in "iufcbmM" and type(data.dtype) != StringDType: # GH#11836 we need to avoid having numpy coerce # things that look like ints/floats to ints unless # they are actually ints, e.g. '0' and 0.0 diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 02f8393eed102..0d51258117bc1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -57,6 +57,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, + is_legacy_string_dtype, is_list_like, is_string_dtype, ) @@ -2317,7 +2318,7 @@ def maybe_coerce_values(values: ArrayLike) -> ArrayLike: if isinstance(values, np.ndarray): values = ensure_wrapped_if_datetimelike(values) - if issubclass(values.dtype.type, str): + if is_legacy_string_dtype(values.dtype): values = np.array(values, dtype=object) if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None: @@ -2347,15 +2348,29 @@ def get_block_type(dtype: DtypeObj) -> type[Block]: # Note: need to be sure PandasArray is unwrapped before we get here return ExtensionBlock - # We use kind checks because it is much more performant - # than is_foo_dtype - kind = dtype.kind - if kind in "Mm": - return DatetimeLikeBlock - elif kind in "fciub": - return NumericBlock + dtype_class = type(dtype) + + # the _is_numeric attribute was added in Numpy 1.25, default to checking + # dtype.kind and finally use ObjectBlock if numpy isn't sufficiently new. 
+ try: + is_numeric = dtype_class._is_numeric + except AttributeError: + # We use kind checks because it is much more performant + # than is_foo_dtype + kind = dtype.kind + if kind in "Mm": + return DatetimeLikeBlock + elif kind in "fciub": + return NumericBlock + else: + return ObjectBlock - return ObjectBlock + if is_numeric: + return NumericBlock + else: + if is_legacy_string_dtype(dtype): + return ObjectBlock + return NumpyBlock def new_block_2d( diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f080683d76df7..8d36ff2cd8cc6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -28,6 +28,7 @@ from pandas.core.dtypes.common import ( is_1d_only_ea_dtype, is_integer_dtype, + is_legacy_string_dtype, is_list_like, is_named_tuple, is_object_dtype, @@ -330,7 +331,7 @@ def ndarray_to_mgr( _check_values_indices_shape_match(values, index, columns) if typ == "array": - if issubclass(values.dtype.type, str): + if is_legacy_string_dtype(values.dtype): values = np.array(values, dtype=object) if dtype is None and is_object_dtype(values.dtype): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 397f9d5b1bbe6..caad9c56659c4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -35,6 +35,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, + is_legacy_string_dtype, is_list_like, ) from pandas.core.dtypes.dtypes import ( @@ -2268,7 +2269,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list if isinstance(dtype, np.dtype): is_dtlike = dtype.kind in "mM" - if issubclass(dtype.type, (str, bytes)): + if is_legacy_string_dtype(dtype, include_bytes=True): dtype = np.dtype(object) values, placement = _stack_arrays(list(tup_block), dtype) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 9ffb0444f1516..6865b553fd386 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -120,12 +120,14 @@ def _forbid_nonstring_types(func: F) -> F: @wraps(func) def wrapper(self, *args, **kwargs): - if self._inferred_dtype not in allowed_types: - msg = ( - f"Cannot use .str.{func_name} with values of " - f"inferred dtype '{self._inferred_dtype}'." - ) - raise TypeError(msg) + dtype = self._inferred_dtype + if dtype not in allowed_types: + if not (isinstance(dtype, np.dtype) and issubclass(dtype.type, str)): + msg = ( + f"Cannot use .str.{func_name} with values of " + f"inferred dtype '{self._inferred_dtype}'." 
+ ) + raise TypeError(msg) return func(self, *args, **kwargs) wrapper.__name__ = func_name @@ -229,9 +231,11 @@ def _validate(data): values = getattr(data, "categories", data) # categorical / normal - inferred_dtype = lib.infer_dtype(values, skipna=True) + inferred_dtype = lib.infer_dtype(values) - if inferred_dtype not in allowed_types: + if inferred_dtype not in allowed_types and not isinstance( + inferred_dtype, np.dtype + ): raise AttributeError("Can only use .str accessor with string values!") return inferred_dtype diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 87cc6e71b8672..a0cbe6f879be5 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -14,13 +14,20 @@ import unicodedata import numpy as np +from stringdtype import StringDType from pandas._libs import lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer_dtype, + is_scalar, +) from pandas.core.dtypes.missing import isna +from pandas.core.arrays.integer import IntegerArray from pandas.core.strings.base import BaseStringArrayMethods if TYPE_CHECKING: @@ -63,6 +70,8 @@ def _str_map( convert : bool, default True Whether to call `maybe_convert_objects` on the resulting ndarray """ + from pandas.arrays import BooleanArray + if dtype is None: dtype = np.dtype("object") if na_value is None: @@ -71,9 +80,12 @@ def _str_map( if not len(self): return np.array([], dtype=dtype) - arr = np.asarray(self, dtype=object) + arr = np.asarray(self) mask = isna(arr) - map_convert = convert and not np.all(mask) + type(arr.dtype) + map_convert = ( + convert and not np.all(mask) and type(arr.dtype) is not StringDType + ) try: result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) except (TypeError, AttributeError) as err: @@ -103,6 +115,18 @@ def g(x): np.putmask(result, mask, na_value) if convert and result.dtype == object: result = lib.maybe_convert_objects(result) + + result = result.astype(dtype) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray] | type[BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + return constructor(result, mask) + return result def _str_count(self, pat, flags: int = 0): @@ -258,6 +282,7 @@ def _str_find_(self, sub, start, end, side): f = lambda x: getattr(x, method)(sub, start) else: f = lambda x: getattr(x, method)(sub, start, end) + return self._str_map(f, dtype="int64") def _str_findall(self, pat, flags: int = 0): From 206d2f061d32fbadc2fb02e07c3ca2f8e755acfe Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 26 Apr 2023 14:37:55 -0600 Subject: [PATCH 02/52] add NumpyStringArray and string[numpy] dtype --- asv_bench/benchmarks/strings.py | 12 ++- pandas/_libs/lib.pyx | 2 +- pandas/core/arrays/__init__.py | 8 +- pandas/core/arrays/string_.py | 139 +++++++++++++++++++--------- pandas/core/construction.py | 1 + pandas/core/dtypes/common.py | 13 +++ pandas/core/dtypes/missing.py | 5 +- pandas/core/indexes/base.py | 3 +- pandas/core/strings/object_array.py | 6 +- 9 files changed, 133 insertions(+), 56 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 2770c5060039e..b8e1aa1e38dc0 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -15,12 +15,19 @@ class Dtypes: - params = ["str", "string[python]", "string[pyarrow]", StringDType()] + params = [ + 
"str", + "string[python]", + "string[pyarrow]", + "string[numpy]", + StringDType(), + ] param_names = ["dtype"] dtype_mapping = { "str": "str", "string[python]": object, "string[pyarrow]": object, + "string[numpy]": StringDType(), StringDType(): StringDType(), } @@ -37,7 +44,7 @@ def setup(self, dtype): class Construction: params = ( ["series", "frame", "categorical_series"], - ["str", "string[python]", "string[pyarrow]", StringDType()], + ["str", "string[python]", "string[pyarrow]", "string[numpy]", StringDType()], ) param_names = ["pd_type", "dtype"] pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series} @@ -45,6 +52,7 @@ class Construction: "str": "str", "string[python]": object, "string[pyarrow]": object, + "string[numpy]": StringDType(), StringDType(): StringDType(), } diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a1f36b011494f..6b4571ad13dad 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1860,7 +1860,7 @@ cdef class StringValidator(Validator): return isinstance(value, str) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.str_) + return issubclass(self.dtype.type, (np.str_, str)) cpdef bint is_string_array(ndarray values, bint skipna=False): diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 79be8760db931..5648df6356260 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -17,7 +17,11 @@ period_array, ) from pandas.core.arrays.sparse import SparseArray -from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.string_ import ( + NumpyStringArray, + ObjectStringArray, + StringArray, +) from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.core.arrays.timedeltas import TimedeltaArray @@ -39,5 +43,7 @@ "period_array", "SparseArray", "StringArray", + "ObjectStringArray", + "NumpyStringArray", "TimedeltaArray", ] diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c9dc20cf93ddd..7c22c6b8437af 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -14,7 +14,10 @@ missing as libmissing, ) from pandas._libs.arrays import NDArrayBacked -from pandas.compat import pa_version_under7p0 +from pandas.compat import ( + is_numpy_dev, + pa_version_under7p0, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -24,6 +27,7 @@ register_extension_dtype, ) from pandas.core.dtypes.common import ( + get_string_dtype, is_array_like, is_bool_dtype, is_integer_dtype, @@ -76,7 +80,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow"}, optional + storage : {"python", "pyarrow", "numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -108,14 +112,17 @@ def na_value(self) -> libmissing.NAType: def __init__(self, storage=None) -> None: if storage is None: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow"}: + if storage not in {"python", "pyarrow", "numpy"}: raise ValueError( - f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + "Storage must be 'python', 'pyarrow', or 'numpy'. " + "Got {storage} instead." ) if storage == "pyarrow" and pa_version_under7p0: raise ImportError( "pyarrow>=7.0.0 is required for PyArrow backed StringArray." 
) + if storage == "numpy" and not is_numpy_dev: + raise ImportError("NumPy backed string storage requires numpy dev") self.storage = storage @property @@ -139,6 +146,7 @@ def construct_from_string(cls, string): ``'string'`` pd.options.mode.string_storage, default python ``'string[python]'`` python ``'string[pyarrow]'`` pyarrow + ``'string[numpy]'`` numpy ========================== ============================================== Returns @@ -160,6 +168,8 @@ def construct_from_string(cls, string): return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") + elif string == "string[numpy]": + return cls(storage="numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -179,9 +189,13 @@ def construct_array_type( # type: ignore[override] from pandas.core.arrays.string_arrow import ArrowStringArray if self.storage == "python": - return StringArray - else: + return ObjectStringArray + elif self.storage == "pyarrow": return ArrowStringArray + elif self.storage == "numpy": + return NumpyStringArray + else: + raise NotImplementedError def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -231,7 +245,7 @@ def tolist(self): # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" -class StringArray(BaseStringArray, PandasArray): # type: ignore[misc] +class BaseNumpyStringArray(BaseStringArray, PandasArray): # type: ignore[misc] """ Extension array for string data. @@ -321,54 +335,23 @@ def __init__(self, values, copy: bool = False) -> None: super().__init__(values, copy=copy) if not isinstance(values, type(self)): self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) + NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage)) def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != "object": + if self._ndarray.dtype != self._cache_dtype: raise ValueError( - "StringArray requires a sequence of strings or pandas.NA. Got " + f"{type(self).__name__} requires a sequence of strings or " + "pandas.NA convertible to a NumPy array with dtype " + f"{self._cache_dtype}. Got " f"'{self._ndarray.dtype}' dtype instead." 
) - # Check to see if need to convert Na values to pd.NA - if self._ndarray.ndim > 2: - # Ravel if ndims > 2 b/c no cythonized version available - lib.convert_nans_to_NA(self._ndarray.ravel("K")) - else: - lib.convert_nans_to_NA(self._ndarray) @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - if dtype and not (isinstance(dtype, str) and dtype == "string"): - dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "python" - - from pandas.core.arrays.masked import BaseMaskedArray - - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA - - else: - if hasattr(scalars, "type"): - # pyarrow array; we cannot rely on the "to_numpy" check in - # ensure_string_array because calling scalars.to_numpy would set - # zero_copy_only to True which caused problems see GH#52076 - scalars = np.array(scalars) - # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) - - # Manually creating new array avoids the validation step in the __init__, so is - # faster. Refactor need for validation? - new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) - - return new_string_array + raise NotImplementedError("_from_sequence must be implemented in subclasses") @classmethod def _from_sequence_of_strings( @@ -612,3 +595,71 @@ def _str_map( # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) + + +class ObjectStringArray(BaseNumpyStringArray): + _cache_dtype = "object" + _storage = "python" + + def _validate(self): + super()._validate() + # Check to see if need to convert Na values to pd.NA + if self._ndarray.ndim > 2: + # Ravel if ndims > 2 b/c no cythonized version available + lib.convert_nans_to_NA(self._ndarray.ravel("K")) + else: + lib.convert_nans_to_NA(self._ndarray) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "python" + + from pandas.core.arrays.masked import BaseMaskedArray + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result[na_values] = libmissing.NA + + else: + if hasattr(scalars, "type"): + # pyarrow array; we cannot rely on the "to_numpy" check in + # ensure_string_array because calling scalars.to_numpy would set + # zero_copy_only to True which caused problems see GH#52076 + scalars = np.array(scalars) + # convert non-na-likes to str, and nan-likes to StringDtype().na_value + result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + + # Manually creating new array avoids the validation step in the __init__, so is + # faster. Refactor need for validation? 
+ new_string_array = cls.__new__(cls) + NDArrayBacked.__init__( + new_string_array, result, StringDtype(storage=cls._storage) + ) + + return new_string_array + + +StringArray = ObjectStringArray + + +class NumpyStringArray(BaseNumpyStringArray): + _cache_dtype = get_string_dtype() + _storage = "numpy" + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + result = np.array(scalars, dtype=cls._cache_dtype) + + # Manually creating new array avoids the validation step in the __init__, so is + # faster. Refactor need for validation? + new_string_array = cls.__new__(cls) + NDArrayBacked.__init__( + new_string_array, result, StringDtype(storage=cls._storage) + ) + + return new_string_array diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 6e8bd7858729a..f7594922ee448 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -536,6 +536,7 @@ def sanitize_array( ------- np.ndarray or ExtensionArray """ + if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b44c23ba6f778..4cdc6cd408fc9 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -17,6 +17,7 @@ Period, algos, lib, + missing, ) from pandas._libs.tslibs import conversion from pandas.util._exceptions import find_stack_level @@ -518,6 +519,18 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str) +def get_string_dtype(): + import os + import sys + + if not os.environ.get("NUMPY_EXPERIMENTAL_DTYPE_API", None) == "1": + sys.exit() + + import stringdtype + + return stringdtype.StringDType(na_object=missing.NA) + + def is_string_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the string dtype. diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 36bd89edafa88..97143bba439f2 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -11,7 +11,6 @@ ) import numpy as np -from stringdtype import StringDType from pandas._config import get_option @@ -26,6 +25,7 @@ DT64NS_DTYPE, TD64NS_DTYPE, ensure_object, + get_string_dtype, is_scalar, is_string_or_object_np_dtype, ) @@ -300,6 +300,9 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False): return result +StringDType = type(get_string_dtype()) + + def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]: # Working around NumPy ticket 1542 dtype = values.dtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e649f667fa6a9..7a52630296c27 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -21,7 +21,6 @@ import warnings import numpy as np -from stringdtype import StringDType from pandas._config import get_option @@ -507,7 +506,7 @@ def __new__( if isinstance(data, ABCMultiIndex): data = data._values - if data.dtype.kind not in "iufcbmM" and type(data.dtype) != StringDType: + if data.dtype.kind not in "iufcbmM": # GH#11836 we need to avoid having numpy coerce # things that look like ints/floats to ints unless # they are actually ints, e.g. 
'0' and 0.0 diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index a0cbe6f879be5..f7b50e9a25a79 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -14,7 +14,6 @@ import unicodedata import numpy as np -from stringdtype import StringDType from pandas._libs import lib import pandas._libs.missing as libmissing @@ -82,10 +81,7 @@ def _str_map( arr = np.asarray(self) mask = isna(arr) - type(arr.dtype) - map_convert = ( - convert and not np.all(mask) and type(arr.dtype) is not StringDType - ) + map_convert = convert and not np.all(mask) try: result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) except (TypeError, AttributeError) as err: From 5adadfa15544a9e9f0f181996b9af988ce663ddf Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Mon, 15 May 2023 10:54:02 -0600 Subject: [PATCH 03/52] WIP: making progress --- pandas/conftest.py | 1 + pandas/core/algorithms.py | 17 +++++++++++++++-- pandas/core/arrays/string_.py | 8 +++++--- pandas/core/dtypes/cast.py | 14 +++++++++++--- pandas/core/dtypes/common.py | 8 ++++++++ pandas/core/dtypes/missing.py | 9 +++++---- pandas/core/internals/blocks.py | 2 +- pandas/core/missing.py | 21 +++++++++++++++------ pandas/core/util/hashing.py | 3 +++ 9 files changed, 64 insertions(+), 19 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 86f0121dd00a9..8cf95dbdfcbd5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1296,6 +1296,7 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + "numpy", ] ) def string_storage(request): diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 37c1fa76fbbcf..5a09b0fecfd2c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1742,11 +1742,24 @@ def map_array( if not len(arr): return arr.copy() + if isinstance(arr.dtype, np.dtype): + ret_dtype = arr.dtype + else: + try: + ret_dtype = arr._ndarray.dtype + except AttributeError: + ret_dtype = None + # we must convert to python types values = arr.astype(object, copy=False) if na_action is None: - return lib.map_infer(values, mapper, convert=convert) + ret = lib.map_infer(values, mapper, convert=convert) else: - return lib.map_infer_mask( + ret = lib.map_infer_mask( values, mapper, mask=isna(values).view(np.uint8), convert=convert ) + + if ret.dtype == object and ret_dtype is not None: + return ret.astype(ret_dtype, copy=False) + + return ret diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7c22c6b8437af..29c6049e42256 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -361,7 +361,7 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=object) + values = np.empty(shape, dtype=cls._cache_dtype) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -381,8 +381,8 @@ def __arrow_array__(self, type=None): def _values_for_factorize(self): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = None - return arr, None + arr[mask] = self._na_value + return arr, self._na_value def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) @@ -599,6 +599,7 @@ def _str_map( class ObjectStringArray(BaseNumpyStringArray): _cache_dtype = "object" + _na_value = None _storage = "python" def _validate(self): @@ -649,6 +650,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool 
= Fal class NumpyStringArray(BaseNumpyStringArray): _cache_dtype = get_string_dtype() + _na_value = libmissing.NA _storage = "numpy" @classmethod diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 08a5f9c79274b..16c5c32d1823e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -48,10 +48,12 @@ ensure_int64, ensure_object, ensure_str, + get_string_dtype, is_bool, is_complex, is_float, is_integer, + is_legacy_string_dtype, is_object_dtype, is_scalar, is_string_dtype, @@ -72,6 +74,7 @@ ) from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import ( + dtype_supports_na, is_valid_na_for_dtype, isna, na_value_for_dtype, @@ -593,6 +596,9 @@ def _maybe_promote_cached(dtype, fill_value, fill_value_type): return _maybe_promote(dtype, fill_value) +StringDType = type(get_string_dtype()) + + def _maybe_promote(dtype: np.dtype, fill_value=np.nan): # The actual implementation of the function, use `maybe_promote` above for # a cached version. @@ -606,7 +612,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): dtype = _dtype_obj return dtype, fill_value - if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in "iufcmM": + if is_valid_na_for_dtype(fill_value, dtype) and dtype_supports_na(dtype): dtype = ensure_dtype_can_hold_na(dtype) fv = na_value_for_dtype(dtype) return dtype, fv @@ -694,11 +700,13 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): # e.g. mst is np.complex128 and dtype is np.complex64 dtype = mst + elif is_string_dtype(dtype) and not is_legacy_string_dtype(dtype): + pass else: dtype = np.dtype(np.object_) # in case we have a string that looked like a number - if issubclass(dtype.type, (bytes, str)): + if is_legacy_string_dtype(dtype): dtype = np.dtype(np.object_) fill_value = _ensure_dtype_type(fill_value, dtype) @@ -1383,7 +1391,7 @@ def find_common_type(types): if t.kind in "iufc": return np.dtype("object") - return np.find_common_type(types, []) + return np.result_type(*types) def construct_2d_arraylike_from_scalar( diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4cdc6cd408fc9..079d628bd1676 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1037,6 +1037,13 @@ def is_numeric_v_string_like(a: ArrayLike, b) -> bool: ) +StringDType = type(get_string_dtype()) + + +def needs_object_conversion(dtype: DtypeObj | None) -> bool: + return type(dtype) is StringDType + + def needs_i8_conversion(dtype: DtypeObj | None) -> bool: """ Check whether the dtype should be converted to int64. 
@@ -1766,6 +1773,7 @@ def is_legacy_string_dtype(arr_or_dtype, include_bytes=False) -> bool: "is_timedelta64_ns_dtype", "is_unsigned_integer_dtype", "needs_i8_conversion", + "needs_object_conversion", "pandas_dtype", "TD64NS_DTYPE", "validate_all_hashable", diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 97143bba439f2..1afdbf7af2c7b 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -310,10 +310,7 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo if dtype.kind in ("S", "U"): result = np.zeros(values.shape, dtype=bool) elif type(dtype) is StringDType: - if inf_as_na: - result = ~np.isfinite(values) - else: - result = np.isnan(values) + result = np.isnan(values) else: if values.ndim in {1, 2}: result = libmissing.isnaobj(values, inf_as_na=inf_as_na) @@ -681,6 +678,10 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): return np.nan +def dtype_supports_na(dtype: np.dtype): + return dtype.kind in "iufcmM" or type(dtype) is StringDType + + def remove_na_arraylike(arr: Series | Index | np.ndarray): """ Return array-like containing only true/non-NaN values, possibly empty. diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 0d51258117bc1..18d133d2b3e0d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2368,7 +2368,7 @@ def get_block_type(dtype: DtypeObj) -> type[Block]: if is_numeric: return NumericBlock else: - if is_legacy_string_dtype(dtype): + if dtype_class == _dtype_obj or is_legacy_string_dtype(dtype): return ObjectBlock return NumpyBlock diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 7762ba8e2c730..c11e93d3e903c 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -36,6 +36,7 @@ is_numeric_v_string_like, is_object_dtype, needs_i8_conversion, + needs_object_conversion, ) from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, @@ -871,9 +872,9 @@ def _fillna_prep( return mask -def _datetimelike_compat(func: F) -> F: +def _no_buffer_protocol_compat(func: F) -> F: """ - Wrapper to handle datetime64 and timedelta64 dtypes. 
+ Wrapper to handle dtypes that don't support the buffer protocol """ @wraps(func) @@ -885,13 +886,21 @@ def new_func(values, limit: int | None = None, mask=None): result, mask = func(values.view("i8"), limit=limit, mask=mask) return result.view(values.dtype), mask + if needs_object_conversion(values.dtype): + if mask is None: + # This needs to occur before casting to int64 + mask = isna(values) + # ugly hack, no way to do this in-place so we copy to object dtype + result, mask = func(values.astype("object"), limit=limit, mask=mask) + values[:] = result.astype(values.dtype)[:] + return values, mask return func(values, limit=limit, mask=mask) return cast(F, new_func) -@_datetimelike_compat +@_no_buffer_protocol_compat def _pad_1d( values: np.ndarray, limit: int | None = None, @@ -902,7 +911,7 @@ def _pad_1d( return values, mask -@_datetimelike_compat +@_no_buffer_protocol_compat def _backfill_1d( values: np.ndarray, limit: int | None = None, @@ -913,7 +922,7 @@ def _backfill_1d( return values, mask -@_datetimelike_compat +@_no_buffer_protocol_compat def _pad_2d( values: np.ndarray, limit: int | None = None, @@ -929,7 +938,7 @@ def _pad_2d( return values, mask -@_datetimelike_compat +@_no_buffer_protocol_compat def _backfill_2d( values, limit: int | None = None, mask: npt.NDArray[np.bool_] | None = None ): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 88dbee0808533..e547a5667bd5f 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -313,6 +313,9 @@ def _hash_ndarray( vals = hash_object_array( vals.astype(str).astype(object), hash_key, encoding ) + except ValueError: + # the dtype doesn't support the buffer protocol (e.g. StringDType) + vals = hash_object_array(vals.astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 From a1175f2514171c7194a57c90e7ac827046f3779b Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 19 May 2023 13:10:29 -0600 Subject: [PATCH 04/52] fix factorize --- pandas/core/arrays/string_.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 29c6049e42256..02c2d21c1e742 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -378,12 +378,6 @@ def __arrow_array__(self, type=None): values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) - def _values_for_factorize(self): - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = self._na_value - return arr, self._na_value - def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): @@ -611,6 +605,12 @@ def _validate(self): else: lib.convert_nans_to_NA(self._ndarray) + def _values_for_factorize(self): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = None + return arr, None + @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): if dtype and not (isinstance(dtype, str) and dtype == "string"): @@ -665,3 +665,11 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal ) return new_string_array + + def _values_for_factorize(self): + arr = self._ndarray.astype(get_string_dtype(na_object=None)) + return arr, None + + @classmethod + def _from_factorized(cls, values, original): + return original._from_backing_data(values.astype(original._ndarray.dtype)) From 7426cd549ad01ec50c37a17ff0eca2743b9f7b86 Mon 
Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 31 May 2023 10:49:19 -0600 Subject: [PATCH 05/52] adapt to new PandasStringDType and circular dependency on pandas --- pandas/core/arrays/string_.py | 47 +++++++++++++++++++++++------------ pandas/core/dtypes/cast.py | 4 --- pandas/core/dtypes/common.py | 8 ++---- pandas/core/dtypes/missing.py | 7 ++---- 4 files changed, 35 insertions(+), 31 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 02c2d21c1e742..efc7051781e00 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -337,18 +337,6 @@ def __init__(self, values, copy: bool = False) -> None: self._validate() NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage)) - def _validate(self): - """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != self._cache_dtype: - raise ValueError( - f"{type(self).__name__} requires a sequence of strings or " - "pandas.NA convertible to a NumPy array with dtype " - f"{self._cache_dtype}. Got " - f"'{self._ndarray.dtype}' dtype instead." - ) - @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): raise NotImplementedError("_from_sequence must be implemented in subclasses") @@ -361,7 +349,9 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=cls._cache_dtype) + from stringdtype import PandasStringDType + + values = np.empty(shape, dtype=PandasStringDType) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -417,6 +407,17 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def _validate(self): + """Validate that we only store NA or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError("StringArray requires a sequence of strings or pandas.NA") + if self._ndarray.dtype != "object": + raise ValueError( + f"{type(self).__name__} requires a sequence of strings or " + "pandas.NA convertible to a NumPy array with dtype " + f"'object'. Got '{self._ndarray.dtype}' dtype instead." + ) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) @@ -592,7 +593,6 @@ def _str_map( class ObjectStringArray(BaseNumpyStringArray): - _cache_dtype = "object" _na_value = None _storage = "python" @@ -649,13 +649,14 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal class NumpyStringArray(BaseNumpyStringArray): - _cache_dtype = get_string_dtype() _na_value = libmissing.NA _storage = "numpy" @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - result = np.array(scalars, dtype=cls._cache_dtype) + from stringdtype import PandasStringDType + + result = np.array(scalars, dtype=PandasStringDType) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? 
@@ -673,3 +674,17 @@ def _values_for_factorize(self): @classmethod def _from_factorized(cls, values, original): return original._from_backing_data(values.astype(original._ndarray.dtype)) + + def _validate(self): + """Validate that we only store NA or strings.""" + from stringdtype import PandasStringDType + + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError("StringArray requires a sequence of strings or pandas.NA") + if type(self._ndarray.dtype) != PandasStringDType: + raise ValueError( + f"{type(self).__name__} requires a sequence of strings or " + "pandas.NA convertible to a NumPy array with dtype " + f"{PandasStringDType()}. Got " + f"'{self._ndarray.dtype}' dtype instead." + ) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 86dcebed7432e..323c74184e4c7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -48,7 +48,6 @@ ensure_int64, ensure_object, ensure_str, - get_string_dtype, is_bool, is_complex, is_float, @@ -597,9 +596,6 @@ def _maybe_promote_cached(dtype, fill_value, fill_value_type): return _maybe_promote(dtype, fill_value) -StringDType = type(get_string_dtype()) - - def _maybe_promote(dtype: np.dtype, fill_value=np.nan): # The actual implementation of the function, use `maybe_promote` above for # a cached version. diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 255df66be1143..448513fca4248 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -17,7 +17,6 @@ Period, algos, lib, - missing, ) from pandas._libs.tslibs import conversion from pandas.util._exceptions import find_stack_level @@ -528,7 +527,7 @@ def get_string_dtype(): import stringdtype - return stringdtype.StringDType(na_object=missing.NA) + return stringdtype.PandasStringDType() def is_string_dtype(arr_or_dtype) -> bool: @@ -1037,11 +1036,8 @@ def is_numeric_v_string_like(a: ArrayLike, b) -> bool: ) -StringDType = type(get_string_dtype()) - - def needs_object_conversion(dtype: DtypeObj | None) -> bool: - return type(dtype) is StringDType + return isinstance(dtype, type(get_string_dtype())) def needs_i8_conversion(dtype: DtypeObj | None) -> bool: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 701a4e82ef15f..4f888e5f57fdb 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -301,16 +301,13 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False): return result -StringDType = type(get_string_dtype()) - - def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]: # Working around NumPy ticket 1542 dtype = values.dtype if dtype.kind in ("S", "U"): result = np.zeros(values.shape, dtype=bool) - elif type(dtype) is StringDType: + elif isinstance(dtype, type(get_string_dtype())): result = np.isnan(values) else: if values.ndim in {1, 2}: @@ -683,7 +680,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): def dtype_supports_na(dtype: np.dtype): - return dtype.kind in "iufcmM" or type(dtype) is StringDType + return dtype.kind in "iufcmM" or isinstance(dtype, type(get_string_dtype())) def remove_na_arraylike(arr: Series | Index | np.ndarray): From 8e59bba54694e359d678bf15d04765947e050d3b Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 7 Jun 2023 14:34:48 -0600 Subject: [PATCH 06/52] fix more tests --- pandas/core/arrays/string_.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/string_.py 
b/pandas/core/arrays/string_.py index efc7051781e00..5dbd72181bbcf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -27,7 +27,6 @@ register_extension_dtype, ) from pandas.core.dtypes.common import ( - get_string_dtype, is_array_like, is_bool_dtype, is_integer_dtype, @@ -351,7 +350,7 @@ def _from_sequence_of_strings( def _empty(cls, shape, dtype) -> StringArray: from stringdtype import PandasStringDType - values = np.empty(shape, dtype=PandasStringDType) + values = np.empty(shape, dtype=PandasStringDType()) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -596,6 +595,12 @@ class ObjectStringArray(BaseNumpyStringArray): _na_value = None _storage = "python" + @classmethod + def _empty(cls, shape, dtype) -> StringArray: + values = np.empty(shape, dtype=object) + values[:] = libmissing.NA + return cls(values).astype(dtype, copy=False) + def _validate(self): super()._validate() # Check to see if need to convert Na values to pd.NA @@ -668,12 +673,22 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal return new_string_array def _values_for_factorize(self): - arr = self._ndarray.astype(get_string_dtype(na_object=None)) - return arr, None + arr = self._ndarray.copy() + # sentinel value used by StringHashTable + arr[np.isnan(arr)] = "__nan__" + return arr, "__nan__" @classmethod def _from_factorized(cls, values, original): - return original._from_backing_data(values.astype(original._ndarray.dtype)) + values[values == "__nan__"] = libmissing.NA + return original._from_backing_data(values) + + @classmethod + def _empty(cls, shape, dtype) -> StringArray: + from stringdtype import PandasStringDType + + values = np.empty(shape, dtype=PandasStringDType()) + return cls(values).astype(dtype, copy=False) def _validate(self): """Validate that we only store NA or strings.""" @@ -688,3 +703,10 @@ def _validate(self): f"{PandasStringDType()}. Got " f"'{self._ndarray.dtype}' dtype instead." 
) + + def _validate_setitem_value(self, value): + from stringdtype import PandasStringDType + + if value is np.nan: + value = np.array(libmissing.NA, dtype=PandasStringDType()) + return value From 64f85d3e5f84bd8e6068fea1f0a0e2072fe8bdaf Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 14 Jun 2023 09:12:32 -0600 Subject: [PATCH 07/52] fix remaining ExtensionArray tests --- pandas/arrays/__init__.py | 2 ++ pandas/compat/numpy/__init__.py | 2 +- pandas/core/arrays/numpy_.py | 7 ++----- pandas/core/arrays/string_.py | 8 ++++++++ pandas/core/nanops.py | 9 ++++++++- pandas/tests/arrays/string_/test_string.py | 16 ++++++++++++---- 6 files changed, 33 insertions(+), 11 deletions(-) diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 3a8e80a6b5d2b..449f72451d0bd 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -12,6 +12,7 @@ FloatingArray, IntegerArray, IntervalArray, + ObjectStringArray, PandasArray, PeriodArray, SparseArray, @@ -32,5 +33,6 @@ "PeriodArray", "SparseArray", "StringArray", + "ObjectStringArray", "TimedeltaArray", ] diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 97c434d8f35d0..02d21143af2dd 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -9,7 +9,7 @@ np_version_under1p22 = _nlv < Version("1.22") np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") -is_numpy_dev = _nlv.dev is not None +is_numpy_dev = _nlv.dev is not None or _nlv.is_prerelease _min_numpy_ver = "1.21.6" np_percentile_argname = "interpolation" if np_version_under1p22 else "method" diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 702180b5d779a..08a171ac46ee8 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -409,13 +409,10 @@ def to_numpy( na_value: object = lib.no_default, ) -> np.ndarray: mask = self.isna() + result = np.asarray(self._ndarray, dtype=dtype) if na_value is not lib.no_default and mask.any(): - result = self._ndarray.copy() + result = result.copy() result[mask] = na_value - else: - result = self._ndarray - - result = np.asarray(result, dtype=dtype) if copy and result is self._ndarray: result = result.copy() diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 53c6ab30a6df4..80fe70c9e9af9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -706,3 +706,11 @@ def _validate_setitem_value(self, value): if value is np.nan: value = np.array(libmissing.NA, dtype=PandasStringDType()) return value + + def _validate_scalar(self, fill_value): + fill_value = super()._validate_scalar(fill_value) + if fill_value is np.nan: + fill_value = self.dtype.na_value + if not isinstance(fill_value, str) and fill_value is not self.dtype.na_value: + raise ValueError("StringArray requires a sequence of strings or pandas.NA") + return fill_value diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 59520350e0dc1..b5410a88ad334 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -152,8 +152,14 @@ def f( def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: + from stringdtype import PandasStringDType + # Bottleneck chokes on datetime64, PeriodDtype (or and EA) - if dtype != object and not needs_i8_conversion(dtype): + if ( + dtype != object + and dtype != PandasStringDType() + and not needs_i8_conversion(dtype) + ): # GH 42878 # Bottleneck uses naive summation leading to O(n) loss of precision # unlike numpy which implements 
pairwise summation, which has O(log(n)) loss @@ -998,6 +1004,7 @@ def nanvar( # observations. # # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count if axis is not None: avg = np.expand_dims(avg, axis) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5ca95bd00f136..a89a58c7d0cdb 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -35,7 +35,12 @@ def test_repr(dtype): expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + elif dtype.storage == "python": + arr_name = "ObjectStringArray" + elif dtype.storage == "numpy": + arr_name = "NumpyStringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected @@ -49,14 +54,16 @@ def test_none_to_nan(cls): def test_setitem_validates(cls): arr = cls._from_sequence(["a", "b"]) - if cls is pd.arrays.StringArray: + is_string = issubclass(cls, pd.core.arrays.string_.BaseNumpyStringArray) + + if is_string: msg = "Cannot set non-string value '10' into a StringArray." else: msg = "Scalar must be NA or str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if cls is pd.arrays.StringArray: + if is_string: msg = "Must provide strings." else: msg = "Scalar must be NA or str" @@ -574,7 +581,8 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if type(ser.array) is pd.arrays.StringArray: + + if isinstance(ser.array, pd.core.arrays.string_.BaseNumpyStringArray): msg = "Cannot set non-string value" else: msg = "Scalar must be NA or str" From 1654f8b13f4bd458a6f83295a5555edf23c61b46 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 16 Jun 2023 09:22:34 -0600 Subject: [PATCH 08/52] deal with stringdtype not coercing NaN and None to NA --- pandas/_libs/missing.pxd | 2 +- pandas/_libs/missing.pyi | 11 +++++++- pandas/_libs/missing.pyx | 15 ++++++++--- pandas/core/arrays/string_.py | 31 ++++++++++++++++++---- pandas/core/dtypes/missing.py | 7 ++++- pandas/tests/arrays/string_/test_string.py | 16 ++++++++--- 6 files changed, 67 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index 5920649519442..760c0c8cc51e6 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -8,7 +8,7 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=*) cpdef bint check_na_tuples_nonequal(object left, object right) cpdef bint checknull(object val, bint inf_as_na=*) -cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=*) +cpdef object isnaobj(ndarray arr, bint inf_as_na=*, bint check_for_any_na=*) cdef bint is_null_datetime64(v) cdef bint is_null_timedelta64(v) diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index d5c9f1342a089..3b9afa49678da 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -1,3 +1,5 @@ +from typing import overload + import numpy as np from numpy import typing as npt @@ -12,6 +14,13 @@ def is_matching_na( def isposinf_scalar(val: object) -> bool: ... def isneginf_scalar(val: object) -> bool: ... def checknull(val: object, inf_as_na: bool = ...) -> bool: ... 
-def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ... +@overload +def isnaobj( + arr: np.ndarray, inf_as_na: bool = ..., check_for_any_na=... +) -> npt.NDArray[np.bool_]: ... +@overload +def isnaobj( + arr: np.ndarray, inf_as_na: bool = ..., check_for_any_na=True +) -> tuple[npt.NDArray[np.bool_], bool]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index e3e7d8daa03e1..9f2fafa57aba6 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -186,7 +186,8 @@ cdef bint is_decimal_na(object val): @cython.wraparound(False) @cython.boundscheck(False) -cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False): +cpdef object isnaobj(ndarray arr, bint inf_as_na=False, + bint check_for_any_na=False): """ Return boolean mask denoting which elements of a 1-D array are na-like, according to the criteria defined in `checknull`: @@ -201,15 +202,19 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False): Parameters ---------- arr : ndarray - + inf_as_na : boolean + Treat inf as NA-like + check_for_any_na : boolean + If true, the return value of this function Returns ------- - result : ndarray (dtype=np.bool_) + result : ndarray (dtype=np.bool_) or tuple of boolean ndarray and a bool """ cdef: Py_ssize_t i, n = arr.size object val bint is_null + bint any_na = 0 ndarray result = np.empty((arr).shape, dtype=np.uint8) flatiter it = cnp.PyArray_IterNew(arr) flatiter it2 = cnp.PyArray_IterNew(result) @@ -222,7 +227,11 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False): is_null = checknull(val, inf_as_na=inf_as_na) # Dereference pointer (set value) ((cnp.PyArray_ITER_DATA(it2)))[0] = is_null + if not any_na and is_null: + any_na = 1 cnp.PyArray_ITER_NEXT(it2) + if check_for_any_na: + return (result.view(np.bool_), bool(any_na)) return result.view(np.bool_) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8231901caf0dc..e4ecac0fc52ef 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -359,7 +359,7 @@ def __arrow_array__(self, type=None): if type is None: type = pa.string() - values = self._ndarray.copy() + values = self._ndarray.astype("object").copy() values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) @@ -511,18 +511,28 @@ def _cmp_method(self, other, op): f"Lengths of operands do not match: {len(self)} != {len(other)}" ) - other = np.asarray(other) + other = np.asarray(other, dtype=self._ndarray.dtype) other = other[valid] if op.__name__ in ops.ARITHMETIC_BINOPS: - result = np.empty_like(self._ndarray, dtype="object") + result = np.empty_like(self._ndarray) result[mask] = libmissing.NA result[valid] = op(self._ndarray[valid], other) - return StringArray(result) + return type(self)(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") - result[valid] = op(self._ndarray[valid], other) + try: + result[valid] = op(self._ndarray[valid], other) + except np.core._exceptions._UFuncNoLoopError: + if hasattr(other, "_ndarray"): + other_type = other._ndarray.dtype + else: + other_type = type(other) + raise TypeError( + f"'{op.__name__}' operator not supported between " + f"'{self._ndarray.dtype}' and '{other_type}'" + ) return BooleanArray(result, mask) _arith_method = _cmp_method @@ -653,12 +663,23 @@ class NumpyStringArray(BaseNumpyStringArray): _na_value = libmissing.NA 
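# A hedged, pure-Python sketch of the semantics the new ``check_for_any_na``
# flag adds to ``isnaobj`` (the real implementation is the Cython loop in
# missing.pyx above); the function name and NA checks here are illustrative
# only and cover just the common sentinels handled by ``checknull``.
import numpy as np
import pandas as pd

def isnaobj_sketch(arr: np.ndarray, check_for_any_na: bool = False):
    # elementwise NA check over an object array (None, pd.NA, NaN/NaT-like)
    mask = np.fromiter(
        (x is None or x is pd.NA or x != x for x in arr.ravel()),
        dtype=bool,
        count=arr.size,
    ).reshape(arr.shape)
    if check_for_any_na:
        # also report whether any NA was seen, saving callers a second pass
        return mask, bool(mask.any())
    return mask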
_storage = "numpy" + def __init__(self, values, copy: bool = False) -> None: + from stringdtype import PandasStringDType + + values = np.asarray(values, dtype=PandasStringDType()) + super().__init__(values, copy=copy) + @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): from stringdtype import PandasStringDType + na_mask, any_na = libmissing.isnaobj(np.asarray(scalars), check_for_any_na=True) + result = np.array(scalars, dtype=PandasStringDType) + if any_na: + result[na_mask] = libmissing.NA + # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 4f888e5f57fdb..a6d040d878331 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -529,6 +529,11 @@ def array_equivalent( # or `in ("O", "S", "U")` return _array_equivalent_object(left, right, strict_nan) + if is_string_or_object_np_dtype(left.dtype) or is_string_or_object_np_dtype( + right.dtype + ): + return _array_equivalent_object(left, right, strict_nan) + # NaNs can occur in float and complex arrays. if left.dtype.kind in "fc": if not (left.size and right.size): @@ -676,7 +681,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): if compat: return False return np.nan - return np.nan + return getattr(dtype, "na_object", np.nan) def dtype_supports_na(dtype: np.dtype): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a89a58c7d0cdb..2c6ed8d488c66 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -4,6 +4,7 @@ """ import numpy as np import pytest +from stringdtype import PandasStringDType import pandas.util._test_decorators as td @@ -228,7 +229,6 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): if op_name not in ["__eq__", "__ne__"]: with pytest.raises(TypeError, match="not supported between"): getattr(a, op_name)(other) - return result = getattr(a, op_name)(other) @@ -258,7 +258,7 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): - if cls is pd.arrays.StringArray: + if issubclass(cls, pd.core.arrays.string_.BaseNumpyStringArray): msg = "StringArray requires a sequence of strings or pandas.NA" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -537,7 +537,11 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", pd.NA, "b"], dtype=object) + if dtype.storage == "numpy": + res_dtype = PandasStringDType() + else: + res_dtype = object + expected = np.array(["a", pd.NA, "b"], dtype=res_dtype) tm.assert_numpy_array_equal(result, expected) @@ -545,7 +549,11 @@ def test_to_numpy_na_value(dtype, nulls_fixture): na_value = nulls_fixture arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = arr.to_numpy(na_value=na_value) - expected = np.array(["a", na_value, "b"], dtype=object) + if dtype.storage == "numpy": + res_dtype = PandasStringDType() + else: + res_dtype = object + expected = np.array(["a", na_value, "b"], dtype=res_dtype) tm.assert_numpy_array_equal(result, expected) From 87e2d148da4302426dc357ebb0c5a0a1364045fd Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 11 Jul 2023 16:15:07 -0600 Subject: [PATCH 09/52] adapt to stringdtype getting 
rid of PandasStringDType --- pandas/core/arrays/string_.py | 27 +++++++--------------- pandas/core/dtypes/common.py | 3 ++- pandas/core/nanops.py | 5 ++-- pandas/tests/arrays/string_/test_string.py | 18 +++++++++++---- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e4ecac0fc52ef..2adfd726d14fb 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -27,6 +27,7 @@ register_extension_dtype, ) from pandas.core.dtypes.common import ( + get_string_dtype, is_array_like, is_bool_dtype, is_integer_dtype, @@ -344,9 +345,7 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - from stringdtype import PandasStringDType - - values = np.empty(shape, dtype=PandasStringDType()) + values = np.empty(shape, dtype=get_string_dtype()) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -664,18 +663,14 @@ class NumpyStringArray(BaseNumpyStringArray): _storage = "numpy" def __init__(self, values, copy: bool = False) -> None: - from stringdtype import PandasStringDType - - values = np.asarray(values, dtype=PandasStringDType()) + values = np.asarray(values, dtype=get_string_dtype()) super().__init__(values, copy=copy) @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - from stringdtype import PandasStringDType - na_mask, any_na = libmissing.isnaobj(np.asarray(scalars), check_for_any_na=True) - result = np.array(scalars, dtype=PandasStringDType) + result = np.array(scalars, dtype=get_string_dtype()) if any_na: result[na_mask] = libmissing.NA @@ -702,30 +697,24 @@ def _from_factorized(cls, values, original): @classmethod def _empty(cls, shape, dtype) -> StringArray: - from stringdtype import PandasStringDType - - values = np.empty(shape, dtype=PandasStringDType()) + values = np.empty(shape, dtype=get_string_dtype()) return cls(values).astype(dtype, copy=False) def _validate(self): """Validate that we only store NA or strings.""" - from stringdtype import PandasStringDType - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if type(self._ndarray.dtype) != PandasStringDType: + if self._ndarray.dtype != get_string_dtype(): raise ValueError( f"{type(self).__name__} requires a sequence of strings or " "pandas.NA convertible to a NumPy array with dtype " - f"{PandasStringDType()}. Got " + f"{get_string_dtype()}. Got " f"'{self._ndarray.dtype}' dtype instead." 
) def _validate_setitem_value(self, value): - from stringdtype import PandasStringDType - if value is np.nan: - value = np.array(libmissing.NA, dtype=PandasStringDType()) + value = np.array(libmissing.NA, dtype=get_string_dtype()) return value def _validate_scalar(self, fill_value): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 448513fca4248..f45cf655474da 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -17,6 +17,7 @@ Period, algos, lib, + missing as libmissing, ) from pandas._libs.tslibs import conversion from pandas.util._exceptions import find_stack_level @@ -527,7 +528,7 @@ def get_string_dtype(): import stringdtype - return stringdtype.PandasStringDType() + return stringdtype.StringDType(na_object=libmissing.NA) def is_string_dtype(arr_or_dtype) -> bool: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index b5410a88ad334..99da112d51a48 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -18,6 +18,7 @@ NaTType, iNaT, lib, + missing as libmissing, ) from pandas._typing import ( ArrayLike, @@ -152,12 +153,12 @@ def f( def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: - from stringdtype import PandasStringDType + from stringdtype import StringDType # Bottleneck chokes on datetime64, PeriodDtype (or and EA) if ( dtype != object - and dtype != PandasStringDType() + and dtype != StringDType(na_object=libmissing.NA) and not needs_i8_conversion(dtype) ): # GH 42878 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 2c6ed8d488c66..ffb3da3da5b09 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -4,7 +4,7 @@ """ import numpy as np import pytest -from stringdtype import PandasStringDType +from stringdtype import StringDType import pandas.util._test_decorators as td @@ -324,8 +324,16 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number" - with pytest.raises(TypeError, match=msg): + if dtype.storage == "numpy": + msg = "Arrays with missing data cannot be converted to integers" + exception = ValueError + else: + msg = ( + r"int\(\) argument must be a string, a bytes-like object or a( real)? 
" + "number" + ) + exception = TypeError + with pytest.raises(exception, match=msg): arr.astype("int64") @@ -538,7 +546,7 @@ def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) if dtype.storage == "numpy": - res_dtype = PandasStringDType() + res_dtype = StringDType(na_object=pd.NA) else: res_dtype = object expected = np.array(["a", pd.NA, "b"], dtype=res_dtype) @@ -550,7 +558,7 @@ def test_to_numpy_na_value(dtype, nulls_fixture): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = arr.to_numpy(na_value=na_value) if dtype.storage == "numpy": - res_dtype = PandasStringDType() + res_dtype = StringDType(na_object=pd.NA) else: res_dtype = object expected = np.array(["a", na_value, "b"], dtype=res_dtype) From 0f0589e656a52b1e9928593a93eae0ff4e49110c Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 1 Aug 2023 10:52:57 -0600 Subject: [PATCH 10/52] support latest version of stringdtype --- pandas/core/arrays/string_.py | 7 ++++++- pandas/core/dtypes/common.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7a5a62290b1cd..114050caffc85 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -663,7 +663,12 @@ class NumpyStringArray(BaseNumpyStringArray): _storage = "numpy" def __init__(self, values, copy: bool = False) -> None: - values = np.asarray(values, dtype=get_string_dtype()) + try: + values = np.asarray(values, dtype=get_string_dtype()) + except (TypeError, ValueError): + raise ValueError("StringArray requires a sequence of strings or pandas.NA") + if values.size == 0: + raise ValueError("StringArray requires a sequence of strings or pandas.NA") super().__init__(values, copy=copy) @classmethod diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index f46131061e721..772c1d78d9386 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -528,7 +528,7 @@ def get_string_dtype(): import stringdtype - return stringdtype.StringDType(na_object=libmissing.NA) + return stringdtype.StringDType(na_object=libmissing.NA, coerce=False) def is_string_dtype(arr_or_dtype) -> bool: From 41ab89474d4bb32387a34c51d0caea6de2be7396 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Thu, 10 Aug 2023 12:19:32 -0600 Subject: [PATCH 11/52] adapt to changes in pandas and stringdtype --- pandas/core/arrays/string_.py | 33 +++++++++++++++------- pandas/core/config_init.py | 2 +- pandas/core/dtypes/common.py | 4 +-- pandas/tests/arrays/string_/test_string.py | 8 +++--- 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a71ce4a339e53..6b50d34bf0945 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -423,7 +423,7 @@ def astype(self, dtype, copy: bool = True): elif isinstance(dtype, IntegerDtype): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = 0 + arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) elif isinstance(dtype, FloatingDtype): @@ -438,7 +438,7 @@ def astype(self, dtype, copy: bool = True): elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = 0 + arr[mask] = "0" values = arr.astype(dtype) values[mask] = np.nan return values @@ -510,8 +510,8 @@ def _cmp_method(self, other, op): f"Lengths of operands do not match: {len(self)} != {len(other)}" ) - other = 
np.asarray(other, dtype=self._ndarray.dtype) - other = other[valid] + other = np.asarray(other) + other = other[valid].astype(self._ndarray.dtype) if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray) @@ -673,12 +673,15 @@ def __init__(self, values, copy: bool = False) -> None: @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - na_mask, any_na = libmissing.isnaobj(np.asarray(scalars), check_for_any_na=True) - - result = np.array(scalars, dtype=get_string_dtype()) - - if any_na: - result[na_mask] = libmissing.NA + arr = np.asarray(scalars) + if is_object_dtype(arr.dtype): + result = np.empty(arr.shape, dtype=get_string_dtype()) + na_mask, any_na = libmissing.isnaobj(arr, check_for_any_na=True) + result[~na_mask] = arr[~na_mask] + if any_na: + result[na_mask] = libmissing.NA + else: + result = np.array(arr, dtype=get_string_dtype()) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? @@ -729,3 +732,13 @@ def _validate_scalar(self, fill_value): if not isinstance(fill_value, str) and fill_value is not self.dtype.na_value: raise ValueError("StringArray requires a sequence of strings or pandas.NA") return fill_value + + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + if dtype is None and na_value is not lib.no_default: + dtype = get_string_dtype(na_object=na_value) + return super().to_numpy(dtype, copy, na_value) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 27e9bf8958ab0..c73a270c65171 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -500,7 +500,7 @@ def use_inf_as_na_cb(key) -> None: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_one_of_factory(["python", "pyarrow", "numpy"]), ) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c49ca8214abfe..37aa082508bde 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -519,7 +519,7 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str) -def get_string_dtype(): +def get_string_dtype(na_object=libmissing.NA, coerce=False): import os import sys @@ -528,7 +528,7 @@ def get_string_dtype(): import stringdtype - return stringdtype.StringDType(na_object=libmissing.NA, coerce=False) + return stringdtype.StringDType(na_object=na_object, coerce=coerce) def is_string_dtype(arr_or_dtype) -> bool: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 1128f35b4196e..7b387f6f9bfa5 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -542,7 +542,7 @@ def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) if dtype.storage == "numpy": - res_dtype = StringDType(na_object=pd.NA) + res_dtype = StringDType(na_object=pd.NA, coerce=False) else: res_dtype = object expected = np.array(["a", pd.NA, "b"], dtype=res_dtype) @@ -551,12 +551,12 @@ def test_to_numpy_returns_pdna_default(dtype): def test_to_numpy_na_value(dtype, nulls_fixture): na_value = nulls_fixture - arr = pd.array(["a", pd.NA, "b"], dtype=dtype) - result = arr.to_numpy(na_value=na_value) if dtype.storage == "numpy": - res_dtype = StringDType(na_object=pd.NA) + 
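# A minimal sketch of the dtype instance ``get_string_dtype`` returns at this
# point in the series, assuming the out-of-tree ``stringdtype`` prototype is
# installed (later patches switch to numpy.dtypes.StringDType).
import numpy as np
import pandas as pd
from stringdtype import StringDType

dt = StringDType(na_object=pd.NA, coerce=False)
arr = np.array(["a", pd.NA, "b"], dtype=dt)
arr[1] is pd.NA  # the configured sentinel round-trips on element access
# np.array(["a", 1], dtype=dt) would raise, because coerce=False disables
# silent str() coercion of non-string entries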
res_dtype = StringDType(na_object=na_value, coerce=False) else: res_dtype = object + arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + result = arr.to_numpy(na_value=na_value) expected = np.array(["a", na_value, "b"], dtype=res_dtype) tm.assert_numpy_array_equal(result, expected) From 43b3ce7259a6ffaff0791c0edaba58399b4ca691 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 29 Aug 2023 15:27:04 -0600 Subject: [PATCH 12/52] avoid copy when loading numpy string data --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6b50d34bf0945..1d3440316216f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -681,7 +681,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal if any_na: result[na_mask] = libmissing.NA else: - result = np.array(arr, dtype=get_string_dtype()) + result = arr.astype(get_string_dtype(), copy=False) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? From 7e5ea63910c54234ddae9852d1e35c23ef75b022 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 20 Feb 2024 14:00:39 -0700 Subject: [PATCH 13/52] update to work with stringdtype in numpy --- asv_bench/benchmarks/strings.py | 2 +- meson.build | 4 +- pandas/_libs/lib.pyx | 15 +++-- pandas/core/algorithms.py | 2 + pandas/core/arrays/string_.py | 67 +++++----------------- pandas/core/common.py | 3 +- pandas/core/construction.py | 3 +- pandas/core/dtypes/astype.py | 7 +-- pandas/core/dtypes/cast.py | 8 +-- pandas/core/dtypes/common.py | 66 +++++---------------- pandas/core/dtypes/missing.py | 9 +-- pandas/core/internals/blocks.py | 3 +- pandas/core/internals/construction.py | 1 - pandas/core/internals/managers.py | 3 +- pandas/core/nanops.py | 4 +- pandas/tests/arrays/string_/test_string.py | 5 +- 16 files changed, 59 insertions(+), 143 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 8bfcf471a9e84..c4fdaf61dc55b 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,7 +1,7 @@ import warnings import numpy as np -from stringdtype import StringDType +from numpy.dtypes import StringDType from pandas import ( NA, diff --git a/meson.build b/meson.build index 06623a305ab54..7831b43833d38 100644 --- a/meson.build +++ b/meson.build @@ -24,8 +24,8 @@ add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'cpp') # Allow supporting older numpys than the version compiled against # Set the define to the min supported version of numpy for pandas # e.g. right now this is targeting numpy 1.21+ -add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c') -add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'cpp') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_2_0_API_VERSION', language : 'c') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_2_0_API_VERSION', language : 'cpp') if fs.exists('_version_meson.py') diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 73ab27a52d8b7..d620961ce302b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1441,9 +1441,9 @@ cdef object _try_infer_map(object dtype): return None -def infer_dtype(value: object, skipna: bool = True) -> object: +def infer_dtype(value: object, skipna: bool = True) -> str: """ - Return the type of a scalar or list-like of values. 
+ Return a string label of the type of a scalar or list-like of values. Parameters ---------- @@ -1453,7 +1453,7 @@ def infer_dtype(value: object, skipna: bool = True) -> object: Returns ------- - str or dtype object + str Describing the common type of the input data. Results can include: @@ -1581,9 +1581,9 @@ def infer_dtype(value: object, skipna: bool = True) -> object: if inferred is not None: # Anything other than object-dtype should return here. return inferred - elif not getattr(type(values.dtype), "_legacy", True): - if issubclass(values.dtype.type, str): - return values.dtype + elif values.dtype.kind == "T": + # NumPy StringDType + return values.dtype if values.descr.type_num != NPY_OBJECT: # i.e. values.dtype != np.object_ @@ -1910,6 +1910,9 @@ cdef class StringValidator(Validator): return isinstance(value, str) cdef bint is_array_typed(self) except -1: + if self.dtype.kind in "TU": + return True + # this lets user-defined string DTypes through return issubclass(self.dtype.type, (np.str_, str)) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1c84f5a6d8480..de26cde8c2822 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1703,6 +1703,7 @@ def map_array( if isinstance(arr.dtype, np.dtype): ret_dtype = arr.dtype else: + # NJG TODO: simplify this try: ret_dtype = arr._ndarray.dtype except AttributeError: @@ -1717,6 +1718,7 @@ def map_array( values, mapper, mask=isna(values).view(np.uint8)) if ret.dtype == object and ret_dtype is not None: + # cast from object back to StringDType return ret.astype(ret_dtype, copy=False) return ret diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f098f6e5402cc..a6f5e08af62f4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -30,7 +30,7 @@ register_extension_dtype, ) from pandas.core.dtypes.common import ( - get_string_dtype, + get_numpy_string_dtype_instance, is_array_like, is_bool_dtype, is_integer_dtype, @@ -387,45 +387,6 @@ def _validate(self) -> None: "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." ) - # Check to see if need to convert Na values to pd.NA - if self._ndarray.ndim > 2: - # Ravel if ndims > 2 b/c no cythonized version available - lib.convert_nans_to_NA(self._ndarray.ravel("K")) - else: - lib.convert_nans_to_NA(self._ndarray) - - @classmethod - def _from_sequence( - cls, scalars, *, dtype: Dtype | None = None, copy: bool = False - ) -> Self: - if dtype and not (isinstance(dtype, str) and dtype == "string"): - dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "python" - - from pandas.core.arrays.masked import BaseMaskedArray - - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA - - else: - if lib.is_pyarrow_array(scalars): - # pyarrow array; we cannot rely on the "to_numpy" check in - # ensure_string_array because calling scalars.to_numpy would set - # zero_copy_only to True which caused problems see GH#52076 - scalars = np.array(scalars) - # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) - - # Manually creating new array avoids the validation step in the __init__, so is - # faster. Refactor need for validation? 
- new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) - - return new_string_array @classmethod def _from_sequence_of_strings( @@ -435,7 +396,7 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=get_string_dtype()) + values = np.empty(shape, dtype=get_numpy_string_dtype_instance()) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -720,7 +681,9 @@ def _values_for_factorize(self): return arr, None @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) assert isinstance(dtype, StringDtype) and dtype.storage == "python" @@ -746,9 +709,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) - NDArrayBacked.__init__( - new_string_array, result, StringDtype(storage=cls._storage) - ) + NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) return new_string_array @@ -762,7 +723,7 @@ class NumpyStringArray(BaseNumpyStringArray): def __init__(self, values, copy: bool = False) -> None: try: - values = np.asarray(values, dtype=get_string_dtype()) + values = np.asarray(values, dtype=get_numpy_string_dtype_instance()) except (TypeError, ValueError): raise ValueError("StringArray requires a sequence of strings or pandas.NA") if values.size == 0: @@ -773,13 +734,13 @@ def __init__(self, values, copy: bool = False) -> None: def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): arr = np.asarray(scalars) if is_object_dtype(arr.dtype): - result = np.empty(arr.shape, dtype=get_string_dtype()) + result = np.empty(arr.shape, dtype=get_numpy_string_dtype_instance()) na_mask, any_na = libmissing.isnaobj(arr, check_for_any_na=True) result[~na_mask] = arr[~na_mask] if any_na: result[na_mask] = libmissing.NA else: - result = arr.astype(get_string_dtype(), copy=False) + result = arr.astype(get_numpy_string_dtype_instance(), copy=False) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? @@ -803,24 +764,24 @@ def _from_factorized(cls, values, original): @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=get_string_dtype()) + values = np.empty(shape, dtype=get_numpy_string_dtype_instance()) return cls(values).astype(dtype, copy=False) def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != get_string_dtype(): + if self._ndarray.dtype != get_numpy_string_dtype_instance(): raise ValueError( f"{type(self).__name__} requires a sequence of strings or " "pandas.NA convertible to a NumPy array with dtype " - f"{get_string_dtype()}. Got " + f"{get_numpy_string_dtype_instance()}. Got " f"'{self._ndarray.dtype}' dtype instead." 
) def _validate_setitem_value(self, value): if value is np.nan: - value = np.array(libmissing.NA, dtype=get_string_dtype()) + value = np.array(libmissing.NA, dtype=get_numpy_string_dtype_instance()) return value def _validate_scalar(self, fill_value): @@ -838,5 +799,5 @@ def to_numpy( na_value: object = lib.no_default, ) -> np.ndarray: if dtype is None and na_value is not lib.no_default: - dtype = get_string_dtype(na_object=na_value) + dtype = get_numpy_string_dtype_instance(na_object=na_value) return super().to_numpy(dtype, copy, na_value) diff --git a/pandas/core/common.py b/pandas/core/common.py index 08f908ebc44b3..bd078a3a6ccd9 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -39,7 +39,6 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_integer, - is_legacy_string_dtype, ) from pandas.core.dtypes.generic import ( ABCExtensionArray, @@ -256,7 +255,7 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi # has incompatible type "Iterable[Any]"; expected "Sized" return construct_1d_object_array_from_listlike(values) # type: ignore[arg-type] - if is_legacy_string_dtype(result.dtype): + if result.dtype.kind == "U": result = np.asarray(values, dtype=object) if result.ndim == 2: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c777e8578a2fb..29b27af4e180d 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -47,7 +47,6 @@ maybe_promote, ) from pandas.core.dtypes.common import ( - is_legacy_string_dtype, is_list_like, is_object_dtype, is_string_dtype, @@ -739,7 +738,7 @@ def _sanitize_str_dtypes( # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. - if is_legacy_string_dtype(result.dtype): + if result.dtype.kind == "U": # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, result has already the result diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index bd3bd7b067630..9a6cc44cdd101 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -18,7 +18,6 @@ from pandas.errors import IntCastingNaNError from pandas.core.dtypes.common import ( - is_legacy_string_dtype, is_object_dtype, is_string_dtype, pandas_dtype, @@ -90,7 +89,7 @@ def _astype_nansafe( res = arr.astype(dtype, copy=copy) return np.asarray(res) - if issubclass(dtype.type, str) and is_legacy_string_dtype(dtype): + if dtype.kind == "U": shape = arr.shape if arr.ndim > 1: arr = arr.ravel() @@ -182,8 +181,8 @@ def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> Arra else: values = _astype_nansafe(values, dtype, copy=copy) - # in pandas we don't store numpy str dtypes, so convert to object - if isinstance(dtype, np.dtype) and is_legacy_string_dtype(values.dtype): + # in pandas we don't store the numpy.str_ dtype, so convert to object + if isinstance(dtype, np.dtype) and values.dtype.kind == "U": values = np.array(values, dtype=object) return values diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0e3ae59d4f4d9..05a1be3b866ba 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -56,7 +56,6 @@ is_complex, is_float, is_integer, - is_legacy_string_dtype, is_object_dtype, is_scalar, is_string_dtype, @@ -79,7 +78,6 @@ ) from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import ( - dtype_supports_na, is_valid_na_for_dtype, isna, na_value_for_dtype, @@ -628,7 +626,7 @@ def _maybe_promote(dtype: 
np.dtype, fill_value=np.nan): dtype = _dtype_obj return dtype, fill_value - if is_valid_na_for_dtype(fill_value, dtype) and dtype_supports_na(dtype): + if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in "iufcmMT": dtype = ensure_dtype_can_hold_na(dtype) fv = na_value_for_dtype(dtype) return dtype, fv @@ -727,13 +725,13 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): # e.g. mst is np.complex128 and dtype is np.complex64 dtype = mst - elif is_string_dtype(dtype) and not is_legacy_string_dtype(dtype): + elif is_string_dtype(dtype) and dtype.kind == "T": pass else: dtype = np.dtype(np.object_) # in case we have a string that looked like a number - if is_legacy_string_dtype(dtype): + if dtype.kind == "U": dtype = np.dtype(np.object_) fill_value = _ensure_dtype_type(fill_value, dtype) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 846ddb647a7b6..590934eba299b 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -515,20 +515,23 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: """ Faster alternative to is_string_dtype, assumes we have a np.dtype object. """ - return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str) + return dtype == object or dtype.kind in "SUT" +def get_numpy_string_dtype_instance(na_object=libmissing.NA, coerce=False): + """Get a reference to a ``numpy.dtypes.StringDType`` instance. -def get_string_dtype(na_object=libmissing.NA, coerce=False): - import os - import sys - - if not os.environ.get("NUMPY_EXPERIMENTAL_DTYPE_API", None) == "1": - sys.exit() - - import stringdtype - - return stringdtype.StringDType(na_object=na_object, coerce=coerce) + This is a convenience wrapper around the StringDType initializer + with convenient defaults chosen for use with Pandas. + Parameters + ---------- + na_object : object + A missing data sentinel object. + coerce : bool + Whether or not non-strings entries in arrays should be converted + to strings. + """ + return np.dtypes.StringDType(na_object=na_object, coerce=coerce) def is_string_dtype(arr_or_dtype) -> bool: """ @@ -1039,7 +1042,7 @@ def is_numeric_v_string_like(a: ArrayLike, b) -> bool: def needs_object_conversion(dtype: DtypeObj | None) -> bool: - return isinstance(dtype, type(get_string_dtype())) + return isinstance(dtype, type(get_numpy_string_dtype_instance())) def needs_i8_conversion(dtype: DtypeObj | None) -> bool: @@ -1695,44 +1698,6 @@ def is_all_strings(value: ArrayLike) -> bool: return dtype == "string" -def is_legacy_string_dtype(arr_or_dtype, include_bytes=False) -> bool: - """Check if the dtype is a numpy legacy string dtype - - Parameters - ---------- - arr_or_dtype : array-like or dtype - The array-like or dtype to check - - include_bytes : boolean - whether or not to include bytestring dtypes - - Returns - ------- - boolean - True for legacy numpy dtypes that represent python strings, - False otherwise. If include_bytes is True, also true for - legacy bytes dtypes. - - """ - if arr_or_dtype is None: - return False - - dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) - - if not isinstance(dtype, np.dtype): - return False - - # the _legacy attribute was added in Numpy 1.25. If the attribute isn't - # defined on the dtype class, Numpy isn't sufficiently new, so we have to be - # dealing with a legacy dtype. 
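# A hedged sketch of the NumPy 2.x dtype that ``get_numpy_string_dtype_instance``
# wraps, and the properties the surrounding kind checks rely on; using pd.NA as
# the sentinel matches the helper's default.
import numpy as np
import pandas as pd
from numpy.dtypes import StringDType

dt = StringDType(na_object=pd.NA, coerce=False)
arr = np.array(["a", pd.NA, "b"], dtype=dt)

arr.dtype.kind   # "T" -- the kind used above to branch on variable-width strings
arr[1] is pd.NA  # missing entries come back as the configured na_object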
- is_legacy = getattr(type(dtype), "_legacy", True) - if not is_legacy: - return False - if include_bytes: - return issubclass(dtype.type, (str, bytes)) - return issubclass(dtype.type, str) - - __all__ = [ "classes", "DT64NS_DTYPE", @@ -1766,7 +1731,6 @@ def is_legacy_string_dtype(arr_or_dtype, include_bytes=False) -> bool: "is_integer_dtype", "is_interval_dtype", "is_iterator", - "is_legacy_string_dtype", "is_named_tuple", "is_nested_list_like", "is_number", diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 872e800c61a24..e4e9b9f21d38b 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -26,7 +26,7 @@ DT64NS_DTYPE, TD64NS_DTYPE, ensure_object, - get_string_dtype, + get_numpy_string_dtype_instance, is_scalar, is_string_or_object_np_dtype, ) @@ -311,7 +311,8 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo if dtype.kind in ("S", "U"): result = np.zeros(values.shape, dtype=bool) - elif isinstance(dtype, type(get_string_dtype())): + elif dtype.kind == "T": + # StringDType's isnan loop checks for null strings result = np.isnan(values) else: if values.ndim in {1, 2}: @@ -722,10 +723,6 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): return getattr(dtype, "na_object", np.nan) -def dtype_supports_na(dtype: np.dtype): - return dtype.kind in "iufcmM" or isinstance(dtype, type(get_string_dtype())) - - def remove_na_arraylike(arr: Series | Index | np.ndarray): """ Return array-like containing only true/non-NaN values, possibly empty. diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e0d655f4101e2..0fd23bade7dc5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -60,7 +60,6 @@ is_1d_only_ea_dtype, is_float_dtype, is_integer_dtype, - is_legacy_string_dtype, is_list_like, is_scalar, is_string_dtype, @@ -2214,7 +2213,7 @@ def maybe_coerce_values(values: ArrayLike) -> ArrayLike: if isinstance(values, np.ndarray): values = ensure_wrapped_if_datetimelike(values) - if is_legacy_string_dtype(values.dtype): + if values.dtype.kind == "U": values = np.array(values, dtype=object) if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index faae32310d2e9..047c25f4931a6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -28,7 +28,6 @@ from pandas.core.dtypes.common import ( is_1d_only_ea_dtype, is_integer_dtype, - is_legacy_string_dtype, is_list_like, is_named_tuple, is_object_dtype, diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7973e50d0d760..18737e69f779e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -44,7 +44,6 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, - is_legacy_string_dtype, is_list_like, ) from pandas.core.dtypes.dtypes import ( @@ -2364,7 +2363,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list if isinstance(dtype, np.dtype): is_dtlike = dtype.kind in "mM" - if is_legacy_string_dtype(dtype, include_bytes=True): + if dtype.kind in "SU": dtype = np.dtype(object) values, placement = _stack_arrays(list(tup_block), dtype) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4327048ea774a..a7b5346ac14ae 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -153,12 +153,10 @@ def f( def 
_bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: - from stringdtype import StringDType - # Bottleneck chokes on datetime64, PeriodDtype (or and EA) if ( dtype != object - and dtype != StringDType(na_object=libmissing.NA) + and dtype != np.dtypes.StringDType(na_object=libmissing.NA) and not needs_i8_conversion(dtype) ): # GH 42878 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b9a6a7f397874..9d09c33a31c65 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -6,7 +6,6 @@ import numpy as np import pytest -from stringdtype import StringDType from pandas.compat.pyarrow import pa_version_under12p0 @@ -651,7 +650,7 @@ def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) if dtype.storage == "numpy": - res_dtype = StringDType(na_object=pd.NA, coerce=False) + res_dtype = np.dtypes.StringDType(na_object=pd.NA, coerce=False) else: res_dtype = object expected = np.array(["a", na_val(dtype), "b"], dtype=res_dtype) @@ -661,7 +660,7 @@ def test_to_numpy_returns_pdna_default(dtype): def test_to_numpy_na_value(dtype, nulls_fixture): na_value = nulls_fixture if dtype.storage == "numpy": - res_dtype = StringDType(na_object=na_value, coerce=False) + res_dtype = np.dtypes.StringDType(na_object=na_value, coerce=False) else: res_dtype = object arr = pd.array(["a", pd.NA, "b"], dtype=dtype) From 65abaa6cae78efbcdf4ba9421086f3910314e3a3 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Mon, 11 Mar 2024 13:36:37 -0600 Subject: [PATCH 14/52] some fixes for numpy support --- pandas/_libs/lib.pyx | 4 ++-- pandas/core/arrays/string_.py | 2 +- pandas/tests/arrays/string_/test_string.py | 23 +++++++++++++--------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index ee4a374a20984..bd3a984162753 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1910,10 +1910,10 @@ cdef class StringValidator(Validator): return isinstance(value, str) cdef bint is_array_typed(self) except -1: - if self.dtype.kind in "TU": + if self.dtype.char == "T" or self.dtype.char == "U": return True # this lets user-defined string DTypes through - return issubclass(self.dtype.type, (np.str_, str)) + return issubclass(self.dtype.typeobj, (np.str_, str)) cpdef bint is_string_array(ndarray values, bint skipna=False): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a6f5e08af62f4..cd6d6e5ba404b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -214,7 +214,7 @@ def construct_array_type( # type: ignore[override] return ArrowStringArray elif self.storage == "numpy": return NumpyStringArray - elif self.storage == "pyarrow-numpy": + elif self.storage == "pyarrow_numpy": return ArrowStringArrayNumpySemantics else: raise NotImplementedError diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3331b3ce0bbd6..e64f85c47e0cc 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -52,15 +52,20 @@ def test_repr(dtype): expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - if dtype.storage == "pyarrow": - arr_name = "ArrowStringArray" - elif dtype.storage == "python": - arr_name = "ObjectStringArray" - elif dtype.storage == "numpy": - arr_name = "NumpyStringArray" - elif dtype.storage == "pyarrow_numpy": - arr_name = 
"ArrowStringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" + arr_names = { + 'pyarrow': 'ArrowStringArray', + 'python': 'ObjectStringArray', + 'numpy': 'NumpyStringArray', + 'pyarrow_numpy': "ArrowStringArrayNumpySemantics" + } + + if dtype.storage == "pyarrow_numpy": + na_name = "nan" + else: + na_name = "" + + expected = (f"<{arr_names[dtype.storage]}>\n['a', {na_name}, 'b']\n" + + "Length: 3, dtype: string") assert repr(df.A.array) == expected From 85609caff65129e499254b1715c615c34b9eec34 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Thu, 14 Mar 2024 13:26:31 -0600 Subject: [PATCH 15/52] fix coercion tests --- pandas/core/arrays/string_.py | 38 ++++++++++++++++++---- pandas/tests/arrays/string_/test_string.py | 2 +- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 15407e02671fa..6e3852b7f83f8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -584,7 +584,7 @@ def _cmp_method(self, other, op): result = np.zeros(len(self._ndarray), dtype="bool") try: result[valid] = op(self._ndarray[valid], other) - except np.core._exceptions._UFuncNoLoopError: + except np._core._exceptions._UFuncNoLoopError: if hasattr(other, "_ndarray"): other_type = other._ndarray.dtype else: @@ -724,15 +724,41 @@ def _from_sequence( class NumpyStringArray(BaseNumpyStringArray): _na_value = libmissing.NA _storage = "numpy" + _ctor_err_msg = "StringArray requires a sequence of strings or pandas.NA" def __init__(self, values, copy: bool = False) -> None: + default_dtype = get_numpy_string_dtype_instance() try: - values = np.asarray(values, dtype=get_numpy_string_dtype_instance()) + arr_values = np.asarray(values) except (TypeError, ValueError): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if values.size == 0: - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - super().__init__(values, copy=copy) + raise ValueError(self._ctor_err_msg) + # this check exists purely to satisfy test_constructor_raises and could + # be deleted if that restriction was relaxed for NumpyStringArray + if (arr_values.size == 0 or arr_values.dtype.char == "S"): + raise ValueError(self._ctor_err_msg) + try: + str_values = arr_values.astype(default_dtype) + except ValueError: + # we want to emulate ObjectStringArray, which accepts nan and None + # as valid missing values + if arr_values.dtype.kind == "O": + # try again with NA set to np.nan or None + str_values = None + for na_object in (np.nan, None): + try: + dtype = get_numpy_string_dtype_instance( + na_object=na_object, coerce=False) + str_values = arr_values.astype(dtype) + continue + except ValueError: + pass + if str_values is None: + raise ValueError(self._ctor_err_msg) + else: + str_values = str_values.astype(default_dtype) + else: + raise ValueError(self._ctor_err_msg) + super().__init__(str_values, copy=copy) @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b950fbce42cb0..70a2b76908f8f 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -338,7 +338,7 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - if cls is pd.arrays.StringArray: + if cls in (pd.arrays.ObjectStringArray, 
pd.core.arrays.string_.NumpyStringArray): # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype cls(np.array(["a", np.nan], dtype=object)) From 86ffe1c8750e81276b856227f17d246df98367ef Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 15 Mar 2024 09:28:17 -0600 Subject: [PATCH 16/52] more test fixes --- pandas/core/arrays/string_.py | 3 ++- pandas/core/indexes/base.py | 2 +- pandas/tests/arrays/string_/test_string.py | 13 ++++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6e3852b7f83f8..e4cc283e4d0b4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -734,7 +734,8 @@ def __init__(self, values, copy: bool = False) -> None: raise ValueError(self._ctor_err_msg) # this check exists purely to satisfy test_constructor_raises and could # be deleted if that restriction was relaxed for NumpyStringArray - if (arr_values.size == 0 or arr_values.dtype.char == "S"): + if (((arr_values.dtype.char == "d" and arr_values.size == 0) or + (arr_values.dtype.char == "S"))): raise ValueError(self._ctor_err_msg) try: str_values = arr_values.astype(default_dtype) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c2df773326dc9..62725b6ce0d3b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -513,7 +513,7 @@ def __new__( if isinstance(data, ABCMultiIndex): data = data._values - if data.dtype.kind not in "iufcbmM": + if data.dtype.kind not in "iufcbmMT": # GH#11836 we need to avoid having numpy coerce # things that look like ints/floats to ints unless # they are actually ints, e.g. '0' and 0.0 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 70a2b76908f8f..83955705c2e74 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -398,7 +398,7 @@ def test_astype_int(dtype): msg = "cannot convert float NaN to integer" elif dtype.storage == "numpy": err = ValueError - msg = "Arrays with missing data cannot be converted to integers" + msg = "Arrays with missing data cannot be converted to a non-nullable type" else: err = TypeError msg = ( @@ -501,11 +501,10 @@ def test_arrow_array(dtype): expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - if dtype.storage == "python": + if dtype.storage in ("python", "numpy"): expected = pc.cast(expected, pa.string()) assert arr.equals(expected) - @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -521,7 +520,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage == "python": + if dtype.storage in ("python", "numpy"): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -529,6 +528,8 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) expected = df.astype(f"string[{string_storage2}]") + if string_storage2 == "numpy": + pytest.xfail("pyarrow does notsupport conversion to 
string[numpy]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is na_val(result["a"].dtype) @@ -551,12 +552,14 @@ def test_arrow_load_from_zero_chunks( data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage == "python": + if dtype.storage in ("python", "numpy"): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) + if string_storage2 == "numpy": + pytest.xfail("pyarrow does notsupport conversion to string[numpy]") with pd.option_context("string_storage", string_storage2): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) From dc9419dc46a684945bf841a7f3075a6e9969c3b1 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Mon, 18 Mar 2024 13:08:37 -0600 Subject: [PATCH 17/52] fix memory usage test --- pandas/core/arrays/string_.py | 8 ++++++-- pandas/tests/arrays/string_/test_string.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e4cc283e4d0b4..0dc5071c46a82 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -537,8 +537,6 @@ def value_counts(self, dropna: bool = True) -> Series: def memory_usage(self, deep: bool = False) -> int: result = self._ndarray.nbytes - if deep: - return result + lib.memory_usage_of_objects(self._ndarray) return result @doc(ExtensionArray.searchsorted) @@ -717,6 +715,12 @@ def _from_sequence( return new_string_array + def memory_usage(self, deep: bool = False) -> int: + ret = super().memory_usage() + if deep: + ret += lib.memory_usage_of_objects(self._ndarray) + return ret + StringArray = ObjectStringArray diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 83955705c2e74..1193df3f52f6e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -618,7 +618,7 @@ def test_memory_usage(dtype, arrow_string_storage): series = pd.Series(["a", "b", "c"], dtype=dtype) - assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) + assert 0 < series.nbytes <= series.memory_usage() <= series.memory_usage(deep=True) @pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64]) From 155ec68cd6ad288c0ed1e0862f94abeab1c43003 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 19 Mar 2024 15:03:42 -0600 Subject: [PATCH 18/52] Avoid copying in NumpyStringArray initializer --- pandas/core/arrays/string_.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0dc5071c46a82..99bee63cb5de4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -742,7 +742,7 @@ def __init__(self, values, copy: bool = False) -> None: (arr_values.dtype.char == "S"))): raise ValueError(self._ctor_err_msg) try: - str_values = arr_values.astype(default_dtype) + str_values = arr_values.astype(default_dtype, copy=copy) except ValueError: # we want to emulate ObjectStringArray, which accepts nan and None # as valid missing values @@ -777,8 +777,8 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal else: result = arr.astype(get_numpy_string_dtype_instance(), copy=False) - 
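# Hedged usage sketch of the new storage, assuming this patch series is
# applied on top of a NumPy with StringDType; the option value "numpy" and
# the "string[numpy]" spelling are what the config and dtype changes register.
import pandas as pd

pd.set_option("string_storage", "numpy")           # default storage for dtype="string"
s = pd.Series(["a", pd.NA, "b"], dtype="string")   # backed by NumpyStringArray
t = pd.array(["x", "y"], dtype="string[numpy]")    # explicit per-array storage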
# Manually creating new array avoids the validation step in the __init__, so is - # faster. Refactor need for validation? + # Manually creating with new array avoids the validation step in the + # __init__, so is faster. Refactor need for validation? new_string_array = cls.__new__(cls) NDArrayBacked.__init__( new_string_array, result, StringDtype(storage=cls._storage) From 8dadaf9d0fa39a65c17918d8ad3278c89508d9f1 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 26 Mar 2024 14:22:12 -0600 Subject: [PATCH 19/52] more fixes --- pandas/core/arrays/string_.py | 8 +++++++- pandas/core/dtypes/common.py | 2 +- pandas/core/missing.py | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 99bee63cb5de4..59a893eccf237 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -571,6 +571,12 @@ def _cmp_method(self, other, op): other = np.asarray(other) other = other[valid].astype(self._ndarray.dtype) + else: + try: + other = np.asarray(other, dtype=self._ndarray.dtype) + except ValueError: + raise TypeError(f"operation {op.__name__} not supported for " + "the input types") if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray) @@ -582,7 +588,7 @@ def _cmp_method(self, other, op): result = np.zeros(len(self._ndarray), dtype="bool") try: result[valid] = op(self._ndarray[valid], other) - except np._core._exceptions._UFuncNoLoopError: + except TypeError if hasattr(other, "_ndarray"): other_type = other._ndarray.dtype else: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0b3c017cf024d..a0e7ef604aa9e 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1058,7 +1058,7 @@ def is_numeric_v_string_like(a: ArrayLike, b) -> bool: def needs_object_conversion(dtype: DtypeObj | None) -> bool: - return isinstance(dtype, type(get_numpy_string_dtype_instance())) + return dtype.char == "T" def needs_i8_conversion(dtype: DtypeObj | None) -> bool: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 5fc6364032027..ad8fd25e685bd 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -935,7 +935,10 @@ def new_func( if mask is None: # This needs to occur before casting to int64 mask = isna(values) - + result, mask = func(values.astype(object), limit=limit, limit_area=limit_area, + mask=mask) + values[:] = result[:] + return result.astype(values.dtype), mask return func(values, limit=limit, limit_area=limit_area, mask=mask) return cast(F, new_func) From aad5f3257d3c377dafd04a37ae8d41a02fc2e9b5 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 27 Mar 2024 09:34:41 -0600 Subject: [PATCH 20/52] fix SyntaxError --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 59a893eccf237..4acaa3f0215c9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -588,7 +588,7 @@ def _cmp_method(self, other, op): result = np.zeros(len(self._ndarray), dtype="bool") try: result[valid] = op(self._ndarray[valid], other) - except TypeError + except TypeError: if hasattr(other, "_ndarray"): other_type = other._ndarray.dtype else: From 190ffe3a964469aeb4ff4ecde30e2ae2b8cb6c08 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 27 Mar 2024 09:44:00 -0600 Subject: [PATCH 21/52] fix comparisons with scalars --- pandas/core/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 810e30d369729..ef07a95d31a73 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -341,7 +341,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: # GH#36377 going through the numexpr path would incorrectly raise return invalid_comparison(lvalues, rvalues, op) - elif lvalues.dtype == object or isinstance(rvalues, str): + elif lvalues.dtype == object or (lvalues.dtype.kind != "T" and isinstance(rvalues, str)): res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: From dcf2cec35363b42660baa1ee37a229fa87e30e34 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 2 Apr 2024 09:59:45 -0600 Subject: [PATCH 22/52] Implement some ufuncs --- pandas/core/arrays/string_.py | 39 +++++++++++++++++++++++++++++++++ pandas/core/internals/blocks.py | 6 ++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 4acaa3f0215c9..e5f3459ce2b35 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -42,6 +42,7 @@ from pandas.core import ops from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.floating import ( FloatingArray, FloatingDtype, @@ -842,3 +843,41 @@ def to_numpy( if dtype is None and na_value is not lib.no_default: dtype = get_numpy_string_dtype_instance(na_object=na_value) return super().to_numpy(dtype, copy, na_value) + + def _str_find(self, sub, start: int = 0, end=None): + sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) + return np.strings.find(self._ndarray, sub, start, end) + + def _str_rfind(self, sub, start: int = 0, end=None): + sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) + return np.strings.rfind(self._ndarray, sub, start, end) + + def _str_isalnum(self) -> BooleanArray: + return BooleanArray(np.strings.isalnum(self._ndarray), isna(self._ndarray)) + + def _str_isalpha(self) -> BooleanArray: + return BooleanArray(np.strings.isalpha(self._ndarray), isna(self._ndarray)) + + def _str_isdigit(self) -> BooleanArray: + return BooleanArray(np.strings.isdigit(self._ndarray), isna(self._ndarray)) + + def _str_isdecimal(self) -> BooleanArray: + return BooleanArray(np.strings.isdecimal(self._ndarray), isna(self._ndarray)) + + def _str_islower(self) -> BooleanArray: + return BooleanArray(np.strings.islower(self._ndarray), isna(self._ndarray)) + + def _str_isnumeric(self) -> BooleanArray: + return BooleanArray(np.strings.isnumeric(self._ndarray), isna(self._ndarray)) + + def _str_isspace(self) -> BooleanArray: + return BooleanArray(np.strings.isspace(self._ndarray), isna(self._ndarray)) + + def _str_istitle(self) -> BooleanArray: + return BooleanArray(np.strings.istitle(self._ndarray), isna(self._ndarray)) + + def _str_isupper(self) -> BooleanArray: + return BooleanArray(np.strings.isupper(self._ndarray), isna(self._ndarray)) + + def _str_len(self) -> IntegerArray: + return IntegerArray(np.strings.str_len(self._ndarray), isna(self._ndarray)) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 07b3ee70d31dc..074e0b2b09667 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -105,6 +105,7 @@ ExtensionArray, IntervalArray, NumpyExtensionArray, + NumpyStringArray, PeriodArray, TimedeltaArray, ) @@ -2133,7 +2134,10 @@ def 
is_view(self) -> bool: @property def array_values(self) -> ExtensionArray: - return NumpyExtensionArray(self.values) + if self.values.dtype.kind == 'T': + return NumpyStringArray(self.values) + else: + return NumpyExtensionArray(self.values) def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: if dtype == _dtype_obj: From b5cdea8d60c93976ee4fefeba5758c6f24a1b591 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 2 Apr 2024 15:06:08 -0600 Subject: [PATCH 23/52] Add index/rindex --- pandas/core/arrays/string_.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e5f3459ce2b35..2846fa7270f7c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -852,6 +852,16 @@ def _str_rfind(self, sub, start: int = 0, end=None): sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) return np.strings.rfind(self._ndarray, sub, start, end) + def _str_index(self, sub, start: int = 0, end=None): + sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) + result = np.strings.index(self._ndarray, sub, start, end) + return IntegerArray(result, isna(self._ndarray)) + + def _str_rindex(self, sub, start: int = 0, end=None): + sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) + result = np.strings.rindex(self._ndarray, sub, start, end) + return IntegerArray(result, isna(self._ndarray)) + def _str_isalnum(self) -> BooleanArray: return BooleanArray(np.strings.isalnum(self._ndarray), isna(self._ndarray)) From ba0a8b4f77f9672688d7075201a3b2e3e143d186 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 2 Apr 2024 15:06:53 -0600 Subject: [PATCH 24/52] drop unnecessary type annotations in map_infer_mask --- pandas/_libs/lib.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bd3a984162753..a42273fbfff89 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2835,7 +2835,7 @@ NoDefault = Literal[_NoDefault.no_default] def map_infer_mask( - ndarray[object] arr, + ndarray arr, object f, const uint8_t[:] mask, *, @@ -2883,8 +2883,8 @@ def map_infer_mask( @cython.boundscheck(False) @cython.wraparound(False) def _map_infer_mask( - ndarray[uint8_int64_object_t] out, - ndarray[object] arr, + ndarray out, + ndarray arr, object f, const uint8_t[:] mask, object na_value=no_default, From 5691409d495f522664ebe0305e69a3857fbbc758 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 19 Apr 2024 13:16:25 -0600 Subject: [PATCH 25/52] Add more string method implementations --- pandas/core/arrays/string_.py | 94 ++++++++++++++++++++++++----- pandas/core/strings/accessor.py | 4 +- pandas/core/strings/object_array.py | 4 ++ 3 files changed, 83 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2846fa7270f7c..fd41f7489d172 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -844,50 +844,112 @@ def to_numpy( dtype = get_numpy_string_dtype_instance(na_object=na_value) return super().to_numpy(dtype, copy, na_value) - def _str_find(self, sub, start: int = 0, end=None): + def _str_pad(self, width, side="left", fillchar=' '): + if side == 'left': + return np.strings.ljust(self._ndarray, width, fillchar) + elif side == 'right': + return np.strings.rjust(self._ndarray, width, fillchar) + elif side == 'both': + return np.strings.center(self._ndarray, width, fillchar) + raise ValueError("Invalid side") + + def 
_str_endswith(self, pat, na=None) -> BooleanArray: + pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) + result = np.strings.endswith(self._ndarray, pat) + return BooleanArray(result, isna(self._ndarray)) + + def _str_find(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - return np.strings.find(self._ndarray, sub, start, end) + result = np.strings.find(self._ndarray, sub, start, end) + return IntegerArray(result, isna(self._ndarray)) - def _str_rfind(self, sub, start: int = 0, end=None): + def _str_rfind(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - return np.strings.rfind(self._ndarray, sub, start, end) + result = np.strings.rfind(self._ndarray, sub, start, end) + return IntegerArray(result, isna(self._ndarray)) - def _str_index(self, sub, start: int = 0, end=None): + def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) result = np.strings.index(self._ndarray, sub, start, end) return IntegerArray(result, isna(self._ndarray)) - def _str_rindex(self, sub, start: int = 0, end=None): + def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) result = np.strings.rindex(self._ndarray, sub, start, end) return IntegerArray(result, isna(self._ndarray)) def _str_isalnum(self) -> BooleanArray: - return BooleanArray(np.strings.isalnum(self._ndarray), isna(self._ndarray)) + result = np.strings.isalnum(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isalpha(self) -> BooleanArray: - return BooleanArray(np.strings.isalpha(self._ndarray), isna(self._ndarray)) + result = np.strings.isalpha(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isdigit(self) -> BooleanArray: - return BooleanArray(np.strings.isdigit(self._ndarray), isna(self._ndarray)) + result = np.strings.isdigit(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isdecimal(self) -> BooleanArray: - return BooleanArray(np.strings.isdecimal(self._ndarray), isna(self._ndarray)) + result = np.strings.isdecimal(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_islower(self) -> BooleanArray: - return BooleanArray(np.strings.islower(self._ndarray), isna(self._ndarray)) + result = np.strings.islower(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isnumeric(self) -> BooleanArray: - return BooleanArray(np.strings.isnumeric(self._ndarray), isna(self._ndarray)) + result = np.strings.isnumeric(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isspace(self) -> BooleanArray: - return BooleanArray(np.strings.isspace(self._ndarray), isna(self._ndarray)) + result = np.strings.isspace(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_istitle(self) -> BooleanArray: - return BooleanArray(np.strings.istitle(self._ndarray), isna(self._ndarray)) + result = np.strings.istitle(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isupper(self) -> BooleanArray: - return BooleanArray(np.strings.isupper(self._ndarray), isna(self._ndarray)) + result = np.strings.isupper(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_len(self) -> IntegerArray: - return IntegerArray(np.strings.str_len(self._ndarray), isna(self._ndarray)) + result = 
np.strings.str_len(self._ndarray) + return IntegerArray(result, isna(self._ndarray)) + + def _str_lstrip(self, to_strip=None): + if to_strip is not None: + to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) + return np.strings.lstrip(self._ndarray, to_strip) + + def _str_partition(self, sep=' ', expand=True): + return np.column_stack(np.strings.partition(self._ndarray, sep)) + + def _str_rpartition(self, sep=' ', expand=True): + return np.column_stack(np.strings.rpartition(self._ndarray, sep)) + + def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): + if regex: + super()._str_replace(pat, repl, n, case, flags, regex) + pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) + repl = np.asarray(repl, dtype=get_numpy_string_dtype_instance()) + return np.strings.replace(self._ndarray, pat, repl, n) + + def _str_rstrip(self, to_strip=None): + if to_strip is not None: + to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) + return np.strings.rstrip(self._ndarray, to_strip) + + def _str_strip(self, to_strip=None): + if to_strip is not None: + to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) + return np.strings.strip(self._ndarray, to_strip) + + def _str_startswith(self, pat, na=None) -> BooleanArray: + pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) + result = np.strings.startswith(self._ndarray, pat) + return BooleanArray(result, isna(self._ndarray)) + + def _str_zfill(self, width): + return np.strings.zfill(self._ndarray, width) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index c177ed728f549..5270609d8cd87 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1843,9 +1843,7 @@ def zfill(self, width: int): if not is_integer(width): msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - f = lambda x: x.zfill(width) - result = self._data.array._str_map(f) - return self._wrap_result(result) + return self._wrap_result(self._data.array._str_zfill(width)) def slice(self, start=None, stop=None, step=None): """ diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 1481c069b392b..d4ad6417faa48 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -515,3 +515,7 @@ def f(x): return empty_row return [f(val) for val in np.asarray(self)] + + def _str_zfill(self, width): + f = lambda x: x.zfill(width) + return self._str_map(f) From 4b3e48b0fa9e43cf23f3f655f1d6c69ebf831a1b Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 19 Apr 2024 13:31:46 -0600 Subject: [PATCH 26/52] delete unnecessary input sanitization --- pandas/core/arrays/string_.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fd41f7489d172..7ae04a92cf0b8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -572,12 +572,6 @@ def _cmp_method(self, other, op): other = np.asarray(other) other = other[valid].astype(self._ndarray.dtype) - else: - try: - other = np.asarray(other, dtype=self._ndarray.dtype) - except ValueError: - raise TypeError(f"operation {op.__name__} not supported for " - "the input types") if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray) From 1e1d651a09efd2dc829fa41eb53fbbb850c1edf2 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 23 Apr 2024 18:04:42 -0600 Subject: [PATCH 27/52] hotfix issue with 
hashing --- pandas/core/util/hashing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 45492c30e2a83..3567271a5e430 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -310,6 +310,8 @@ def _hash_ndarray( # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. + if not vals.dtype.char == 'O': + vals = vals.astype('object') if categorize: from pandas import ( Categorical, From d27816c6336935dd54877f92cf28ed6702c2bd64 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 09:28:34 -0600 Subject: [PATCH 28/52] Avoid unnecessary copies in NumpyStringArray initializer --- pandas/core/arrays/string_.py | 3 ++- pandas/core/dtypes/common.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7ae04a92cf0b8..2c8be2501b22e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -732,11 +732,12 @@ class NumpyStringArray(BaseNumpyStringArray): _ctor_err_msg = "StringArray requires a sequence of strings or pandas.NA" def __init__(self, values, copy: bool = False) -> None: - default_dtype = get_numpy_string_dtype_instance() try: arr_values = np.asarray(values) except (TypeError, ValueError): raise ValueError(self._ctor_err_msg) + default_dtype = get_numpy_string_dtype_instance( + possible_dtype=getattr(arr_values, "dtype", None)) # this check exists purely to satisfy test_constructor_raises and could # be deleted if that restriction was relaxed for NumpyStringArray if (((arr_values.dtype.char == "d" and arr_values.size == 0) or diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 09a8872939512..ce9d4a3a086ce 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -533,7 +533,11 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: """ return dtype == object or dtype.kind in "SUT" -def get_numpy_string_dtype_instance(na_object=libmissing.NA, coerce=False): +def get_numpy_string_dtype_instance( + na_object=libmissing.NA, + coerce=False, + possible_dtype=None +): """Get a reference to a ``numpy.dtypes.StringDType`` instance. This is a convenience wrapper around the StringDType initializer @@ -546,7 +550,14 @@ def get_numpy_string_dtype_instance(na_object=libmissing.NA, coerce=False): coerce : bool Whether or not non-strings entries in arrays should be converted to strings. 
- """ + possible_dtype : numpy.dtype + Returned as the result if the dtype matches the provided settings + """ + if possible_dtype is not None: + possible_coerce = getattr(possible_dtype, "coerce", True) + possible_na = getattr(possible_dtype, "na_object", None) + if possible_coerce == coerce and possible_na is libmissing.NA: + return possible_dtype return np.dtypes.StringDType(na_object=na_object, coerce=coerce) def is_string_dtype(arr_or_dtype) -> bool: From 19d85bb7ad3864283e42f665e8775ac0359b3d21 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 09:28:55 -0600 Subject: [PATCH 29/52] copy to hotfix issue in groupby --- pandas/core/arrays/string_.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2c8be2501b22e..ed55fd97d0092 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -771,11 +771,14 @@ def __init__(self, values, copy: bool = False) -> None: def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): arr = np.asarray(scalars) if is_object_dtype(arr.dtype): - result = np.empty(arr.shape, dtype=get_numpy_string_dtype_instance()) + result = np.empty(arr.shape, dtype=get_numpy_string_dtype_instance(coerce=True)) na_mask, any_na = libmissing.isnaobj(arr, check_for_any_na=True) result[~na_mask] = arr[~na_mask] if any_na: result[na_mask] = libmissing.NA + # TODO avoid copy + # could temporarily set coerce=True but that's not possible at the moment + result = result.astype(get_numpy_string_dtype_instance()) else: result = arr.astype(get_numpy_string_dtype_instance(), copy=False) From 11778ed006893767a22e62e57a8556f93a741b8c Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 10:39:39 -0600 Subject: [PATCH 30/52] Add stringdtype to more test fixtures --- pandas/conftest.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index c03dab250c8d2..7eaa625051141 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -709,6 +709,12 @@ def _create_mi_with_dt64tz_level(): "string-python": Index( pd.array([f"pandas_{i}" for i in range(100)], dtype="string[python]") ), + "string-numpy": Index( + pd.array([f"pandas_{i}" for i in range(100)], dtype="string[numpy]") + ), + "string-numpy-stringdtype": Index( + np.array([f"pandas_{i}" for i in range(100)], dtype="T") + ), } if has_pyarrow: idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) @@ -1276,6 +1282,7 @@ def string_dtype(request): params=[ "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + "string[numpy]", ] ) def nullable_string_dtype(request): @@ -1284,6 +1291,7 @@ def nullable_string_dtype(request): * 'string[python]' * 'string[pyarrow]' + * 'string[numpy]' """ return request.param @@ -1355,6 +1363,7 @@ def object_dtype(request): "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + "string[numpy]", ] ) def any_string_dtype(request): From 2034a251032868bc79e20513f8a5f24f70dcc2d3 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 11:03:51 -0600 Subject: [PATCH 31/52] revert unnecessary changes to ObjectStringArrayMixin._str_map --- pandas/core/strings/object_array.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 
d4ad6417faa48..91578debc0874 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -66,8 +66,6 @@ def _str_map( convert : bool, default True Whether to call `maybe_convert_objects` on the resulting ndarray """ - from pandas.arrays import BooleanArray - if dtype is None: dtype = np.dtype("object") if na_value is None: @@ -76,7 +74,7 @@ def _str_map( if not len(self): return np.array([], dtype=dtype) - arr = np.asarray(self) + arr = np.asarray(self, dtype=object) mask = isna(arr) map_convert = convert and not np.all(mask) try: @@ -110,18 +108,6 @@ def g(x): np.putmask(result, mask, na_value) if convert and result.dtype == object: result = lib.maybe_convert_objects(result) - - result = result.astype(dtype) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray] | type[BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - return constructor(result, mask) - return result def _str_count(self, pat, flags: int = 0): From 151fe64ae61289145c6d2ee099d1c46006549d74 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 13:11:03 -0600 Subject: [PATCH 32/52] handle NA values for inputs that might be coerced to string --- pandas/core/arrays/string_.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ed55fd97d0092..1792207688b6b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -769,10 +769,10 @@ def __init__(self, values, copy: bool = False) -> None: @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + na_mask, any_na = libmissing.isnaobj(np.array(scalars, dtype=object), check_for_any_na=True) arr = np.asarray(scalars) if is_object_dtype(arr.dtype): result = np.empty(arr.shape, dtype=get_numpy_string_dtype_instance(coerce=True)) - na_mask, any_na = libmissing.isnaobj(arr, check_for_any_na=True) result[~na_mask] = arr[~na_mask] if any_na: result[na_mask] = libmissing.NA @@ -781,6 +781,8 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal result = result.astype(get_numpy_string_dtype_instance()) else: result = arr.astype(get_numpy_string_dtype_instance(), copy=False) + if any_na: + result[na_mask] = libmissing.NA # Manually creating with new array avoids the validation step in the # __init__, so is faster. Refactor need for validation? 
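A minimal standalone sketch of the NA-handling pattern the _from_sequence hunk above relies on, assuming NumPy >= 2.0 (np.dtypes.StringDType with na_object/coerce) and using pd.isna in place of libmissing.isnaobj; the names and sample values are illustrative only and are not part of the patch series:

    import numpy as np
    import pandas as pd

    scalars = ["a", None, "b", float("nan")]

    # Find missing entries on an object-dtype view first; a non-coercing
    # StringDType would reject None/nan outright, so the mask has to be
    # computed before any string conversion happens.
    obj = np.array(scalars, dtype=object)
    na_mask = np.array([pd.isna(v) for v in obj], dtype=bool)

    # Fill a coercing StringDType array (non-string objects are stringified)
    # and write the NA sentinel into the masked slots ...
    coercing = np.dtypes.StringDType(na_object=pd.NA, coerce=True)
    result = np.empty(obj.shape, dtype=coercing)
    result[~na_mask] = obj[~na_mask]
    result[na_mask] = pd.NA

    # ... then cast to the strict (coerce=False) variant used for storage,
    # mirroring the astype(get_numpy_string_dtype_instance()) call in the
    # hunk above.
    strict = np.dtypes.StringDType(na_object=pd.NA, coerce=False)
    result = result.astype(strict)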
From 83944950e36dc42692fb36202db791d7f867da53 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 16:02:43 -0600 Subject: [PATCH 33/52] remove implementations for string methods that won't be available until numpy 2.1 --- pandas/core/arrays/string_.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1792207688b6b..ad1795b349928 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -844,15 +844,6 @@ def to_numpy( dtype = get_numpy_string_dtype_instance(na_object=na_value) return super().to_numpy(dtype, copy, na_value) - def _str_pad(self, width, side="left", fillchar=' '): - if side == 'left': - return np.strings.ljust(self._ndarray, width, fillchar) - elif side == 'right': - return np.strings.rjust(self._ndarray, width, fillchar) - elif side == 'both': - return np.strings.center(self._ndarray, width, fillchar) - raise ValueError("Invalid side") - def _str_endswith(self, pat, na=None) -> BooleanArray: pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) result = np.strings.endswith(self._ndarray, pat) @@ -923,12 +914,6 @@ def _str_lstrip(self, to_strip=None): to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) return np.strings.lstrip(self._ndarray, to_strip) - def _str_partition(self, sep=' ', expand=True): - return np.column_stack(np.strings.partition(self._ndarray, sep)) - - def _str_rpartition(self, sep=' ', expand=True): - return np.column_stack(np.strings.rpartition(self._ndarray, sep)) - def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): if regex: super()._str_replace(pat, repl, n, case, flags, regex) From aa7cec9e30503a106525021d28b9bb460006e742 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 16:03:16 -0600 Subject: [PATCH 34/52] delegate to superclass for some startswith and endswith parameters --- pandas/core/arrays/string_.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ad1795b349928..960ae4b66558d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -845,6 +845,8 @@ def to_numpy( return super().to_numpy(dtype, copy, na_value) def _str_endswith(self, pat, na=None) -> BooleanArray: + if isinstance(pat, tuple) or na is not None: + return super()._str_endswith(pat, na) pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) result = np.strings.endswith(self._ndarray, pat) return BooleanArray(result, isna(self._ndarray)) @@ -932,6 +934,8 @@ def _str_strip(self, to_strip=None): return np.strings.strip(self._ndarray, to_strip) def _str_startswith(self, pat, na=None) -> BooleanArray: + if isinstance(pat, tuple) or na is not None: + return super()._str_startswith(pat, na) pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) result = np.strings.startswith(self._ndarray, pat) return BooleanArray(result, isna(self._ndarray)) From dfedd1e1f96719c17206715037f4ad6109cdfda2 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 16:03:29 -0600 Subject: [PATCH 35/52] fix null entries in findlike ufuncs --- pandas/core/arrays/string_.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 960ae4b66558d..2420cfab1fc02 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -853,23 +853,35 @@ def _str_endswith(self, pat, 
na=None) -> BooleanArray: def _str_find(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - result = np.strings.find(self._ndarray, sub, start, end) - return IntegerArray(result, isna(self._ndarray)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype='int64') + result[~na_mask] = np.strings.find( + self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) def _str_rfind(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - result = np.strings.rfind(self._ndarray, sub, start, end) - return IntegerArray(result, isna(self._ndarray)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype='int64') + result[~na_mask] = np.strings.rfind( + self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - result = np.strings.index(self._ndarray, sub, start, end) - return IntegerArray(result, isna(self._ndarray)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype='int64') + result[~na_mask] = np.strings.index( + self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - result = np.strings.rindex(self._ndarray, sub, start, end) - return IntegerArray(result, isna(self._ndarray)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype='int64') + result[~na_mask] = np.strings.rindex( + self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) def _str_isalnum(self) -> BooleanArray: result = np.strings.isalnum(self._ndarray) From d64dcf89c4d8a82fedf641158be2d8510c8aec79 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:54:47 -0400 Subject: [PATCH 36/52] revert np min API version and try to fix tests --- meson.build | 4 ++-- pandas/conftest.py | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/meson.build b/meson.build index 7831b43833d38..06623a305ab54 100644 --- a/meson.build +++ b/meson.build @@ -24,8 +24,8 @@ add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'cpp') # Allow supporting older numpys than the version compiled against # Set the define to the min supported version of numpy for pandas # e.g. 
right now this is targeting numpy 1.21+ -add_project_arguments('-DNPY_TARGET_VERSION=NPY_2_0_API_VERSION', language : 'c') -add_project_arguments('-DNPY_TARGET_VERSION=NPY_2_0_API_VERSION', language : 'cpp') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'cpp') if fs.exists('_version_meson.py') diff --git a/pandas/conftest.py b/pandas/conftest.py index 7eaa625051141..12976c8367e72 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -51,6 +51,7 @@ utc, ) +from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -709,16 +710,17 @@ def _create_mi_with_dt64tz_level(): "string-python": Index( pd.array([f"pandas_{i}" for i in range(100)], dtype="string[python]") ), - "string-numpy": Index( - pd.array([f"pandas_{i}" for i in range(100)], dtype="string[numpy]") - ), - "string-numpy-stringdtype": Index( - np.array([f"pandas_{i}" for i in range(100)], dtype="T") - ), } if has_pyarrow: idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) indices_dict["string-pyarrow"] = idx +if np_version_gt2: + indices_dict["string-numpy"] = Index( + pd.array([f"pandas_{i}" for i in range(100)], dtype="string[numpy]") + ) + indices_dict["string-numpy-stringdtype"] = Index( + np.array([f"pandas_{i}" for i in range(100)], dtype="T") + ) @pytest.fixture(params=indices_dict.keys()) @@ -1282,7 +1284,7 @@ def string_dtype(request): params=[ "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - "string[numpy]", + pytest.param("string[numpy]", marks=td.skip_if_no("numpy", "2.0")), ] ) def nullable_string_dtype(request): @@ -1300,7 +1302,7 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - "numpy", + pytest.param("numpy", marks=td.skip_if_no("numpy", "2.0")), pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) @@ -1310,6 +1312,7 @@ def string_storage(request): * 'python' * 'pyarrow' + * 'numpy' * 'pyarrow_numpy' """ return request.param @@ -1363,7 +1366,7 @@ def object_dtype(request): "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - "string[numpy]", + pytest.param("string[numpy]", marks=td.skip_if_no("numpy", "2.0")), ] ) def any_string_dtype(request): From 8e32211d7e9344696e41b3d7d168d759d39ee5c1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 4 May 2024 17:31:05 -0400 Subject: [PATCH 37/52] modify base object string array instead --- asv_bench/asv.conf.json | 1 + asv_bench/benchmarks/strings.py | 30 +- pandas/_libs/lib.pyx | 41 +- pandas/_libs/missing.pxd | 2 +- pandas/_libs/missing.pyx | 12 +- pandas/arrays/__init__.py | 1 - pandas/conftest.py | 14 +- pandas/core/arrays/__init__.py | 8 +- pandas/core/arrays/numpy_.py | 34 +- pandas/core/arrays/string_.py | 443 +++++---------------- pandas/core/config_init.py | 2 +- pandas/core/dtypes/astype.py | 7 +- pandas/core/internals/blocks.py | 6 +- pandas/core/strings/object_array.py | 171 +++++++- pandas/tests/arrays/string_/test_string.py | 61 +-- 15 files changed, 352 insertions(+), 481 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 30c692115eab1..0c59858a3bf31 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -42,6 +42,7 @@ // 
followed by the pip installed packages). "matrix": { "pip+build": [], + "numpy": ["2.0rc1"], "Cython": ["3.0"], "matplotlib": [], "sqlalchemy": [], diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index c4fdaf61dc55b..467fab857d306 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,7 +1,6 @@ import warnings import numpy as np -from numpy.dtypes import StringDType from pandas import ( NA, @@ -14,27 +13,14 @@ class Dtypes: - params = [ - "str", - "string[python]", - "string[pyarrow]", - "string[numpy]", - StringDType(), - ] + params = ["str", "string[python]", "string[pyarrow]"] param_names = ["dtype"] - dtype_mapping = { - "str": "str", - "string[python]": object, - "string[pyarrow]": object, - "string[numpy]": StringDType(), - StringDType(): StringDType(), - } def setup(self, dtype): try: self.s = Series( - Index([f"i-{i}" for i in range(10000)], dtype=self.dtype_mapping[dtype])._values, - dtype=dtype + Index([f"i-{i}" for i in range(10000)], dtype=object)._values, + dtype=dtype, ) except ImportError as err: raise NotImplementedError from err @@ -43,17 +29,11 @@ def setup(self, dtype): class Construction: params = ( ["series", "frame", "categorical_series"], - ["str", "string[python]", "string[pyarrow]", "string[numpy]", StringDType()], + ["str", "string[python]", "string[pyarrow]"], ) param_names = ["pd_type", "dtype"] pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series} - dtype_mapping = { - "str": "str", - "string[python]": object, - "string[pyarrow]": object, - "string[numpy]": StringDType(), - StringDType(): StringDType(), - } + dtype_mapping = {"str": "str", "string[python]": object, "string[pyarrow]": object} def setup(self, pd_type, dtype): series_arr = np.array( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 23e71ec3903b2..553133faca6ed 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -53,6 +53,7 @@ from numpy cimport ( PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, + PyArray_SETITEM, complex128_t, flatiter, float64_t, @@ -672,41 +673,37 @@ def is_sequence_range(ndarray[int6432_t, ndim=1] sequence, int64_t step) -> bool return True -ctypedef fused ndarr_object: - ndarray[object, ndim=1] - ndarray[object, ndim=2] - # TODO: get rid of this in StringArray and modify # and go through ensure_string_array instead @cython.wraparound(False) @cython.boundscheck(False) -def convert_nans_to_NA(ndarr_object arr) -> ndarray: +def convert_nans_to_NA(ndarray arr) -> ndarray: """ Helper for StringArray that converts null values that are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements have already been validated as null. 
""" cdef: - Py_ssize_t i, m, n + Py_ssize_t i, m + Py_ssize_t n = len(arr) object val - ndarr_object result - result = np.asarray(arr, dtype="object") - if arr.ndim == 2: - m, n = arr.shape[0], arr.shape[1] - for i in range(m): - for j in range(n): - val = arr[i, j] - if not isinstance(val, str): - result[i, j] = C_NA - else: - n = len(arr) - for i in range(n): - val = arr[i] - if not isinstance(val, str): - result[i] = C_NA - return result + flatiter it = cnp.PyArray_IterNew(arr) + + for i in range(n): + # The PyArray_GETITEM and PyArray_ITER_NEXT are faster + # equivalents to `val = values[i]` + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + + + # Not string so has to be null since they're already validated + if not isinstance(val, str): + val = C_NA + + PyArray_SETITEM(arr, PyArray_ITER_DATA(it), val) + + PyArray_ITER_NEXT(it) @cython.wraparound(False) diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index f2768ae45cccd..899d729690451 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -8,7 +8,7 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=*) cpdef bint check_na_tuples_nonequal(object left, object right) cpdef bint checknull(object val) -cpdef object isnaobj(ndarray arr, bint check_for_any_na=*) +cpdef ndarray[uint8_t] isnaobj(ndarray arr) cdef bint is_null_datetime64(v) cdef bint is_null_timedelta64(v) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 62814e955ca5b..2f44128cda822 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -180,7 +180,7 @@ cdef bint is_decimal_na(object val): @cython.wraparound(False) @cython.boundscheck(False) -cpdef object isnaobj(ndarray arr, bint check_for_any_na=False): +cpdef ndarray[uint8_t] isnaobj(ndarray arr): """ Return boolean mask denoting which elements of a 1-D array are na-like, according to the criteria defined in `checknull`: @@ -195,17 +195,15 @@ cpdef object isnaobj(ndarray arr, bint check_for_any_na=False): Parameters ---------- arr : ndarray - check_for_any_na : boolean - If true, the return value of this function + Returns ------- - result : ndarray (dtype=np.bool_) or tuple of boolean ndarray and a bool + result : ndarray (dtype=np.bool_) """ cdef: Py_ssize_t i, n = arr.size object val bint is_null - bint any_na = 0 ndarray result = np.empty((arr).shape, dtype=np.uint8) flatiter it = cnp.PyArray_IterNew(arr) flatiter it2 = cnp.PyArray_IterNew(result) @@ -218,11 +216,7 @@ cpdef object isnaobj(ndarray arr, bint check_for_any_na=False): is_null = checknull(val) # Dereference pointer (set value) ((cnp.PyArray_ITER_DATA(it2)))[0] = is_null - if not any_na and is_null: - any_na = 1 cnp.PyArray_ITER_NEXT(it2) - if check_for_any_na: - return (result.view(np.bool_), bool(any_na)) return result.view(np.bool_) diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index d4852fd562867..9fd6948f16d50 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -14,7 +14,6 @@ IntegerArray, IntervalArray, NumpyExtensionArray, - ObjectStringArray, PeriodArray, SparseArray, StringArray, diff --git a/pandas/conftest.py b/pandas/conftest.py index 12976c8367e72..21100178262c8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -51,7 +51,6 @@ utc, ) -from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -158,6 +157,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("SeriesGroupBy.fillna", "SeriesGroupBy.fillna is 
deprecated"), ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"), ("SeriesGroupBy.idxmax", "The behavior of Series.idxmax"), + ("to_pytimedelta", "The behavior of TimedeltaProperties.to_pytimedelta"), # Docstring divides by zero to show behavior difference ("missing.mask_zero_div_zero", "divide by zero encountered"), ( @@ -714,13 +714,6 @@ def _create_mi_with_dt64tz_level(): if has_pyarrow: idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) indices_dict["string-pyarrow"] = idx -if np_version_gt2: - indices_dict["string-numpy"] = Index( - pd.array([f"pandas_{i}" for i in range(100)], dtype="string[numpy]") - ) - indices_dict["string-numpy-stringdtype"] = Index( - np.array([f"pandas_{i}" for i in range(100)], dtype="T") - ) @pytest.fixture(params=indices_dict.keys()) @@ -1284,7 +1277,6 @@ def string_dtype(request): params=[ "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[numpy]", marks=td.skip_if_no("numpy", "2.0")), ] ) def nullable_string_dtype(request): @@ -1293,7 +1285,6 @@ def nullable_string_dtype(request): * 'string[python]' * 'string[pyarrow]' - * 'string[numpy]' """ return request.param @@ -1302,7 +1293,6 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - pytest.param("numpy", marks=td.skip_if_no("numpy", "2.0")), pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) @@ -1312,7 +1302,6 @@ def string_storage(request): * 'python' * 'pyarrow' - * 'numpy' * 'pyarrow_numpy' """ return request.param @@ -1366,7 +1355,6 @@ def object_dtype(request): "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[numpy]", marks=td.skip_if_no("numpy", "2.0")), ] ) def any_string_dtype(request): diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 594283c82f112..245a171fea74b 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -17,11 +17,7 @@ period_array, ) from pandas.core.arrays.sparse import SparseArray -from pandas.core.arrays.string_ import ( - NumpyStringArray, - ObjectStringArray, - StringArray, -) +from pandas.core.arrays.string_ import StringArray from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.core.arrays.timedeltas import TimedeltaArray @@ -43,7 +39,5 @@ "period_array", "SparseArray", "StringArray", - "ObjectStringArray", - "NumpyStringArray", "TimedeltaArray", ] diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index d8e72cc6499c9..ab48140857204 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -9,7 +9,10 @@ from pandas._libs import lib from pandas._libs.tslibs import is_supported_dtype -from pandas.compat.numpy import function as nv +from pandas.compat.numpy import ( + function as nv, + np_version_gt2, +) from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike @@ -26,7 +29,10 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.construction import ensure_wrapped_if_datetimelike -from pandas.core.strings.object_array import ObjectStringArrayMixin +from pandas.core.strings.object_array import ( + NumpyStringArrayMixin, + ObjectStringArrayMixin, +) if TYPE_CHECKING: from pandas._typing import ( @@ -43,12 +49,20 @@ 
from pandas import Index +if np_version_gt2: + str_mixin = NumpyStringArrayMixin +else: + str_mixin = ObjectStringArrayMixin + + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" class NumpyExtensionArray( # type: ignore[misc] OpsMixin, NDArrayBackedExtensionArray, - ObjectStringArrayMixin, + NumpyStringArrayMixin, + # str_mixin, + # ObjectStringArrayMixin, ): """ A pandas ExtensionArray for NumPy data. @@ -153,7 +167,12 @@ def dtype(self) -> NumpyEADtype: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: - return np.asarray(self._ndarray, dtype=dtype) + array = self._ndarray + # to_numpy on StringArray backed by StringDType should still return object dtype + # for backwards compat + if self._ndarray.dtype.kind == "T": + array = array.astype(object) + return np.asarray(array, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # Lightly modified version of @@ -496,7 +515,12 @@ def to_numpy( na_value: object = lib.no_default, ) -> np.ndarray: mask = self.isna() - result = np.asarray(self._ndarray, dtype=dtype) + # to_numpy on StringArray backed by StringDType should still return object dtype + # for backwards compat + array = self._ndarray + if self._ndarray.dtype.kind == "T": + array = array.astype(object) + result = np.asarray(array, dtype=dtype) if na_value is not lib.no_default and mask.any(): result = result.copy() result[mask] = na_value diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2420cfab1fc02..55be0f51f2ed4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -17,11 +17,11 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array -from pandas.compat import ( - is_numpy_dev, - pa_version_under10p1, +from pandas.compat import pa_version_under10p1 +from pandas.compat.numpy import ( + function as nv, + np_version_gt2, ) -from pandas.compat.numpy import function as nv from pandas.util._decorators import doc from pandas.core.dtypes.base import ( @@ -30,7 +30,6 @@ register_extension_dtype, ) from pandas.core.dtypes.common import ( - get_numpy_string_dtype_instance, is_array_like, is_bool_dtype, is_integer_dtype, @@ -42,7 +41,6 @@ from pandas.core import ops from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray -from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.floating import ( FloatingArray, FloatingDtype, @@ -86,7 +84,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow", "numpy", "pyarrow_numpy"}, optional + storage : {"python", "pyarrow", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -132,17 +130,15 @@ def __init__(self, storage=None) -> None: storage = "pyarrow_numpy" else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "numpy", "pyarrow_numpy"}: + if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( - "Storage must be 'python', 'pyarrow', 'pyarrow_numpy', " - f"or 'numpy'. Got {storage} instead." + f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " + f"Got {storage} instead." ) if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: raise ImportError( "pyarrow>=10.0.1 is required for PyArrow backed StringArray." 
) - if storage == "numpy" and not is_numpy_dev: - raise ImportError("NumPy backed string storage requires numpy dev") self.storage = storage @property @@ -166,7 +162,6 @@ def construct_from_string(cls, string) -> Self: ``'string'`` pd.options.mode.string_storage, default python ``'string[python]'`` python ``'string[pyarrow]'`` pyarrow - ``'string[numpy]'`` numpy ========================== ============================================== Returns @@ -188,8 +183,6 @@ def construct_from_string(cls, string) -> Self: return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") - elif string == "string[numpy]": - return cls(storage="numpy") elif string == "string[pyarrow_numpy]": return cls(storage="pyarrow_numpy") else: @@ -214,15 +207,11 @@ def construct_array_type( # type: ignore[override] ) if self.storage == "python": - return ObjectStringArray + return StringArray elif self.storage == "pyarrow": return ArrowStringArray - elif self.storage == "numpy": - return NumpyStringArray - elif self.storage == "pyarrow_numpy": - return ArrowStringArrayNumpySemantics else: - raise NotImplementedError + return ArrowStringArrayNumpySemantics def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -291,7 +280,7 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" -class BaseNumpyStringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] +class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] """ Extension array for string data. @@ -378,20 +367,78 @@ class BaseNumpyStringArray(BaseStringArray, NumpyExtensionArray): # type: ignor def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) - super().__init__(values, copy=copy) if not isinstance(values, type(self)): - self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage)) + values = self._validate(values) + super().__init__(values, copy=copy) + NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) - def _validate(self) -> None: + def _validate(self, values) -> None: """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + if len(values) and not lib.is_string_array(values, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != "object": + if values.dtype != "object" and values.dtype.kind != "T": raise ValueError( "StringArray requires a sequence of strings or pandas.NA. Got " - f"'{self._ndarray.dtype}' dtype instead." + f"'{values.dtype}' dtype instead." 
) + # Convert N/A values (if they exist to pd.NA + lib.convert_nans_to_NA(values) + + # Cast to the faster native numpy StringDType in numpy 2.0 + # if it's available + if np_version_gt2: + if not values.dtype.kind == "T": + from numpy.dtypes import StringDType + + values = values.astype(StringDType(na_object=libmissing.NA)) + + return values + + # if self._ndarray.ndim > 2: + # # Ravel if ndims > 2 b/c no cythonized version available + # lib.convert_nans_to_NA(self._ndarray.ravel("K")) + # else: + # lib.convert_nans_to_NA(self._ndarray) + + @classmethod + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "python" + + from pandas.core.arrays.masked import BaseMaskedArray + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result[na_values] = libmissing.NA + + else: + if lib.is_pyarrow_array(scalars): + # pyarrow array; we cannot rely on the "to_numpy" check in + # ensure_string_array because calling scalars.to_numpy would set + # zero_copy_only to True which caused problems see GH#52076 + scalars = np.array(scalars) + # convert non-na-likes to str, and nan-likes to StringDtype().na_value + result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + + # TODO: Support converting directly to string array in ensure_string_array? + if np_version_gt2: + if not result.dtype.kind == "T": + from numpy.dtypes import StringDType + + result = result.astype(StringDType(na_object=libmissing.NA)) + + # Manually creating new array avoids the validation step in the __init__, so is + # faster. Refactor need for validation? + new_string_array = cls.__new__(cls) + NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) + + return new_string_array @classmethod def _from_sequence_of_strings( @@ -401,7 +448,12 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=get_numpy_string_dtype_instance()) + dtype = object + if np_version_gt2: + from numpy.dtypes import StringDType + + dtype = StringDType(na_object=libmissing.NA) + values = np.empty(shape, dtype=dtype) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -413,8 +465,9 @@ def __arrow_array__(self, type=None): if type is None: type = pa.string() - - values = self._ndarray.astype("object").copy() + # TODO: avoid astype to object for numpy StringDType + # once pyarrow supports that + values = self._ndarray.astype("object", copy=True) values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) @@ -463,17 +516,6 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) - def _validate(self): - """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != "object": - raise ValueError( - f"{type(self).__name__} requires a sequence of strings or " - "pandas.NA convertible to a NumPy array with dtype " - f"'object'. Got '{self._ndarray.dtype}' dtype instead." 
- ) - def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) @@ -485,7 +527,7 @@ def astype(self, dtype, copy: bool = True): elif isinstance(dtype, IntegerDtype): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = "0" + arr[mask] = 0 values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) elif isinstance(dtype, FloatingDtype): @@ -500,7 +542,7 @@ def astype(self, dtype, copy: bool = True): elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = "0" + arr[mask] = 0 values = arr.astype(dtype) values[mask] = np.nan return values @@ -538,6 +580,8 @@ def value_counts(self, dropna: bool = True) -> Series: def memory_usage(self, deep: bool = False) -> int: result = self._ndarray.nbytes + if deep and self.dtype == object: + return result + lib.memory_usage_of_objects(self._ndarray) return result @doc(ExtensionArray.searchsorted) @@ -574,10 +618,15 @@ def _cmp_method(self, other, op): other = other[valid].astype(self._ndarray.dtype) if op.__name__ in ops.ARITHMETIC_BINOPS: - result = np.empty_like(self._ndarray) + dtype = object + if np_version_gt2: + from numpy.dtypes import StringDType + + dtype = StringDType(na_object=libmissing.NA) + result = np.empty_like(self._ndarray, dtype=dtype) result[mask] = libmissing.NA result[valid] = op(self._ndarray[valid], other) - return type(self)(result) + return StringArray(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") @@ -591,7 +640,7 @@ def _cmp_method(self, other, op): raise TypeError( f"'{op.__name__}' operator not supported between " f"'{self._ndarray.dtype}' and '{other_type}'" - ) + ) from None return BooleanArray(result, mask) _arith_method = _cmp_method @@ -656,301 +705,3 @@ def _str_map( # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. 
return lib.map_infer_mask(arr, f, mask.view("uint8")) - - -class ObjectStringArray(BaseNumpyStringArray): - _na_value = None - _storage = "python" - - @classmethod - def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=object) - values[:] = libmissing.NA - return cls(values).astype(dtype, copy=False) - - def _validate(self): - super()._validate() - # Check to see if need to convert Na values to pd.NA - if self._ndarray.ndim > 2: - # Ravel if ndims > 2 b/c no cythonized version available - lib.convert_nans_to_NA(self._ndarray.ravel("K")) - else: - lib.convert_nans_to_NA(self._ndarray) - - def _values_for_factorize(self): - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = None - return arr, None - - @classmethod - def _from_sequence( - cls, scalars, *, dtype: Dtype | None = None, copy: bool = False - ) -> Self: - if dtype and not (isinstance(dtype, str) and dtype == "string"): - dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "python" - - from pandas.core.arrays.masked import BaseMaskedArray - - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA - - else: - if lib.is_pyarrow_array(scalars): - # pyarrow array; we cannot rely on the "to_numpy" check in - # ensure_string_array because calling scalars.to_numpy would set - # zero_copy_only to True which caused problems see GH#52076 - scalars = np.array(scalars) - # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) - - # Manually creating new array avoids the validation step in the __init__, so is - # faster. Refactor need for validation? 
- new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) - - return new_string_array - - def memory_usage(self, deep: bool = False) -> int: - ret = super().memory_usage() - if deep: - ret += lib.memory_usage_of_objects(self._ndarray) - return ret - - -StringArray = ObjectStringArray - - -class NumpyStringArray(BaseNumpyStringArray): - _na_value = libmissing.NA - _storage = "numpy" - _ctor_err_msg = "StringArray requires a sequence of strings or pandas.NA" - - def __init__(self, values, copy: bool = False) -> None: - try: - arr_values = np.asarray(values) - except (TypeError, ValueError): - raise ValueError(self._ctor_err_msg) - default_dtype = get_numpy_string_dtype_instance( - possible_dtype=getattr(arr_values, "dtype", None)) - # this check exists purely to satisfy test_constructor_raises and could - # be deleted if that restriction was relaxed for NumpyStringArray - if (((arr_values.dtype.char == "d" and arr_values.size == 0) or - (arr_values.dtype.char == "S"))): - raise ValueError(self._ctor_err_msg) - try: - str_values = arr_values.astype(default_dtype, copy=copy) - except ValueError: - # we want to emulate ObjectStringArray, which accepts nan and None - # as valid missing values - if arr_values.dtype.kind == "O": - # try again with NA set to np.nan or None - str_values = None - for na_object in (np.nan, None): - try: - dtype = get_numpy_string_dtype_instance( - na_object=na_object, coerce=False) - str_values = arr_values.astype(dtype) - continue - except ValueError: - pass - if str_values is None: - raise ValueError(self._ctor_err_msg) - else: - str_values = str_values.astype(default_dtype) - else: - raise ValueError(self._ctor_err_msg) - super().__init__(str_values, copy=copy) - - @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - na_mask, any_na = libmissing.isnaobj(np.array(scalars, dtype=object), check_for_any_na=True) - arr = np.asarray(scalars) - if is_object_dtype(arr.dtype): - result = np.empty(arr.shape, dtype=get_numpy_string_dtype_instance(coerce=True)) - result[~na_mask] = arr[~na_mask] - if any_na: - result[na_mask] = libmissing.NA - # TODO avoid copy - # could temporarily set coerce=True but that's not possible at the moment - result = result.astype(get_numpy_string_dtype_instance()) - else: - result = arr.astype(get_numpy_string_dtype_instance(), copy=False) - if any_na: - result[na_mask] = libmissing.NA - - # Manually creating with new array avoids the validation step in the - # __init__, so is faster. Refactor need for validation? 
- new_string_array = cls.__new__(cls) - NDArrayBacked.__init__( - new_string_array, result, StringDtype(storage=cls._storage) - ) - - return new_string_array - - def _values_for_factorize(self): - arr = self._ndarray.copy() - # sentinel value used by StringHashTable - arr[np.isnan(arr)] = "__nan__" - return arr, "__nan__" - - @classmethod - def _from_factorized(cls, values, original): - values[values == "__nan__"] = libmissing.NA - return original._from_backing_data(values) - - @classmethod - def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=get_numpy_string_dtype_instance()) - return cls(values).astype(dtype, copy=False) - - def _validate(self): - """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != get_numpy_string_dtype_instance(): - raise ValueError( - f"{type(self).__name__} requires a sequence of strings or " - "pandas.NA convertible to a NumPy array with dtype " - f"{get_numpy_string_dtype_instance()}. Got " - f"'{self._ndarray.dtype}' dtype instead." - ) - - def _validate_setitem_value(self, value): - if value is np.nan: - value = np.array(libmissing.NA, dtype=get_numpy_string_dtype_instance()) - return value - - def _validate_scalar(self, fill_value): - fill_value = super()._validate_scalar(fill_value) - if fill_value is np.nan: - fill_value = self.dtype.na_value - if not isinstance(fill_value, str) and fill_value is not self.dtype.na_value: - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - return fill_value - - def to_numpy( - self, - dtype: npt.DTypeLike | None = None, - copy: bool = False, - na_value: object = lib.no_default, - ) -> np.ndarray: - if dtype is None and na_value is not lib.no_default: - dtype = get_numpy_string_dtype_instance(na_object=na_value) - return super().to_numpy(dtype, copy, na_value) - - def _str_endswith(self, pat, na=None) -> BooleanArray: - if isinstance(pat, tuple) or na is not None: - return super()._str_endswith(pat, na) - pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) - result = np.strings.endswith(self._ndarray, pat) - return BooleanArray(result, isna(self._ndarray)) - - def _str_find(self, sub, start: int = 0, end=None) -> IntegerArray: - sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - na_mask = isna(self._ndarray) - result = np.empty_like(self._ndarray, dtype='int64') - result[~na_mask] = np.strings.find( - self._ndarray[~na_mask], sub, start, end) - return IntegerArray(result, na_mask) - - def _str_rfind(self, sub, start: int = 0, end=None) -> IntegerArray: - sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - na_mask = isna(self._ndarray) - result = np.empty_like(self._ndarray, dtype='int64') - result[~na_mask] = np.strings.rfind( - self._ndarray[~na_mask], sub, start, end) - return IntegerArray(result, na_mask) - - def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: - sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - na_mask = isna(self._ndarray) - result = np.empty_like(self._ndarray, dtype='int64') - result[~na_mask] = np.strings.index( - self._ndarray[~na_mask], sub, start, end) - return IntegerArray(result, na_mask) - - def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: - sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - na_mask = isna(self._ndarray) - result = np.empty_like(self._ndarray, 
dtype='int64') - result[~na_mask] = np.strings.rindex( - self._ndarray[~na_mask], sub, start, end) - return IntegerArray(result, na_mask) - - def _str_isalnum(self) -> BooleanArray: - result = np.strings.isalnum(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isalpha(self) -> BooleanArray: - result = np.strings.isalpha(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isdigit(self) -> BooleanArray: - result = np.strings.isdigit(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isdecimal(self) -> BooleanArray: - result = np.strings.isdecimal(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_islower(self) -> BooleanArray: - result = np.strings.islower(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isnumeric(self) -> BooleanArray: - result = np.strings.isnumeric(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isspace(self) -> BooleanArray: - result = np.strings.isspace(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_istitle(self) -> BooleanArray: - result = np.strings.istitle(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isupper(self) -> BooleanArray: - result = np.strings.isupper(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_len(self) -> IntegerArray: - result = np.strings.str_len(self._ndarray) - return IntegerArray(result, isna(self._ndarray)) - - def _str_lstrip(self, to_strip=None): - if to_strip is not None: - to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) - return np.strings.lstrip(self._ndarray, to_strip) - - def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): - if regex: - super()._str_replace(pat, repl, n, case, flags, regex) - pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) - repl = np.asarray(repl, dtype=get_numpy_string_dtype_instance()) - return np.strings.replace(self._ndarray, pat, repl, n) - - def _str_rstrip(self, to_strip=None): - if to_strip is not None: - to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) - return np.strings.rstrip(self._ndarray, to_strip) - - def _str_strip(self, to_strip=None): - if to_strip is not None: - to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) - return np.strings.strip(self._ndarray, to_strip) - - def _str_startswith(self, pat, na=None) -> BooleanArray: - if isinstance(pat, tuple) or na is not None: - return super()._str_startswith(pat, na) - pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) - result = np.strings.startswith(self._ndarray, pat) - return BooleanArray(result, isna(self._ndarray)) - - def _str_zfill(self, width): - return np.strings.zfill(self._ndarray, width) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 559de6eb8d46b..46c9139c3456c 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -460,7 +460,7 @@ def is_terminal() -> bool: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "numpy", "pyarrow_numpy"]), + validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), ) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 51af44460bfb7..56ce521ac0d76 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -97,10 +97,15 @@ def _astype_nansafe( elif np.issubdtype(arr.dtype, 
np.floating) and dtype.kind in "iu": return _astype_float_to_int_nansafe(arr, dtype, copy) - elif arr.dtype == object: + elif arr.dtype == object or arr.dtype.kind == "T": # if we have a datetime/timedelta array of objects # then coerce to datetime64[ns] and use DatetimeArray.astype + # array_to_timedelta64 doesn't support numpy stringdtype yet + # TODO: fix? + if arr.dtype.kind == "T": + arr = arr.astype(object) + if lib.is_np_dtype(dtype, "M"): from pandas.core.arrays import DatetimeArray diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 871c6cca642ed..7f0d4aa09cf6a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -103,7 +103,6 @@ ExtensionArray, IntervalArray, NumpyExtensionArray, - NumpyStringArray, PeriodArray, TimedeltaArray, ) @@ -2105,10 +2104,7 @@ def is_view(self) -> bool: @property def array_values(self) -> ExtensionArray: - if self.values.dtype.kind == 'T': - return NumpyStringArray(self.values) - else: - return NumpyExtensionArray(self.values) + return NumpyExtensionArray(self.values) def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: if dtype == _dtype_obj: diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 91578debc0874..cd034f23f2c67 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -17,12 +17,9 @@ import pandas._libs.missing as libmissing import pandas._libs.ops as libops -from pandas.core.dtypes.common import ( - is_bool_dtype, - is_integer_dtype, -) from pandas.core.dtypes.missing import isna +from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.integer import IntegerArray from pandas.core.strings.base import BaseStringArrayMethods @@ -505,3 +502,169 @@ def f(x): def _str_zfill(self, width): f = lambda x: x.zfill(width) return self._str_map(f) + + +# Tries to use the numpy string ufuncs if possible +# Will fallback to the object string methods even if ufunc is available +# for cases where the .str accessor is called on an array with object dtype + + +class NumpyStringArrayMixin(ObjectStringArrayMixin): + def _str_endswith(self, pat, na=None) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_endswith(pat, na) + if isinstance(pat, tuple) or na is not None: + return super()._str_endswith(pat, na) + + pat = np.asarray(pat, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + result = np.strings.endswith(self._ndarray, pat) + return BooleanArray(result, isna(self._ndarray)) + + def _str_find(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_find(sub, start, end) + sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.find(self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) + + def _str_rfind(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_rfind(sub, start, end) + + sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.rfind(self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) + + def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return 
super()._str_index(sub, start, end) + + sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.index(self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) + + def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_rindex(sub, start, end) + sub = np.asarray(sub, dtype=np.dtypesStringDType(na_object=libmissing.NA)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.rindex(self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) + + def _str_isalnum(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isalnum() + result = np.strings.isalnum(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isalpha(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isalpha() + result = np.strings.isalpha(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isdigit(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isdigit() + result = np.strings.isdigit(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isdecimal(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isdecimal() + result = np.strings.isdecimal(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_islower(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_islower() + result = np.strings.islower(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isnumeric(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isnumeric() + result = np.strings.isnumeric(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isspace(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isspace() + result = np.strings.isspace(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_istitle(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_istitle() + result = np.strings.istitle(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isupper(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isupper() + result = np.strings.isupper(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_len(self) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_len() + result = np.strings.str_len(self._ndarray) + return IntegerArray(result, isna(self._ndarray)) + + def _str_lstrip(self, to_strip=None): + if self._ndarray.dtype == object: + return super()._str_lstrip(to_strip) + if to_strip is not None: + to_strip = np.asarray( + to_strip, dtype=np.dtypes.StringDType(na_object=libmissing.NA) + ) + return np.strings.lstrip(self._ndarray, to_strip) + + def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): + if self._ndarray.dtype == object: + return super()._str_replace(pat, repl, n, case, flags, regex) + if regex: + return super()._str_replace(pat, repl, n, case, flags, regex) + + pat = np.asarray(pat, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + repl = np.asarray(repl, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + 
return np.strings.replace(self._ndarray, pat, repl, n) + + def _str_rstrip(self, to_strip=None): + if self._ndarray.dtype == object: + return super()._str_rstrip(to_strip) + if to_strip is not None: + to_strip = np.asarray( + to_strip, dtype=np.dtypes.StringDType(na_object=libmissing.NA) + ) + return np.strings.rstrip(self._ndarray, to_strip) + + def _str_strip(self, to_strip=None): + if self._ndarray.dtype == object: + return super()._str_strip(to_strip) + if to_strip is not None: + to_strip = np.asarray( + to_strip, dtype=np.dtypes.StringDType(na_object=libmissing.NA) + ) + return np.strings.strip(self._ndarray, to_strip) + + def _str_startswith(self, pat, na=None) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_startswith(pat, na) + if isinstance(pat, tuple) or na is not None: + return super()._str_startswith(pat, na) + pat = np.asarray(pat, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + result = np.strings.startswith(self._ndarray, pat) + return BooleanArray(result, isna(self._ndarray)) + + def _str_zfill(self, width): + if self._ndarray.dtype == object: + return super()._str_zfill(width) + return np.strings.zfill(self._ndarray, width) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 1193df3f52f6e..c048d7c835ef2 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -53,20 +53,15 @@ def test_repr(dtype): expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_names = { - 'pyarrow': 'ArrowStringArray', - 'python': 'ObjectStringArray', - 'numpy': 'NumpyStringArray', - 'pyarrow_numpy': "ArrowStringArrayNumpySemantics" - } - - if dtype.storage == "pyarrow_numpy": - na_name = "nan" + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" + elif dtype.storage == "pyarrow_numpy": + arr_name = "ArrowStringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: - na_name = "" - - expected = (f"<{arr_names[dtype.storage]}>\n['a', {na_name}, 'b']\n" + - "Length: 3, dtype: string") + arr_name = "StringArray" + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected @@ -79,16 +74,14 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - is_string = issubclass(cls, pd.core.arrays.string_.BaseNumpyStringArray) - - if is_string: + if cls is pd.arrays.StringArray: msg = "Cannot set non-string value '10' into a StringArray." else: msg = "Scalar must be NA or str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if is_string: + if cls is pd.arrays.StringArray: msg = "Must provide strings." 
else: msg = "Scalar must be NA or str" @@ -273,6 +266,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): if op_name not in ["__eq__", "__ne__"]: with pytest.raises(TypeError, match="Invalid comparison|not supported between"): getattr(a, op_name)(other) + return result = getattr(a, op_name)(other) @@ -327,7 +321,7 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): - if issubclass(cls, pd.core.arrays.string_.BaseNumpyStringArray): + if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -338,7 +332,7 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - if cls in (pd.arrays.ObjectStringArray, pd.core.arrays.string_.NumpyStringArray): + if cls is pd.arrays.StringArray: # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype cls(np.array(["a", np.nan], dtype=object)) @@ -396,9 +390,6 @@ def test_astype_int(dtype): if dtype.storage == "pyarrow_numpy": err = ValueError msg = "cannot convert float NaN to integer" - elif dtype.storage == "numpy": - err = ValueError - msg = "Arrays with missing data cannot be converted to a non-nullable type" else: err = TypeError msg = ( @@ -501,10 +492,11 @@ def test_arrow_array(dtype): expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - if dtype.storage in ("python", "numpy"): + if dtype.storage == "python": expected = pc.cast(expected, pa.string()) assert arr.equals(expected) + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -520,7 +512,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage in ("python", "numpy"): + if dtype.storage == "python": assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -528,8 +520,6 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) expected = df.astype(f"string[{string_storage2}]") - if string_storage2 == "numpy": - pytest.xfail("pyarrow does notsupport conversion to string[numpy]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is na_val(result["a"].dtype) @@ -552,14 +542,12 @@ def test_arrow_load_from_zero_chunks( data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage in ("python", "numpy"): + if dtype.storage == "python": assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - if string_storage2 == "numpy": - pytest.xfail("pyarrow does notsupport conversion to string[numpy]") with pd.option_context("string_storage", string_storage2): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) @@ -633,23 +621,15 @@ def test_astype_from_float_dtype(float_dtype, dtype): def 
test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - if dtype.storage == "numpy": - res_dtype = np.dtypes.StringDType(na_object=pd.NA, coerce=False) - else: - res_dtype = object - expected = np.array(["a", na_val(dtype), "b"], dtype=res_dtype) + expected = np.array(["a", na_val(dtype), "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) def test_to_numpy_na_value(dtype, nulls_fixture): na_value = nulls_fixture - if dtype.storage == "numpy": - res_dtype = np.dtypes.StringDType(na_object=na_value, coerce=False) - else: - res_dtype = object arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = arr.to_numpy(na_value=na_value) - expected = np.array(["a", na_value, "b"], dtype=res_dtype) + expected = np.array(["a", na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -685,8 +665,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - - if isinstance(ser.array, pd.core.arrays.string_.BaseNumpyStringArray): + if type(ser.array) is pd.arrays.StringArray: msg = "Cannot set non-string value" else: msg = "Scalar must be NA or str" From 187d06886c50fd5c78e0ccd3087b4ac9d80964f4 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 4 May 2024 23:10:56 -0400 Subject: [PATCH 38/52] go for green --- pandas/_libs/hashtable_class_helper.pxi.in | 28 +++++++++++++----- pandas/core/algorithms.py | 33 ++++++++++------------ pandas/core/arrays/_mixins.py | 27 ++++++++++++++++-- pandas/core/arrays/numpy_.py | 4 +-- pandas/core/arrays/string_.py | 33 +++++++++++----------- 5 files changed, 78 insertions(+), 47 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 5c6254c6a1ec7..04037b22bc912 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -5,6 +5,14 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ from cpython.unicode cimport PyUnicode_AsUTF8 +from numpy cimport ( + flatiter, + PyArray_GETITEM, + PyArray_ITER_DATA, + PyArray_ITER_NEXT, + PyArray_IterNew, +) + {{py: # name @@ -1090,7 +1098,7 @@ cdef class StringHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def map_locations(self, ndarray[object] values, object mask = None) -> None: + def map_locations(self, ndarray values, object mask = None) -> None: # mask not yet implemented cdef: Py_ssize_t i, n = len(values) @@ -1099,13 +1107,14 @@ cdef class StringHashTable(HashTable): const char *v const char **vecs khiter_t k + flatiter it = PyArray_IterNew(values) # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() for i in range(n): - val = values[i] + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) if isinstance(val, str): # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize @@ -1115,6 +1124,8 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(self.na_string_sentinel) vecs[i] = v + PyArray_ITER_NEXT(it) + with nogil: for i in range(n): v = vecs[i] @@ -1124,7 +1135,7 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, + def _unique(self, ndarray values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint 
ignore_na=False, bint return_inverse=False): @@ -1171,18 +1182,19 @@ cdef class StringHashTable(HashTable): const char **vecs khiter_t k bint use_na_value + flatiter it = PyArray_IterNew(values) if return_inverse: labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) - use_na_value = na_value is not None + use_na_value = na_value is not None and na_value is not C_NA # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() for i in range(n): - val = values[i] + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) if (ignore_na and (not isinstance(val, str) @@ -1199,6 +1211,8 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(repr(val)) vecs[i] = v + PyArray_ITER_NEXT(it) + # compute with nogil: for i in range(n): @@ -1232,7 +1246,7 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() - def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None): + def unique(self, ndarray values, *, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -1257,7 +1271,7 @@ cdef class StringHashTable(HashTable): return self._unique(values, uniques, ignore_na=False, return_inverse=return_inverse) - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + def factorize(self, ndarray values, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 187792d8ff0e6..f5a461ba74fa1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -178,6 +178,11 @@ def _ensure_data(values: ArrayLike) -> np.ndarray: npvalues = cast(np.ndarray, npvalues) return npvalues + elif values.dtype.kind == "T": + # numpy String Dtype + # no modifications needed + return values + # we have failed, return object values = np.asarray(values, dtype=object) return ensure_object(values) @@ -299,6 +304,9 @@ def _check_object_for_strings(values: np.ndarray) -> str: # StringHashTable and ObjectHashtable if lib.is_string_array(values, skipna=False): ndtype = "string" + elif values.dtype.kind == "T": + # numpy StringDType case + ndtype = "string" return ndtype @@ -921,6 +929,11 @@ def value_counts_arraylike( original = values values = _ensure_data(values) + # TODO: Fixup value_counts in hashtable_func_helper.pxi.in + # to accept numpy StringDType + if values.dtype.kind == "T": + values = values.astype(object) + keys, counts, na_counter = htable.value_count(values, dropna, mask=mask) if needs_i8_conversion(original.dtype): @@ -1678,25 +1691,9 @@ def map_array( if not len(arr): return arr.copy() - if isinstance(arr.dtype, np.dtype): - ret_dtype = arr.dtype - else: - # NJG TODO: simplify this - try: - ret_dtype = arr._ndarray.dtype - except AttributeError: - ret_dtype = None - # we must convert to python types values = arr.astype(object, copy=False) if na_action is None: - ret = lib.map_infer(values, mapper) + return lib.map_infer(values, mapper) else: - ret = lib.map_infer_mask( - values, mapper, mask=isna(values).view(np.uint8)) - - if ret.dtype == object and ret_dtype is not None: - # cast from object back to StringDType - return ret.astype(ret_dtype, copy=False) - - return ret + return lib.map_infer_mask(values, mapper, mask=isna(values).view(np.uint8)) diff --git a/pandas/core/arrays/_mixins.py 
b/pandas/core/arrays/_mixins.py index cbd0221cc2082..b9afe6f3752b6 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -11,7 +11,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas._libs.arrays import NDArrayBacked from pandas._libs.tslibs import is_supported_dtype from pandas._typing import ( @@ -42,6 +45,7 @@ ExtensionDtype, PeriodDtype, ) +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import array_equivalent from pandas.core import missing @@ -400,7 +404,26 @@ def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self: """ value = self._validate_setitem_value(value) - res_values = np.where(mask, self._ndarray, value) + # Note: For backwards compatibility purposes + # StringArray returns an object array in __array__ + # when it is backed by a numpy StringDType + # We need to work around that here. + if hasattr(value, "_ndarray") and value._ndarray.dtype.kind == "T": + value = value._ndarray + + # np.where will not preserve the StringDType + # TODO: ask Nathan about this + # also TODO: this is a mess + if self._ndarray.dtype.kind == "T": + if value is np.nan: + value = libmissing.NA + res_values = self._ndarray.copy() + res_values[~mask] = value + elif is_array_like(value): + value = np.asarray(value, dtype=self._ndarray.dtype) + res_values = np.where(mask, self._ndarray, value) + else: + res_values = np.where(mask, self._ndarray, value) if res_values.dtype != self._ndarray.dtype: raise AssertionError( # GH#56410 diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index ab48140857204..77b0171cfe1c1 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -60,9 +60,7 @@ class NumpyExtensionArray( # type: ignore[misc] OpsMixin, NDArrayBackedExtensionArray, - NumpyStringArrayMixin, - # str_mixin, - # ObjectStringArrayMixin, + str_mixin, ): """ A pandas ExtensionArray for NumPy data. diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 55be0f51f2ed4..3f2675752dddd 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -388,9 +388,9 @@ def _validate(self, values) -> None: # if it's available if np_version_gt2: if not values.dtype.kind == "T": - from numpy.dtypes import StringDType - - values = values.astype(StringDType(na_object=libmissing.NA)) + values = values.astype( + np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) + ) return values @@ -429,9 +429,9 @@ def _from_sequence( # TODO: Support converting directly to string array in ensure_string_array? if np_version_gt2: if not result.dtype.kind == "T": - from numpy.dtypes import StringDType - - result = result.astype(StringDType(na_object=libmissing.NA)) + result = result.astype( + np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) + ) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? 
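
The hunks above and below construct np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) in several places. As a minimal standalone sketch of how such a dtype instance behaves — assuming NumPy >= 2.0, and using pandas.NA, which is the same singleton that libmissing.NA refers to; this is illustrative only and not part of the patch:

import numpy as np
import pandas as pd

# pd.NA round-trips as the missing value; coerce=False means non-string,
# non-NA inputs raise instead of being silently stringified.
dt = np.dtypes.StringDType(na_object=pd.NA, coerce=False)

arr = np.array(["a", pd.NA, "b"], dtype=dt)
assert arr[1] is pd.NA                       # the na_object comes back unchanged
assert np.strings.str_len(arr[:1])[0] == 1   # np.strings ufuncs work on StringDType arrays
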
@@ -448,12 +448,10 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - dtype = object + arr_dtype = object if np_version_gt2: - from numpy.dtypes import StringDType - - dtype = StringDType(na_object=libmissing.NA) - values = np.empty(shape, dtype=dtype) + arr_dtype = np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) + values = np.empty(shape, dtype=arr_dtype) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -473,9 +471,12 @@ def __arrow_array__(self, type=None): def _values_for_factorize(self) -> tuple[np.ndarray, None]: arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = None - return arr, None + if self._ndarray.dtype == object: + mask = self.isna() + arr[mask] = None + return arr, None + else: + return arr, libmissing.NA def __setitem__(self, key, value) -> None: value = extract_array(value, extract_numpy=True) @@ -620,9 +621,7 @@ def _cmp_method(self, other, op): if op.__name__ in ops.ARITHMETIC_BINOPS: dtype = object if np_version_gt2: - from numpy.dtypes import StringDType - - dtype = StringDType(na_object=libmissing.NA) + dtype = np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) result = np.empty_like(self._ndarray, dtype=dtype) result[mask] = libmissing.NA result[valid] = op(self._ndarray[valid], other) From 3626c63cba75932b89383a24d9ef6f701c352481 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 5 May 2024 11:06:43 -0400 Subject: [PATCH 39/52] try again for green --- pandas/_libs/hashtable_class_helper.pxi.in | 3 ++- pandas/core/arrays/string_.py | 4 ++-- pandas/core/dtypes/cast.py | 4 ++-- pandas/tests/copy_view/test_array.py | 8 +++++++- pandas/tests/copy_view/test_astype.py | 15 ++++++++++++++- 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 04037b22bc912..b03ec01077a09 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1207,7 +1207,8 @@ cdef class StringHashTable(HashTable): # if ignore_na is False, we also stringify NaN/None/etc. 
try: v = PyUnicode_AsUTF8(val) - except UnicodeEncodeError: + except (UnicodeEncodeError,TypeError): + # pd.NA will raise TypeError v = PyUnicode_AsUTF8(repr(val)) vecs[i] = v diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3f2675752dddd..533052b32b611 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -528,7 +528,7 @@ def astype(self, dtype, copy: bool = True): elif isinstance(dtype, IntegerDtype): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = 0 + arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) elif isinstance(dtype, FloatingDtype): @@ -543,7 +543,7 @@ def astype(self, dtype, copy: bool = True): elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = 0 + arr[mask] = "0" values = arr.astype(dtype) values[mask] = np.nan return values diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0594f0edd4d5d..e96855bc1e31e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -729,7 +729,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): dtype = np.dtype(np.object_) # in case we have a string that looked like a number - if dtype.kind == "U": + if dtype.kind in "SU": dtype = np.dtype(np.object_) fill_value = _ensure_dtype_type(fill_value, dtype) @@ -1481,7 +1481,7 @@ def find_common_type(types): if t.kind in "iufc": return np.dtype("object") - return np.result_type(*types) + return np_find_common_type(*types) def construct_2d_arraylike_from_scalar( diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index bb238d08bd9bd..5fb4e5a2e8ab8 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas import ( DataFrame, Series, @@ -120,7 +122,11 @@ def test_dataframe_array_ea_dtypes(): def test_dataframe_array_string_dtype(): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) - assert np.shares_memory(arr, get_array(df, "a")) + if not np_version_gt2: + # Numpy 2.0 will return an object array in __array__ + # despite there actually being a StringArray backing the df + # for backwards compatibility reasons + assert np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is False diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 2d959bb16e7d5..b0cb5a922d89d 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -84,6 +85,10 @@ def test_astype_numpy_to_ea(): assert np.shares_memory(get_array(ser), get_array(result)) +@pytest.mark.skipif( + np_version_gt2, + reason="When numpy 2.0 is available, StringArray is not backed by object array", +) @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -97,6 +102,10 @@ def test_astype_string_and_object(dtype, new_dtype): tm.assert_frame_equal(df, df_orig) +@pytest.mark.skipif( + np_version_gt2, + reason="When numpy 2.0 is available, StringArray is not backed by object array", +) @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -217,7 +226,11 @@ def test_convert_dtypes(): 
df_orig = df.copy() df2 = df.convert_dtypes() - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if not np_version_gt2: + # With numpy 2.0, StringArray will no longer be backed by an object array + # but a numpy StringDType backed array + # so this equivalence doesn't hold anymore + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) From 908c9e1cb7ada7ed4ef5a738dcd83f5cf8392819 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 6 May 2024 11:49:02 -0400 Subject: [PATCH 40/52] hopefully fix hashtable stuff --- pandas/_libs/hashtable.pyx | 3 ++ pandas/_libs/hashtable_class_helper.pxi.in | 56 +++++++++++++++++++++- pandas/_libs/khash.pxd | 8 ++++ pandas/core/strings/object_array.py | 2 +- 4 files changed, 66 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 97fae1d6480ce..1ebe7ce23eee4 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -25,6 +25,9 @@ from pandas._libs.khash cimport ( are_equivalent_float64_t, are_equivalent_khcomplex64_t, are_equivalent_khcomplex128_t, + kh_end, + kh_exist, + kh_key, kh_needed_n_buckets, kh_python_hash_equal, kh_python_hash_func, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b03ec01077a09..ebf97251e79f1 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -13,6 +13,9 @@ from numpy cimport ( PyArray_IterNew, ) + +from libc.string cimport strdup + {{py: # name @@ -978,7 +981,12 @@ cdef class StringHashTable(HashTable): kh_resize_str(self.table, size_hint) def __dealloc__(self): + cdef: + khiter_t k if self.table is not NULL: + for k in range(kh_end(self.table)): + if kh_exist(self.table, k): + free(kh_key(self.table, k)) kh_destroy_str(self.table) self.table = NULL @@ -1059,7 +1067,7 @@ cdef class StringHashTable(HashTable): return labels @cython.boundscheck(False) - def lookup(self, ndarray[object] values, object mask = None) -> ndarray: + def lookup(self, ndarray values, object mask = None) -> ndarray: # -> np.ndarray[np.intp] # mask not yet implemented cdef: @@ -1069,13 +1077,14 @@ cdef class StringHashTable(HashTable): const char *v khiter_t k intp_t[::1] locs = np.empty(n, dtype=np.intp) + flatiter it = PyArray_IterNew(values) # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() for i in range(n): - val = values[i] + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) if isinstance(val, str): # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize @@ -1083,8 +1092,20 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(val) else: v = PyUnicode_AsUTF8(self.na_string_sentinel) + + # Need to copy result from PyUnicode_AsUTF8 when we have + # numpy strings + # Since numpy strings aren't backed by object arrays + # the buffer returned by PyUnicode_AsUTF8 will get freed + # in the next iteration when the created str object is GC'ed, + # clobbering the value of v + #if values.dtype.kind == "T": + v = strdup(v) + vecs[i] = v + PyArray_ITER_NEXT(it) + with nogil: for i in range(n): v = vecs[i] @@ -1094,6 +1115,11 @@ cdef class StringHashTable(HashTable): else: locs[i] = -1 + if values.dtype.kind == "T": + # free copied strings 
+ for i in range(n): + free(vecs[i]) + free(vecs) return np.asarray(locs) @@ -1122,6 +1148,16 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(val) else: v = PyUnicode_AsUTF8(self.na_string_sentinel) + + # Need to copy result from PyUnicode_AsUTF8 when we have + # numpy strings + # Since numpy strings aren't backed by object arrays + # the buffer returned by PyUnicode_AsUTF8 will get freed + # in the next iteration when the created str object is GC'ed, + # clobbering the value of v + #if values.dtype.kind == "T": + v = strdup(v) + vecs[i] = v PyArray_ITER_NEXT(it) @@ -1131,6 +1167,7 @@ cdef class StringHashTable(HashTable): v = vecs[i] k = kh_put_str(self.table, v, &ret) self.table.vals[k] = i + free(vecs) @cython.boundscheck(False) @@ -1210,6 +1247,16 @@ cdef class StringHashTable(HashTable): except (UnicodeEncodeError,TypeError): # pd.NA will raise TypeError v = PyUnicode_AsUTF8(repr(val)) + + # Need to copy result from PyUnicode_AsUTF8 when we have + # numpy strings + # Since numpy strings aren't backed by object arrays + # the buffer returned by PyUnicode_AsUTF8 will get freed + # in the next iteration when the created str object is GC'ed, + # clobbering the value of v + #if values.dtype.kind == "T": + v = strdup(v) + vecs[i] = v PyArray_ITER_NEXT(it) @@ -1237,6 +1284,11 @@ cdef class StringHashTable(HashTable): idx = self.table.vals[k] labels[i] = idx + #if values.dtype.kind == "T": + # free copied strings + # for i in range(n): + # free(vecs[i]) + free(vecs) # uniques diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index c439e1cca772b..f450551febd84 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -125,5 +125,13 @@ cdef extern from "pandas/vendored/klib/khash_python.h": khuint_t kh_needed_n_buckets(khuint_t element_n) nogil + # Needed to free the strings we copied in StringHashTable + + khuint_t kh_end(kh_str_t* h) nogil + + int kh_exist(kh_str_t* h, khuint_t x) nogil + + void* kh_key(kh_str_t* h, khuint_t x) nogil + include "khash_for_primitive_helper.pxi" diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index cd034f23f2c67..f052f8f68539a 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -552,7 +552,7 @@ def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: if self._ndarray.dtype == object: return super()._str_rindex(sub, start, end) - sub = np.asarray(sub, dtype=np.dtypesStringDType(na_object=libmissing.NA)) + sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) na_mask = isna(self._ndarray) result = np.empty_like(self._ndarray, dtype="int64") result[~na_mask] = np.strings.rindex(self._ndarray[~na_mask], sub, start, end) From 70be1f64ef4f51184878ad4a6df9401066db92ed Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 7 May 2024 17:17:40 -0400 Subject: [PATCH 41/52] wip --- pandas/_libs/hashtable_class_helper.pxi.in | 10 ++-------- pandas/core/construction.py | 5 ++++- pandas/core/strings/object_array.py | 14 +++++++++++--- pandas/tests/base/test_misc.py | 7 ++----- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index ebf97251e79f1..f220798d9b3d8 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1029,6 +1029,8 @@ cdef class 
StringHashTable(HashTable): v = PyUnicode_AsUTF8(key) + v = strdup(v) + k = kh_put_str(self.table, v, &ret) if kh_exist_str(self.table, k): self.table.vals[k] = val @@ -1099,7 +1101,6 @@ cdef class StringHashTable(HashTable): # the buffer returned by PyUnicode_AsUTF8 will get freed # in the next iteration when the created str object is GC'ed, # clobbering the value of v - #if values.dtype.kind == "T": v = strdup(v) vecs[i] = v @@ -1155,7 +1156,6 @@ cdef class StringHashTable(HashTable): # the buffer returned by PyUnicode_AsUTF8 will get freed # in the next iteration when the created str object is GC'ed, # clobbering the value of v - #if values.dtype.kind == "T": v = strdup(v) vecs[i] = v @@ -1254,7 +1254,6 @@ cdef class StringHashTable(HashTable): # the buffer returned by PyUnicode_AsUTF8 will get freed # in the next iteration when the created str object is GC'ed, # clobbering the value of v - #if values.dtype.kind == "T": v = strdup(v) vecs[i] = v @@ -1284,11 +1283,6 @@ cdef class StringHashTable(HashTable): idx = self.table.vals[k] labels[i] = idx - #if values.dtype.kind == "T": - # free copied strings - # for i in range(n): - # free(vecs[i]) - free(vecs) # uniques diff --git a/pandas/core/construction.py b/pandas/core/construction.py index e24bc0f72e6f2..ab7c083e832c3 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -735,7 +735,10 @@ def _sanitize_str_dtypes( # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. - if result.dtype.kind == "U": + + # TODO: Don't cast for numpy 2.0 StringDType and directly create + # StringArray? + if issubclass(result.dtype.type, str): # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, result has already the result diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index f052f8f68539a..f11bb63396a1d 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -546,7 +546,11 @@ def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) na_mask = isna(self._ndarray) result = np.empty_like(self._ndarray, dtype="int64") - result[~na_mask] = np.strings.index(self._ndarray[~na_mask], sub, start, end) + if start is None: + start = 0 + result[~na_mask] = np.strings.index( + self._ndarray[~na_mask], sub, start=start, end=end + ) return IntegerArray(result, na_mask) def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: @@ -555,7 +559,11 @@ def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) na_mask = isna(self._ndarray) result = np.empty_like(self._ndarray, dtype="int64") - result[~na_mask] = np.strings.rindex(self._ndarray[~na_mask], sub, start, end) + if start is None: + start = 0 + result[~na_mask] = np.strings.rindex( + self._ndarray[~na_mask], sub, start=start, end=end + ) return IntegerArray(result, na_mask) def _str_isalnum(self) -> BooleanArray: @@ -630,7 +638,7 @@ def _str_lstrip(self, to_strip=None): def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): if self._ndarray.dtype == object: return super()._str_replace(pat, repl, n, case, flags, regex) - if regex: + if regex or case is not None: return super()._str_replace(pat, repl, n, case, flags, regex) pat = np.asarray(pat, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) diff --git 
a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index f6a4396ca5be0..6f00f12094724 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -83,7 +83,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif( PYPY or using_pyarrow_string_dtype(), - reason="not relevant for PyPy doesn't work properly for arrow strings", + reason="not relevant for PyPy, doesn't work properly for arrow strings", ) def test_memory_usage(index_or_series_memory_obj): obj = index_or_series_memory_obj @@ -102,14 +102,11 @@ def test_memory_usage(index_or_series_memory_obj): is_categorical = isinstance(obj.dtype, pd.CategoricalDtype) or ( is_ser and isinstance(obj.index.dtype, pd.CategoricalDtype) ) - is_object_string = is_dtype_equal(obj, "string[python]") or ( - is_ser and is_dtype_equal(obj.index.dtype, "string[python]") - ) if len(obj) == 0: expected = 0 assert res_deep == res == expected - elif is_object or is_categorical or is_object_string: + elif is_object or is_categorical: # only deep will pick them up assert res_deep > res else: From ffe133b5f8b8ed85b59083b7a5301cf91c6add23 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 10 May 2024 12:24:58 -0600 Subject: [PATCH 42/52] Update test for directly passing in numpy StringDType arrays --- pandas/tests/frame/test_constructors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 53476c2f7ce38..66efc53e4c83d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -3063,12 +3063,12 @@ def test_from_dict_with_columns_na_scalar(self): {"a": ["a", "b", "c"], "b": [1.0, 2.0, 3.0], "c": ["d", "e", "f"]}, ], ) - def test_np_string_array_object_cast(self, data): + def test_np_string_array(self, data): from numpy.dtypes import StringDType data["a"] = np.array(data["a"], dtype=StringDType()) res = DataFrame(data) - assert res["a"].dtype == np.object_ + assert res["a"].dtype == np.dtypes.StringDType() assert (res["a"] == data["a"]).all() From b684da0398293bea6d36baf23df0f493a51d4a65 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 10 May 2024 12:46:47 -0600 Subject: [PATCH 43/52] xfail memory usage test --- pandas/tests/base/test_misc.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index f6a4396ca5be0..237196cee75ce 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -6,6 +6,7 @@ from pandas._config import using_pyarrow_string_dtype from pandas.compat import PYPY +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.common import ( is_dtype_equal, @@ -85,7 +86,7 @@ def test_ndarray_compat_properties(index_or_series_obj): PYPY or using_pyarrow_string_dtype(), reason="not relevant for PyPy doesn't work properly for arrow strings", ) -def test_memory_usage(index_or_series_memory_obj): +def test_memory_usage(index_or_series_memory_obj, request): obj = index_or_series_memory_obj # Clear index caches so that len(obj) == 0 report 0 memory usage if isinstance(obj, Series): @@ -105,6 +106,11 @@ def test_memory_usage(index_or_series_memory_obj): is_object_string = is_dtype_equal(obj, "string[python]") or ( is_ser and is_dtype_equal(obj.index.dtype, "string[python]") ) + if is_object_string and np_version_gt2: + mark = pytest.mark.xfail( + True, + reason="NumPy does not expose an API to get StringDType 
memory usage") + request.applymarker(mark) if len(obj) == 0: expected = 0 From 7e0649f3189ddb2b90e18eb4df195e0714230330 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 19 May 2024 16:35:12 -0700 Subject: [PATCH 44/52] update --- pandas/core/arrays/_mixins.py | 13 +++---------- pandas/core/arrays/numpy_.py | 6 +++--- pandas/core/strings/object_array.py | 4 +++- pandas/tests/base/test_misc.py | 10 ++++------ pandas/tests/frame/test_constructors.py | 2 +- 5 files changed, 14 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 378782c5d8201..2cb8fc19d8abc 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -411,17 +411,10 @@ def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self: """ value = self._validate_setitem_value(value) - # Note: For backwards compatibility purposes - # StringArray returns an object array in __array__ - # when it is backed by a numpy StringDType - # We need to work around that here. - if hasattr(value, "_ndarray") and value._ndarray.dtype.kind == "T": - value = value._ndarray - - # np.where will not preserve the StringDType - # TODO: ask Nathan about this - # also TODO: this is a mess if self._ndarray.dtype.kind == "T": + # Handling non-string values and numpy StringDtype + # explicitly since we don't want to end up with object + # and lose the string dtype if value is np.nan: value = libmissing.NA res_values = self._ndarray.copy() diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 77b0171cfe1c1..040ff72d2486a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -166,7 +166,7 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: array = self._ndarray - # to_numpy on StringArray backed by StringDType should still return object dtype + # np.array on StringArray backed by StringDType should still return object dtype # for backwards compat if self._ndarray.dtype.kind == "T": array = array.astype(object) @@ -516,8 +516,8 @@ def to_numpy( # to_numpy on StringArray backed by StringDType should still return object dtype # for backwards compat array = self._ndarray - if self._ndarray.dtype.kind == "T": - array = array.astype(object) + if dtype is None and self._ndarray.dtype.kind == "T": + dtype = object result = np.asarray(array, dtype=dtype) if na_value is not lib.no_default and mask.any(): result = result.copy() diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index f11bb63396a1d..ba66bd844165b 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -623,7 +623,9 @@ def _str_isupper(self) -> BooleanArray: def _str_len(self) -> IntegerArray: if self._ndarray.dtype == object: return super()._str_len() - result = np.strings.str_len(self._ndarray) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.str_len(self._ndarray[~na_mask]) return IntegerArray(result, isna(self._ndarray)) def _str_lstrip(self, to_strip=None): diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index a844379cf860e..796531d1c4ad9 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -103,19 +103,17 @@ def test_memory_usage(index_or_series_memory_obj, request): is_categorical = isinstance(obj.dtype, pd.CategoricalDtype) or ( is_ser and isinstance(obj.index.dtype, 
pd.CategoricalDtype) ) - is_object_string = is_dtype_equal(obj, "string[python]") or ( + is_string_array = is_dtype_equal(obj, "string[python]") or ( is_ser and is_dtype_equal(obj.index.dtype, "string[python]") ) - if is_object_string and np_version_gt2: - mark = pytest.mark.xfail( - True, reason="NumPy does not expose an API to get StringDType memory usage" - ) + if is_string_array and np_version_gt2: + mark = pytest.mark.xfail(reason="NumPy does not expose an API to get StringDType memory usage") request.applymarker(mark) if len(obj) == 0: expected = 0 assert res_deep == res == expected - elif is_object or is_categorical: + elif is_object or is_categorical or is_string_array: # only deep will pick them up assert res_deep > res else: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a6038ae43778f..c44bf61c2028c 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -3068,7 +3068,7 @@ def test_np_string_array(self, data): data["a"] = np.array(data["a"], dtype=StringDType()) res = DataFrame(data) - assert res["a"].dtype == np.dtypes.StringDType() + assert res["a"].dtype == np.object_ assert (res["a"] == data["a"]).all() From fd2ba65c7a3892b61e5d93d228e177642fde069b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 19 May 2024 20:57:04 -0700 Subject: [PATCH 45/52] fix ci --- pandas/core/nanops.py | 3 ++- pandas/tests/indexes/test_numpy_compat.py | 4 ++++ pandas/tests/series/test_reductions.py | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4ecf9e1a06f4e..10249c338c16d 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -32,6 +32,7 @@ npt, ) from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.common import ( is_complex, @@ -155,7 +156,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: # Bottleneck chokes on datetime64, PeriodDtype (or and EA) if ( dtype != object - and dtype != np.dtypes.StringDType(na_object=libmissing.NA) + and (np_version_gt2 and dtype != np.dtypes.StringDType(na_object=libmissing.NA)) and not needs_i8_conversion(dtype) ): # GH 42878 diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index ace78d77350cb..a28c286f025f1 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -124,6 +124,10 @@ def test_numpy_ufuncs_other(index, func): with tm.external_error_raised(TypeError): func(index) + elif index.dtype == "string[python]" and func is np.isnan: + with tm.external_error_raised(ValueError): + func(index) + elif is_numeric_dtype(index) and not ( is_complex_dtype(index) and func is np.signbit ): diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 0bc3092d30b43..02922ef685e47 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -191,10 +191,10 @@ def test_mean_dont_convert_j_to_complex(): with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric|does not support" + msg = "Could not convert string 'J' to numeric|does not support|Cannot pass" with pytest.raises(TypeError, match=msg): df["db"].mean() - msg = "Could not convert string 'J' to numeric|ufunc 'divide'" + msg = "Could not convert string 'J' to numeric|ufunc 
     with pytest.raises(TypeError, match=msg):
         np.mean(df["db"].astype("string").array)
 

From f3015069023c59cb34626a46d186dbb34842b3f8 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 19 May 2024 21:29:30 -0700
Subject: [PATCH 46/52] try to fix rest

---
 pandas/core/arrays/numpy_.py | 11 +++++++----
 pandas/core/nanops.py        | 10 ++--------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 040ff72d2486a..33f1f6cd5594e 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -516,12 +516,15 @@ def to_numpy(
         # to_numpy on StringArray backed by StringDType should still return object dtype
         # for backwards compat
         array = self._ndarray
-        if dtype is None and self._ndarray.dtype.kind == "T":
-            dtype = object
-        result = np.asarray(array, dtype=dtype)
+        if self._ndarray.dtype.kind == "T":
+            array = array.astype(object)
         if na_value is not lib.no_default and mask.any():
-            result = result.copy()
+            result = array.copy()
             result[mask] = na_value
+        else:
+            result = self._ndarray
+
+        result = np.asarray(result, dtype=dtype)
 
         if copy and result is self._ndarray:
             result = result.copy()
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 10249c338c16d..5b74893585b89 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -18,7 +18,6 @@
     NaTType,
     iNaT,
     lib,
-    missing as libmissing,
 )
 from pandas._typing import (
     ArrayLike,
@@ -32,7 +31,6 @@
     npt,
 )
 from pandas.compat._optional import import_optional_dependency
-from pandas.compat.numpy import np_version_gt2
 
 from pandas.core.dtypes.common import (
     is_complex,
@@ -153,12 +151,8 @@ def f(
 
 
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
-    # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
-    if (
-        dtype != object
-        and (np_version_gt2 and dtype != np.dtypes.StringDType(na_object=libmissing.NA))
-        and not needs_i8_conversion(dtype)
-    ):
+    # Bottleneck chokes on datetime64, numpy strins, PeriodDtype (or and EA)
+    if dtype != object and dtype.kind != "T" and not needs_i8_conversion(dtype):
         # GH 42878
         # Bottleneck uses naive summation leading to O(n) loss of precision
         # unlike numpy which implements pairwise summation, which has O(log(n)) loss

From 2c46b75d6a9e17484cd905c50feea1c6d820aa88 Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 12:38:39 -0600
Subject: [PATCH 47/52] avoid nanops test failures

---
 pandas/core/nanops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 5b74893585b89..8dd88936e50a1 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -151,7 +151,9 @@ def f(
 
 
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
-    # Bottleneck chokes on datetime64, numpy strins, PeriodDtype (or and EA)
+    if issubclass(dtype, np.generic):
+        dtype = np.dtype(dtype)
+    # Bottleneck chokes on datetime64, numpy strings, PeriodDtype (or and EA)
     if dtype != object and dtype.kind != "T" and not needs_i8_conversion(dtype):
         # GH 42878
         # Bottleneck uses naive summation leading to O(n) loss of precision

From 4a538a05ce2711893e4143ba24febfc03f83d121 Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 15:07:09 -0600
Subject: [PATCH 48/52] fix ruff lints

---
 pandas/core/missing.py         | 4 ++--
 pandas/core/ops/array_ops.py   | 3 ++-
 pandas/tests/base/test_misc.py | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index 4938ac8070837..2c79d5f480e6b 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -898,8 +898,8 @@ def new_func(
         if mask is None:
             # This needs to occur before casting to int64
             mask = isna(values)
-            result, mask = func(values.astype(object), limit=limit, limit_area=limit_area,
-                                mask=mask)
+            result, mask = func(values.astype(object), limit=limit,
+                                limit_area=limit_area, mask=mask)
             values[:] = result[:]
             return result.astype(values.dtype), mask
         return func(values, limit=limit, limit_area=limit_area, mask=mask)
diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
index 378f135278b58..bff3b5260f560 100644
--- a/pandas/core/ops/array_ops.py
+++ b/pandas/core/ops/array_ops.py
@@ -339,7 +339,8 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
         # GH#36377 going through the numexpr path would incorrectly raise
         return invalid_comparison(lvalues, rvalues, op)
 
-    elif lvalues.dtype == object or (lvalues.dtype.kind != "T" and isinstance(rvalues, str)):
+    elif (lvalues.dtype == object or
+          (lvalues.dtype.kind != "T" and isinstance(rvalues, str))):
         res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
 
     else:
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 796531d1c4ad9..2cf6d94d6cb97 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -107,7 +107,8 @@ def test_memory_usage(index_or_series_memory_obj, request):
         is_ser and is_dtype_equal(obj.index.dtype, "string[python]")
     )
     if is_string_array and np_version_gt2:
-        mark = pytest.mark.xfail(reason="NumPy does not expose an API to get StringDType memory usage")
+        mark = pytest.mark.xfail(
+            reason="NumPy does not expose an API to get StringDType memory usage")
         request.applymarker(mark)
 
     if len(obj) == 0:

From c88884af526bcb5ac2268ea3cdae889dd071c4fd Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 15:09:50 -0600
Subject: [PATCH 49/52] fix cython lints

---
 pandas/_libs/lib.pyx | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 4fce3c5743dc1..0fa00af85cfcd 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -685,7 +685,7 @@ def convert_nans_to_NA(ndarray arr) -> ndarray:
     have already been validated as null.
     """
     cdef:
-        Py_ssize_t i, m
+        Py_ssize_t i
         Py_ssize_t n = len(arr)
         object val
         flatiter it = cnp.PyArray_IterNew(arr)
@@ -695,7 +695,6 @@ def convert_nans_to_NA(ndarray arr) -> ndarray:
         # equivalents to `val = values[i]`
         val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
 
-        # Not string so has to be null since they're already validated
         if not isinstance(val, str):
             val = C_NA
 
@@ -1572,8 +1571,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
         # Anything other than object-dtype should return here.
         return inferred
     elif values.dtype.kind == "T":
-        # NumPy StringDType
-        return values.dtype
+        # NumPy StringDType
+        return values.dtype
 
     if values.descr.type_num != NPY_OBJECT:
         # i.e. values.dtype != np.object_
@@ -1589,7 +1588,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
     it = PyArray_IterNew(values)
     for i in range(n):
         # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
-        # equivalents to `val = values[i]`
+        # equivalents to `val = values[i]`
         val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
         PyArray_ITER_NEXT(it)
 

From d0e3f1eebe1a45e03ec1bc410b27d275a818e0e6 Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 15:10:25 -0600
Subject: [PATCH 50/52] fix more ruff lints

---
 pandas/core/dtypes/missing.py | 1 -
 pandas/core/util/hashing.py   | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 50c2e20d8d9db..9d690cd8f0185 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -24,7 +24,6 @@
     DT64NS_DTYPE,
     TD64NS_DTYPE,
     ensure_object,
-    get_numpy_string_dtype_instance,
     is_scalar,
     is_string_or_object_np_dtype,
 )
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 3567271a5e430..d2534e00b25c5 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -310,8 +310,8 @@ def _hash_ndarray(
     # With repeated values, its MUCH faster to categorize object dtypes,
     # then hash and rename categories. We allow skipping the categorization
     # when the values are known/likely to be unique.
-    if not vals.dtype.char == 'O':
-        vals = vals.astype('object')
+    if not vals.dtype.char == "O":
+        vals = vals.astype("object")
     if categorize:
         from pandas import (
             Categorical,

From a175c7ac4de4ffed1848af73cd471d74583b15d2 Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 15:18:38 -0600
Subject: [PATCH 51/52] run ruff-format

---
 pandas/_libs/missing.pyi       | 4 +---
 pandas/core/dtypes/common.py   | 6 +++---
 pandas/core/missing.py         | 5 +++--
 pandas/core/ops/array_ops.py   | 5 +++--
 pandas/tests/base/test_misc.py | 3 ++-
 5 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi
index c121136537206..ea6dbae1879a2 100644
--- a/pandas/_libs/missing.pyi
+++ b/pandas/_libs/missing.pyi
@@ -15,9 +15,7 @@ def isposinf_scalar(val: object) -> bool: ...
 def isneginf_scalar(val: object) -> bool: ...
 def checknull(val: object) -> bool: ...
 @overload
-def isnaobj(
-    arr: np.ndarray, check_for_any_na=...
-) -> npt.NDArray[np.bool_]: ...
+def isnaobj(arr: np.ndarray, check_for_any_na=...) -> npt.NDArray[np.bool_]: ...
 @overload
 def isnaobj(
     arr: np.ndarray, check_for_any_na=True
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index ce9d4a3a086ce..820bda8fcb7c0 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -533,10 +533,9 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool:
     """
     return dtype == object or dtype.kind in "SUT"
 
+
 def get_numpy_string_dtype_instance(
-    na_object=libmissing.NA,
-    coerce=False,
-    possible_dtype=None
+    na_object=libmissing.NA, coerce=False, possible_dtype=None
 ):
     """Get a reference to a ``numpy.dtypes.StringDType`` instance.
 
@@ -560,6 +559,7 @@ def get_numpy_string_dtype_instance(
         return possible_dtype
     return np.dtypes.StringDType(na_object=na_object, coerce=coerce)
 
+
 def is_string_dtype(arr_or_dtype) -> bool:
     """
     Check whether the provided array or dtype is of the string dtype.
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index 2c79d5f480e6b..136c42651bfdb 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -898,8 +898,9 @@ def new_func(
         if mask is None:
             # This needs to occur before casting to int64
             mask = isna(values)
-            result, mask = func(values.astype(object), limit=limit,
-                                limit_area=limit_area, mask=mask)
+            result, mask = func(
+                values.astype(object), limit=limit, limit_area=limit_area, mask=mask
+            )
             values[:] = result[:]
             return result.astype(values.dtype), mask
         return func(values, limit=limit, limit_area=limit_area, mask=mask)
diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
index bff3b5260f560..ac615963e3638 100644
--- a/pandas/core/ops/array_ops.py
+++ b/pandas/core/ops/array_ops.py
@@ -339,7 +339,8 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
         # GH#36377 going through the numexpr path would incorrectly raise
         return invalid_comparison(lvalues, rvalues, op)
 
-    elif (lvalues.dtype == object or
-          (lvalues.dtype.kind != "T" and isinstance(rvalues, str))):
+    elif lvalues.dtype == object or (
+        lvalues.dtype.kind != "T" and isinstance(rvalues, str)
+    ):
         res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
 
     else:
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 2cf6d94d6cb97..907dda1e1739d 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -108,7 +108,8 @@ def test_memory_usage(index_or_series_memory_obj, request):
     )
     if is_string_array and np_version_gt2:
         mark = pytest.mark.xfail(
-            reason="NumPy does not expose an API to get StringDType memory usage")
+            reason="NumPy does not expose an API to get StringDType memory usage"
+        )
         request.applymarker(mark)
 
     if len(obj) == 0:

From 961a67ca3e69bd1f494a83644514f92a50f4976c Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 15:56:00 -0600
Subject: [PATCH 52/52] tweak for nanops case

---
 pandas/core/nanops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 8dd88936e50a1..7233685b27af6 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -151,7 +151,7 @@ def f(
 
 
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
-    if issubclass(dtype, np.generic):
+    if isinstance(dtype, type):
         dtype = np.dtype(dtype)
     # Bottleneck chokes on datetime64, numpy strings, PeriodDtype (or and EA)
     if dtype != object and dtype.kind != "T" and not needs_i8_conversion(dtype):