diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 30c692115eab1..0c59858a3bf31 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -42,6 +42,7 @@ // followed by the pip installed packages). "matrix": { "pip+build": [], + "numpy": ["2.0rc1"], "Cython": ["3.0"], "matplotlib": [], "sqlalchemy": [], diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index b5ae5a3440f39..e6c0a26aca92f 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -25,6 +25,9 @@ from pandas._libs.khash cimport ( are_equivalent_float64_t, are_equivalent_khcomplex64_t, are_equivalent_khcomplex128_t, + kh_end, + kh_exist, + kh_key, kh_needed_n_buckets, kh_python_hash_equal, kh_python_hash_func, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 210df09f07db6..a0939029b65e5 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -5,6 +5,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ from cpython.unicode cimport PyUnicode_AsUTF8 +from numpy cimport ( + flatiter, + PyArray_GETITEM, + PyArray_ITER_DATA, + PyArray_ITER_NEXT, + PyArray_IterNew, +) + + +from libc.string cimport strdup + {{py: # name @@ -970,7 +981,12 @@ cdef class StringHashTable(HashTable): kh_resize_str(self.table, size_hint) def __dealloc__(self): + cdef: + khiter_t k if self.table is not NULL: + for k in range(kh_end(self.table)): + if kh_exist(self.table, k): + free(kh_key(self.table, k)) kh_destroy_str(self.table) self.table = NULL @@ -1013,6 +1029,8 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(key) + v = strdup(v) + k = kh_put_str(self.table, v, &ret) if kh_exist_str(self.table, k): self.table.vals[k] = val @@ -1051,7 +1069,7 @@ cdef class StringHashTable(HashTable): return labels @cython.boundscheck(False) - def lookup(self, ndarray[object] values, object mask = None) -> ndarray: + def lookup(self, 
ndarray values, object mask = None) -> ndarray: # -> np.ndarray[np.intp] # mask not yet implemented cdef: @@ -1061,13 +1079,14 @@ cdef class StringHashTable(HashTable): const char *v khiter_t k intp_t[::1] locs = np.empty(n, dtype=np.intp) + flatiter it = PyArray_IterNew(values) # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() for i in range(n): - val = values[i] + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) if isinstance(val, str): # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize @@ -1075,8 +1094,19 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(val) else: v = PyUnicode_AsUTF8(self.na_string_sentinel) + + # Need to copy result from PyUnicode_AsUTF8 when we have + # numpy strings + # Since numpy strings aren't backed by object arrays + # the buffer returned by PyUnicode_AsUTF8 will get freed + # in the next iteration when the created str object is GC'ed, + # clobbering the value of v + v = strdup(v) + vecs[i] = v + PyArray_ITER_NEXT(it) + with nogil: for i in range(n): v = vecs[i] @@ -1086,11 +1116,16 @@ cdef class StringHashTable(HashTable): else: locs[i] = -1 + if values.dtype.kind == "T": + # free copied strings + for i in range(n): + free(vecs[i]) + free(vecs) return np.asarray(locs) @cython.boundscheck(False) - def map_locations(self, ndarray[object] values, object mask = None) -> None: + def map_locations(self, ndarray values, object mask = None) -> None: # mask not yet implemented cdef: Py_ssize_t i, n = len(values) @@ -1099,13 +1134,14 @@ cdef class StringHashTable(HashTable): const char *v const char **vecs khiter_t k + flatiter it = PyArray_IterNew(values) # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() for i in range(n): - val = values[i] + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) if isinstance(val, str): # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize @@ 
-1113,18 +1149,30 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(val) else: v = PyUnicode_AsUTF8(self.na_string_sentinel) + + # Need to copy result from PyUnicode_AsUTF8 when we have + # numpy strings + # Since numpy strings aren't backed by object arrays + # the buffer returned by PyUnicode_AsUTF8 will get freed + # in the next iteration when the created str object is GC'ed, + # clobbering the value of v + v = strdup(v) + vecs[i] = v + PyArray_ITER_NEXT(it) + with nogil: for i in range(n): v = vecs[i] k = kh_put_str(self.table, v, &ret) self.table.vals[k] = i + free(vecs) @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, + def _unique(self, ndarray values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, bint return_inverse=False): @@ -1171,11 +1219,13 @@ cdef class StringHashTable(HashTable): const char **vecs khiter_t k bint use_na_value + flatiter it = PyArray_IterNew(values) bint non_null_na_value if return_inverse: labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) + use_na_value = na_value is not None non_null_na_value = not checknull(na_value) @@ -1184,7 +1234,7 @@ cdef class StringHashTable(HashTable): if vecs is NULL: raise MemoryError() for i in range(n): - val = values[i] + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) if (ignore_na and (not isinstance(val, str) @@ -1202,10 +1252,22 @@ cdef class StringHashTable(HashTable): # if ignore_na is False, we also stringify NaN/None/etc. 
try: v = PyUnicode_AsUTF8(val) - except UnicodeEncodeError: + except (UnicodeEncodeError,TypeError): + # pd.NA will raise TypeError v = PyUnicode_AsUTF8(repr(val)) + + # Need to copy result from PyUnicode_AsUTF8 when we have + # numpy strings + # Since numpy strings aren't backed by object arrays + # the buffer returned by PyUnicode_AsUTF8 will get freed + # in the next iteration when the created str object is GC'ed, + # clobbering the value of v + v = strdup(v) + vecs[i] = v + PyArray_ITER_NEXT(it) + # compute with nogil: for i in range(n): @@ -1239,7 +1301,7 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() - def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None): + def unique(self, ndarray values, *, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -1264,7 +1326,7 @@ cdef class StringHashTable(HashTable): return self._unique(values, uniques, ignore_na=False, return_inverse=return_inverse) - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + def factorize(self, ndarray values, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) 
diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index c439e1cca772b..f450551febd84 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -125,5 +125,13 @@ cdef extern from "pandas/vendored/klib/khash_python.h": khuint_t kh_needed_n_buckets(khuint_t element_n) nogil + # Needed to free the strings we copied in StringHashTable + + khuint_t kh_end(kh_str_t* h) nogil + + int kh_exist(kh_str_t* h, khuint_t x) nogil + + void* kh_key(kh_str_t* h, khuint_t x) nogil + include "khash_for_primitive_helper.pxi" diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1a2a0142c52e..27d9798009a5d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -675,41 +675,36 @@ def is_sequence_range(const int6432_t[:] sequence, int64_t step) -> bool: return True -ctypedef fused ndarr_object: - ndarray[object, ndim=1] - ndarray[object, ndim=2] - # TODO: get rid of this in StringArray and modify # and go through ensure_string_array instead @cython.wraparound(False) @cython.boundscheck(False) -def convert_nans_to_NA(ndarr_object arr) -> ndarray: +def convert_nans_to_NA(ndarray arr) -> ndarray: """ Helper for StringArray that converts null values that are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements have already been validated as null. 
""" cdef: - Py_ssize_t i, m, n + Py_ssize_t i + Py_ssize_t n = len(arr) object val - ndarr_object result - result = np.asarray(arr, dtype="object") - if arr.ndim == 2: - m, n = arr.shape[0], arr.shape[1] - for i in range(m): - for j in range(n): - val = arr[i, j] - if not isinstance(val, str): - result[i, j] = C_NA - else: - n = len(arr) - for i in range(n): - val = arr[i] - if not isinstance(val, str): - result[i] = C_NA - return result + flatiter it = cnp.PyArray_IterNew(arr) + + for i in range(n): + # The PyArray_GETITEM and PyArray_ITER_NEXT are faster + # equivalents to `val = values[i]` + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + + # Not string so has to be null since they're already validated + if not isinstance(val, str): + val = C_NA + + PyArray_SETITEM(arr, PyArray_ITER_DATA(it), val) + + PyArray_ITER_NEXT(it) @cython.wraparound(False) @@ -1475,6 +1470,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str: - mixed - unknown-array + Returns a dtype object for non-legacy numpy dtypes + Raises ------ TypeError @@ -1585,6 +1582,9 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if inferred is not None: # Anything other than object-dtype should return here. return inferred + elif values.dtype.kind == "T": + # NumPy StringDType + return values.dtype if values.descr.type_num != NPY_OBJECT: # i.e. 
values.dtype != np.object_ @@ -1600,7 +1600,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: it = PyArray_IterNew(values) for i in range(n): # The PyArray_GETITEM and PyArray_ITER_NEXT are faster - # equivalents to `val = values[i]` + # equivalents to `val = values[i]` val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) PyArray_ITER_NEXT(it) @@ -1911,7 +1911,10 @@ cdef class StringValidator(Validator): return isinstance(value, str) cdef bint is_array_typed(self) except -1: - return self.dtype.type_num == cnp.NPY_UNICODE + if self.dtype.char == "T" or self.dtype.char == "U": + return True + # this lets user-defined string DTypes through + return issubclass(self.dtype.typeobj, (np.str_, str)) cpdef bint is_string_array(ndarray values, bint skipna=False): diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index 6bf30a03cef32..ea6dbae1879a2 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -1,3 +1,5 @@ +from typing import overload + import numpy as np from numpy import typing as npt @@ -12,5 +14,10 @@ def is_matching_na( def isposinf_scalar(val: object) -> bool: ... def isneginf_scalar(val: object) -> bool: ... def checknull(val: object) -> bool: ... -def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ... +@overload +def isnaobj(arr: np.ndarray, check_for_any_na=...) -> npt.NDArray[np.bool_]: ... +@overload +def isnaobj( + arr: np.ndarray, check_for_any_na=True +) -> tuple[npt.NDArray[np.bool_], bool]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... 
diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index b5c1c98da1c78..9fd6948f16d50 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -33,5 +33,6 @@ "PeriodArray", "SparseArray", "StringArray", + "ObjectStringArray", "TimedeltaArray", ] diff --git a/pandas/conftest.py b/pandas/conftest.py index d11213f1164bc..8e428b4123254 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -150,7 +150,6 @@ def pytest_collection_modifyitems(items, config) -> None: ("SeriesGroupBy.fillna", "SeriesGroupBy.fillna is deprecated"), ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"), ("SeriesGroupBy.idxmax", "The behavior of Series.idxmax"), - ("to_pytimedelta", "The behavior of TimedeltaProperties.to_pytimedelta"), ("NDFrame.reindex_like", "keyword argument 'method' is deprecated"), # Docstring divides by zero to show behavior difference ("missing.mask_zero_div_zero", "divide by zero encountered"), diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 56f8adda93251..8a168800ff6d8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -179,6 +179,11 @@ def _ensure_data(values: ArrayLike) -> np.ndarray: npvalues = cast(np.ndarray, npvalues) return npvalues + elif values.dtype.kind == "T": + # numpy String Dtype + # no modifications needed + return values + # we have failed, return object values = np.asarray(values, dtype=object) return ensure_object(values) @@ -304,6 +309,9 @@ def _check_object_for_strings(values: np.ndarray) -> str: # StringHashTable and ObjectHashtable if lib.is_string_array(values, skipna=False): ndtype = "string" + elif values.dtype.kind == "T": + # numpy StringDType case + ndtype = "string" return ndtype @@ -937,6 +945,11 @@ def value_counts_arraylike( original = values values = _ensure_data(values) + # TODO: Fixup value_counts in hashtable_func_helper.pxi.in + # to accept numpy StringDType + if values.dtype.kind == "T": + values = values.astype(object) + keys, counts, 
na_counter = htable.value_count(values, dropna, mask=mask) if needs_i8_conversion(original.dtype): diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 4e6f20e6ad3dd..8b27eeb31ac73 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -11,7 +11,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas._libs.arrays import NDArrayBacked from pandas._libs.tslibs import is_supported_dtype from pandas._typing import ( @@ -42,6 +45,7 @@ ExtensionDtype, PeriodDtype, ) +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import array_equivalent from pandas.core import missing @@ -407,7 +411,19 @@ def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self: """ value = self._validate_setitem_value(value) - res_values = np.where(mask, self._ndarray, value) + if self._ndarray.dtype.kind == "T": + # Handling non-string values and numpy StringDtype + # explicitly since we don't want to end up with object + # and lose the string dtype + if value is np.nan: + value = libmissing.NA + res_values = self._ndarray.copy() + res_values[~mask] = value + elif is_array_like(value): + value = np.asarray(value, dtype=self._ndarray.dtype) + res_values = np.where(mask, self._ndarray, value) + else: + res_values = np.where(mask, self._ndarray, value) if res_values.dtype != self._ndarray.dtype: raise AssertionError( # GH#56410 diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index aafcd82114b97..5398d69465638 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -9,7 +9,10 @@ from pandas._libs import lib from pandas._libs.tslibs import is_supported_dtype -from pandas.compat.numpy import function as nv +from pandas.compat.numpy import ( + function as nv, + np_version_gt2, +) from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import 
construct_1d_object_array_from_listlike @@ -26,7 +29,10 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.construction import ensure_wrapped_if_datetimelike -from pandas.core.strings.object_array import ObjectStringArrayMixin +from pandas.core.strings.object_array import ( + NumpyStringArrayMixin, + ObjectStringArrayMixin, +) if TYPE_CHECKING: from pandas._typing import ( @@ -43,12 +49,18 @@ from pandas import Index +if np_version_gt2: + str_mixin = NumpyStringArrayMixin +else: + str_mixin = ObjectStringArrayMixin + + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" class NumpyExtensionArray( # type: ignore[misc] OpsMixin, NDArrayBackedExtensionArray, - ObjectStringArrayMixin, + str_mixin, ): """ A pandas ExtensionArray for NumPy data. @@ -150,7 +162,12 @@ def dtype(self) -> NumpyEADtype: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: - return np.asarray(self._ndarray, dtype=dtype) + array = self._ndarray + # np.array on StringArray backed by StringDType should still return object dtype + # for backwards compat + if self._ndarray.dtype.kind == "T": + array = array.astype(object) + return np.asarray(array, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # Lightly modified version of @@ -493,8 +510,13 @@ def to_numpy( na_value: object = lib.no_default, ) -> np.ndarray: mask = self.isna() + # to_numpy on StringArray backed by StringDType should still return object dtype + # for backwards compat + array = self._ndarray + if self._ndarray.dtype.kind == "T": + array = array.astype(object) if na_value is not lib.no_default and mask.any(): - result = self._ndarray.copy() + result = array.copy() result[mask] = na_value else: result = self._ndarray diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py 
index 143a13c54dbbb..7c35eec587fed 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -25,7 +25,10 @@ HAS_PYARROW, pa_version_under10p1, ) -from pandas.compat.numpy import function as nv +from pandas.compat.numpy import ( + function as nv, + np_version_gt2, +) from pandas.util._decorators import doc from pandas.core.dtypes.base import ( @@ -560,30 +563,37 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) - super().__init__(values, copy=copy) if not isinstance(values, type(self)): - self._validate() + values = self._validate(values) + super().__init__(values, copy=copy) NDArrayBacked.__init__( self, self._ndarray, StringDtype(storage=self._storage, na_value=self._na_value), ) - def _validate(self) -> None: + def _validate(self, values) -> None: """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + if len(values) and not lib.is_string_array(values, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != "object": + if values.dtype != "object" and values.dtype.kind != "T": raise ValueError( "StringArray requires a sequence of strings or pandas.NA. Got " - f"'{self._ndarray.dtype}' dtype instead." + f"'{values.dtype}' dtype instead." 
) - # Check to see if need to convert Na values to pd.NA - if self._ndarray.ndim > 2: - # Ravel if ndims > 2 b/c no cythonized version available - lib.convert_nans_to_NA(self._ndarray.ravel("K")) - else: - lib.convert_nans_to_NA(self._ndarray) + # Convert N/A values (if they exist) to pd.NA + # TODO: maybe astype instead if input is a numpy string array + lib.convert_nans_to_NA(values) + + # Cast to the faster native numpy StringDType in numpy 2.0 + # if it's available + if np_version_gt2: + if not values.dtype.kind == "T": + values = values.astype( + np.dtypes.StringDType(na_object=self._na_value, coerce=False) + ) + + return values def _validate_scalar(self, value): # used by NDArrayBackedExtensionIndex.insert @@ -627,6 +637,13 @@ def _from_sequence( # convert non-na-likes to str, and nan-likes to StringDtype().na_value result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy) + # TODO: Support converting directly to string array in ensure_string_array? + if np_version_gt2: + if not result.dtype.kind == "T": + result = result.astype( + np.dtypes.StringDType(na_object=na_value, coerce=False) + ) + + # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation?
new_string_array = cls.__new__(cls) @@ -642,7 +659,10 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=object) + arr_dtype = object + if np_version_gt2: + arr_dtype = np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) + values = np.empty(shape, dtype=arr_dtype) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -654,8 +674,9 @@ def __arrow_array__(self, type=None): if type is None: type = pa.string() - - values = self._ndarray.copy() + # TODO: avoid astype to object for numpy StringDType + # once pyarrow supports that + values = self._ndarray.astype("object", copy=True) values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) @@ -714,7 +735,7 @@ def astype(self, dtype, copy: bool = True): elif isinstance(dtype, IntegerDtype): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = 0 + arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) elif isinstance(dtype, FloatingDtype): @@ -729,7 +750,7 @@ def astype(self, dtype, copy: bool = True): elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = 0 + arr[mask] = "0" values = arr.astype(dtype) values[mask] = np.nan return values @@ -791,7 +812,7 @@ def value_counts(self, dropna: bool = True) -> Series: def memory_usage(self, deep: bool = False) -> int: result = self._ndarray.nbytes - if deep: + if deep and self.dtype == object: return result + lib.memory_usage_of_objects(self._ndarray) return result @@ -815,9 +836,15 @@ def _cmp_method(self, other, op): if isinstance(other, StringArray): other = other._ndarray + # TODO: masking shouldn't be required when numpy 2.0 is the min + # NaN operations will automatically propagate mask = isna(self) | isna(other) valid = ~mask + dtype = object + if np_version_gt2: + dtype = np.dtypes.StringDType(na_object=self.dtype.na_value, coerce=False) + if not 
lib.is_scalar(other): if len(other) != len(self): # prevent improper broadcasting when other is 2D @@ -827,14 +854,15 @@ def _cmp_method(self, other, op): # for array-likes, first filter out NAs before converting to numpy if not is_array_like(other): - other = np.asarray(other) + other = np.asarray(other, dtype=dtype) other = other[valid] - other = np.asarray(other) + other = np.asarray(other, dtype=dtype) if op.__name__ in ops.ARITHMETIC_BINOPS: - result = np.empty_like(self._ndarray, dtype="object") + result = np.empty_like(self._ndarray, dtype=dtype) result[mask] = self.dtype.na_value - result[valid] = op(self._ndarray[valid], other) + res = op(self._ndarray[valid], other) + result[valid] = res return self._from_backing_data(result) else: # logical @@ -855,18 +883,19 @@ class StringArrayNumpySemantics(StringArray): _storage = "python" _na_value = np.nan - def _validate(self) -> None: + def _validate(self, values) -> None: """Validate that we only store NaN or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + if len(values) and not lib.is_string_array(values, skipna=True): raise ValueError( "StringArrayNumpySemantics requires a sequence of strings or NaN" ) - if self._ndarray.dtype != "object": + if values.dtype != "object" and values.dtype.kind != "T": raise ValueError( "StringArrayNumpySemantics requires a sequence of strings or NaN. Got " - f"'{self._ndarray.dtype}' dtype instead." + f"'{values.dtype}' dtype instead." 
) # TODO validate or force NA/None to NaN + return values @classmethod def _from_sequence( diff --git a/pandas/core/common.py b/pandas/core/common.py index ec0473a20458b..50f4997383ddb 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -255,7 +255,7 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi # has incompatible type "Iterable[Any]"; expected "Sized" return construct_1d_object_array_from_listlike(values) # type: ignore[arg-type] - if issubclass(result.dtype.type, str): + if result.dtype.kind == "U": result = np.asarray(values, dtype=object) if result.ndim == 2: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 665eb75953078..5d964d0c5cedb 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -735,6 +735,9 @@ def _sanitize_str_dtypes( # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. + + # TODO: Don't cast for numpy 2.0 StringDType and directly create + # StringArray? if issubclass(result.dtype.type, str): # GH#16605 # If not empty convert the data to dtype diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 086f7d2da6640..56ce521ac0d76 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -86,7 +86,7 @@ def _astype_nansafe( res = arr.astype(dtype, copy=copy) return np.asarray(res) - if issubclass(dtype.type, str): + if dtype.kind == "U": shape = arr.shape if arr.ndim > 1: arr = arr.ravel() @@ -97,10 +97,15 @@ def _astype_nansafe( elif np.issubdtype(arr.dtype, np.floating) and dtype.kind in "iu": return _astype_float_to_int_nansafe(arr, dtype, copy) - elif arr.dtype == object: + elif arr.dtype == object or arr.dtype.kind == "T": # if we have a datetime/timedelta array of objects # then coerce to datetime64[ns] and use DatetimeArray.astype + # array_to_timedelta64 doesn't support numpy stringdtype yet + # TODO: fix? 
+ if arr.dtype.kind == "T": + arr = arr.astype(object) + if lib.is_np_dtype(dtype, "M"): from pandas.core.arrays import DatetimeArray @@ -178,8 +183,8 @@ def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> Arra else: values = _astype_nansafe(values, dtype, copy=copy) - # in pandas we don't store numpy str dtypes, so convert to object - if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): + # in pandas we don't store the numpy.str_ dtype, so convert to object + if isinstance(dtype, np.dtype) and values.dtype.kind == "U": values = np.array(values, dtype=object) return values diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 3394bf091e228..64f3461ba39ba 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -623,7 +623,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): dtype = _dtype_obj return dtype, fill_value - if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in "iufcmM": + if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in "iufcmMT": dtype = ensure_dtype_can_hold_na(dtype) fv = na_value_for_dtype(dtype) return dtype, fv @@ -722,11 +722,13 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): # e.g. 
mst is np.complex128 and dtype is np.complex64 dtype = mst + elif is_string_dtype(dtype) and dtype.kind == "T": + pass else: dtype = np.dtype(np.object_) # in case we have a string that looked like a number - if issubclass(dtype.type, (bytes, str)): + if dtype.kind in "SU": dtype = np.dtype(np.object_) fill_value = _ensure_dtype_type(fill_value, dtype) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index bcf1ade9b0320..69616033bd204 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -17,6 +17,7 @@ Period, algos, lib, + missing as libmissing, ) from pandas._libs.tslibs import conversion from pandas.util._exceptions import find_stack_level @@ -549,7 +550,33 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: """ Faster alternative to is_string_dtype, assumes we have a np.dtype object. """ - return dtype == object or dtype.kind in "SU" + return dtype == object or dtype.kind in "SUT" + + +def get_numpy_string_dtype_instance( + na_object=libmissing.NA, coerce=False, possible_dtype=None +): + """Get a reference to a ``numpy.dtypes.StringDType`` instance. + + This is a convenience wrapper around the StringDType initializer + with convenient defaults chosen for use with Pandas. + + Parameters + ---------- + na_object : object + A missing data sentinel object. + coerce : bool + Whether or not non-strings entries in arrays should be converted + to strings. 
+ possible_dtype : numpy.dtype + Returned as the result if the dtype matches the provided settings + """ + if possible_dtype is not None: + possible_coerce = getattr(possible_dtype, "coerce", True) + possible_na = getattr(possible_dtype, "na_object", None) + if possible_coerce == coerce and possible_na is libmissing.NA: + return possible_dtype + return np.dtypes.StringDType(na_object=na_object, coerce=coerce) def is_string_dtype(arr_or_dtype) -> bool: @@ -1097,6 +1124,10 @@ def is_numeric_v_string_like(a: ArrayLike, b) -> bool: ) +def needs_object_conversion(dtype: DtypeObj | None) -> bool: + return dtype.char == "T" + + def needs_i8_conversion(dtype: DtypeObj | None) -> bool: """ Check whether the dtype should be converted to int64. @@ -1824,6 +1855,7 @@ def is_all_strings(value: ArrayLike) -> bool: "is_timedelta64_ns_dtype", "is_unsigned_integer_dtype", "needs_i8_conversion", + "needs_object_conversion", "pandas_dtype", "TD64NS_DTYPE", "validate_all_hashable", diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index c0587d36bcb5a..4ea2152958eda 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -309,7 +309,6 @@ def _from_values_or_dtype( >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2) CategoricalDtype(categories=['x', 'y'], ordered=False, categories_dtype=object) """ - if dtype is not None: # The dtype argument takes precedence over values.dtype (if any) if isinstance(dtype, str): diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index b9cd6ae2f13e8..567e44c1c6b86 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -261,6 +261,9 @@ def _isna_string_dtype(values: np.ndarray) -> npt.NDArray[np.bool_]: if dtype.kind in ("S", "U"): result = np.zeros(values.shape, dtype=bool) + elif dtype.kind == "T": + # StringDType's isnan loop checks for null strings + result = np.isnan(values) else: if values.ndim in {1, 2}: result = libmissing.isnaobj(values) 
@@ -455,6 +458,11 @@ def array_equivalent( # or `in ("O", "S", "U")` return _array_equivalent_object(left, right, strict_nan) + if is_string_or_object_np_dtype(left.dtype) or is_string_or_object_np_dtype( + right.dtype + ): + return _array_equivalent_object(left, right, strict_nan) + # NaNs can occur in float and complex arrays. if left.dtype.kind in "fc": if not (left.size and right.size): @@ -641,7 +649,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): if compat: return False return np.nan - return np.nan + return getattr(dtype, "na_object", np.nan) def remove_na_arraylike(arr: Series | Index | np.ndarray): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c8dbea1fd39ea..078be5b199e17 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -513,7 +513,7 @@ def __new__( if isinstance(data, ABCMultiIndex): data = data._values - if data.dtype.kind not in "iufcbmM": + if data.dtype.kind not in "iufcbmMT": # GH#11836 we need to avoid having numpy coerce # things that look like ints/floats to ints unless # they are actually ints, e.g. 
'0' and 0.0 diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index dfb96162f0ac1..7038973f6a458 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2156,7 +2156,7 @@ def maybe_coerce_values(values: ArrayLike) -> ArrayLike: if isinstance(values, np.ndarray): values = ensure_wrapped_if_datetimelike(values) - if issubclass(values.dtype.type, str): + if values.dtype.kind == "U": values = np.array(values, dtype=object) if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bade5fd1bdcf2..8f6de3bb6542f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2341,7 +2341,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list if isinstance(dtype, np.dtype): is_dtlike = dtype.kind in "mM" - if issubclass(dtype.type, (str, bytes)): + if dtype.kind in "SU": dtype = np.dtype(object) values, placement = _stack_arrays(tup_block, dtype) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 039d868bccd16..136c42651bfdb 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -37,6 +37,7 @@ is_numeric_v_string_like, is_object_dtype, needs_i8_conversion, + needs_object_conversion, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import ( @@ -872,9 +873,9 @@ def _fillna_prep( return mask -def _datetimelike_compat(func: F) -> F: +def _no_buffer_protocol_compat(func: F) -> F: """ - Wrapper to handle datetime64 and timedelta64 dtypes. 
+ Wrapper to handle dtypes that don't support the buffer protocol """ @wraps(func) @@ -893,13 +894,21 @@ def new_func( values.view("i8"), limit=limit, limit_area=limit_area, mask=mask ) return result.view(values.dtype), mask - + if needs_object_conversion(values.dtype): + if mask is None: + # This needs to occur before casting to int64 + mask = isna(values) + result, mask = func( + values.astype(object), limit=limit, limit_area=limit_area, mask=mask + ) + values[:] = result[:] + return result.astype(values.dtype), mask return func(values, limit=limit, limit_area=limit_area, mask=mask) return cast(F, new_func) -@_datetimelike_compat +@_no_buffer_protocol_compat def _pad_1d( values: np.ndarray, limit: int | None = None, @@ -913,7 +922,7 @@ def _pad_1d( return values, mask -@_datetimelike_compat +@_no_buffer_protocol_compat def _backfill_1d( values: np.ndarray, limit: int | None = None, @@ -927,7 +936,7 @@ def _backfill_1d( return values, mask -@_datetimelike_compat +@_no_buffer_protocol_compat def _pad_2d( values: np.ndarray, limit: int | None = None, @@ -943,7 +952,7 @@ def _pad_2d( return values, mask -@_datetimelike_compat +@_no_buffer_protocol_compat def _backfill_2d( values, limit: int | None = None, diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e775156a6ae2f..cf23b6cd27d59 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -154,8 +154,10 @@ def f( def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: - # Bottleneck chokes on datetime64, PeriodDtype (or and EA) - if dtype != object and not needs_i8_conversion(dtype): + if isinstance(dtype, type): + dtype = np.dtype(dtype) + # Bottleneck chokes on datetime64, numpy strings, PeriodDtype (or and EA) + if dtype != object and dtype.kind != "T" and not needs_i8_conversion(dtype): # GH 42878 # Bottleneck uses naive summation leading to O(n) loss of precision # unlike numpy which implements pairwise summation, which has O(log(n)) loss @@ -1007,6 +1009,7 @@ def nanvar( # observations. 
# # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count if axis is not None: avg = np.expand_dims(avg, axis) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 983a3df57e369..ac615963e3638 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -339,7 +339,9 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: # GH#36377 going through the numexpr path would incorrectly raise return invalid_comparison(lvalues, rvalues, op) - elif lvalues.dtype == object or isinstance(rvalues, str): + elif lvalues.dtype == object or ( + lvalues.dtype.kind != "T" and isinstance(rvalues, str) + ): res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index bdb88e981bcda..097b9fc0967f9 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -128,12 +128,14 @@ def _forbid_nonstring_types(func: F) -> F: @wraps(func) def wrapper(self, *args, **kwargs): - if self._inferred_dtype not in allowed_types: - msg = ( - f"Cannot use .str.{func_name} with values of " - f"inferred dtype '{self._inferred_dtype}'." - ) - raise TypeError(msg) + dtype = self._inferred_dtype + if dtype not in allowed_types: + if not (isinstance(dtype, np.dtype) and issubclass(dtype.type, str)): + msg = ( + f"Cannot use .str.{func_name} with values of " + f"inferred dtype '{self._inferred_dtype}'." 
+ ) + raise TypeError(msg) return func(self, *args, **kwargs) wrapper.__name__ = func_name @@ -249,9 +251,11 @@ def _validate(data): values = getattr(data, "categories", data) # categorical / normal - inferred_dtype = lib.infer_dtype(values, skipna=True) + inferred_dtype = lib.infer_dtype(values) - if inferred_dtype not in allowed_types: + if inferred_dtype not in allowed_types and not isinstance( + inferred_dtype, np.dtype + ): raise AttributeError("Can only use .str accessor with string values!") return inferred_dtype @@ -1899,9 +1903,7 @@ def zfill(self, width: int): if not is_integer(width): msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - f = lambda x: x.zfill(width) - result = self._data.array._str_map(f) - return self._wrap_result(result) + return self._wrap_result(self._data.array._str_zfill(width)) def slice(self, start=None, stop=None, step=None): """ diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 100afa956bd24..b77a3e6386559 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -18,6 +18,8 @@ from pandas.core.dtypes.missing import isna +from pandas.core.arrays.boolean import BooleanArray +from pandas.core.arrays.integer import IntegerArray from pandas.core.strings.base import BaseStringArrayMethods if TYPE_CHECKING: @@ -258,6 +260,7 @@ def _str_find_(self, sub, start, end, side): f = lambda x: getattr(x, method)(sub, start) else: f = lambda x: getattr(x, method)(sub, start, end) + return self._str_map(f, dtype="int64") def _str_findall(self, pat, flags: int = 0): @@ -486,3 +489,252 @@ def f(x): return empty_row return [f(val) for val in np.asarray(self)] + + def _str_zfill(self, width): + f = lambda x: x.zfill(width) + return self._str_map(f) + + +# Tries to use the numpy string ufuncs if possible +# Will fallback to the object string methods even if ufunc is available +# for cases where the .str accessor is called on an array 
with object dtype + + +class NumpyStringArrayMixin(ObjectStringArrayMixin): + def _str_endswith(self, pat, na=None) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_endswith(pat, na) + if isinstance(pat, tuple) or na is not None: + return super()._str_endswith(pat, na) + + pat = np.asarray( + pat, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + result = np.strings.endswith(self._ndarray, pat) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_find(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_find(sub, start, end) + sub = np.asarray( + sub, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.find(self._ndarray[~na_mask], sub, start, end) + res = IntegerArray(result, na_mask) + if self.dtype.na_value is not libmissing.NA: + # Cast to float64 if necessary + res = res.to_numpy() + return res + + def _str_rfind(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_rfind(sub, start, end) + + sub = np.asarray( + sub, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.rfind(self._ndarray[~na_mask], sub, start, end) + res = IntegerArray(result, na_mask) + if self.dtype.na_value is not libmissing.NA: + # Cast to float64 if necessary + res = res.to_numpy() + return res + + def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_index(sub, start, end) + + sub = np.asarray( + sub, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + na_mask = 
isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + if start is None: + start = 0 + result[~na_mask] = np.strings.index( + self._ndarray[~na_mask], sub, start=start, end=end + ) + res = IntegerArray(result, na_mask) + if self.dtype.na_value is not libmissing.NA: + # Cast to float64 if necessary + res = res.to_numpy() + return res + + def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_rindex(sub, start, end) + sub = np.asarray( + sub, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + if start is None: + start = 0 + result[~na_mask] = np.strings.rindex( + self._ndarray[~na_mask], sub, start=start, end=end + ) + res = IntegerArray(result, na_mask) + if self.dtype.na_value is not libmissing.NA: + # Cast to float64 if necessary + res = res.to_numpy() + return res + + def _str_isalnum(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isalnum() + result = np.strings.isalnum(self._ndarray) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_isalpha(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isalpha() + result = np.strings.isalpha(self._ndarray) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_isdigit(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isdigit() + result = np.strings.isdigit(self._ndarray) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_isdecimal(self) -> BooleanArray: + if self._ndarray.dtype 
== object: + return super()._str_isdecimal() + result = np.strings.isdecimal(self._ndarray) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_islower(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_islower() + result = np.strings.islower(self._ndarray) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_isnumeric(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isnumeric() + result = np.strings.isnumeric(self._ndarray) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_isspace(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isspace() + result = np.strings.isspace(self._ndarray) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_istitle(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_istitle() + result = np.strings.istitle(self._ndarray) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_isupper(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isupper() + result = np.strings.isupper(self._ndarray) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_len(self) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_len() + na_mask = isna(self._ndarray) + result = 
np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.str_len(self._ndarray[~na_mask]) + res = IntegerArray(result, na_mask) + if self.dtype.na_value is not libmissing.NA: + # Cast to float64 if necessary + res = res.to_numpy() + return res + + def _str_lstrip(self, to_strip=None): + if self._ndarray.dtype == object: + return super()._str_lstrip(to_strip) + if to_strip is not None: + to_strip = np.asarray( + to_strip, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + return np.strings.lstrip(self._ndarray, to_strip) + + def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): + if self._ndarray.dtype == object: + return super()._str_replace(pat, repl, n, case, flags, regex) + if regex or case is not None: + return super()._str_replace(pat, repl, n, case, flags, regex) + + pat = np.asarray( + pat, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + repl = np.asarray( + repl, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + return np.strings.replace(self._ndarray, pat, repl, n) + + def _str_rstrip(self, to_strip=None): + if self._ndarray.dtype == object: + return super()._str_rstrip(to_strip) + if to_strip is not None: + to_strip = np.asarray( + to_strip, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + return np.strings.rstrip(self._ndarray, to_strip) + + def _str_strip(self, to_strip=None): + if self._ndarray.dtype == object: + return super()._str_strip(to_strip) + if to_strip is not None: + to_strip = np.asarray( + to_strip, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + return np.strings.strip(self._ndarray, to_strip) + + def _str_startswith(self, pat, na=None) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_startswith(pat, na) + if isinstance(pat, tuple) or na is not None: + return super()._str_startswith(pat, na) + pat = np.asarray( + pat, dtype=np.dtypes.StringDType(na_object=self.dtype.na_value) + ) + result = 
np.strings.startswith(self._ndarray, pat) + res = BooleanArray(result, isna(self._ndarray)) + if self.dtype.na_value is not libmissing.NA: + res = res.to_numpy(na_value=self.dtype.na_value) + return res + + def _str_zfill(self, width): + if self._ndarray.dtype == object: + return super()._str_zfill(width) + return np.strings.zfill(self._ndarray, width) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index e120e69dc27cf..051a6403cb36c 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -316,6 +316,8 @@ def _hash_ndarray( # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. + if not vals.dtype.char == "O": + vals = vals.astype("object") if categorize: from pandas import ( Categorical, @@ -337,6 +339,9 @@ def _hash_ndarray( vals = hash_object_array( vals.astype(str).astype(object), hash_key, encoding ) + except ValueError: + # the dtype doesn't support the buffer protocol (e.g. 
StringDType) + vals = hash_object_array(vals.astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 91ad01a2fb0eb..708622f45f78a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -624,7 +624,7 @@ def test_memory_usage(dtype): series = pd.Series(["a", "b", "c"], dtype=dtype) - assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) + assert 0 < series.nbytes <= series.memory_usage() <= series.memory_usage(deep=True) @pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64]) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index bbd9b150b88a8..4a1105ae15e02 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -6,6 +6,7 @@ from pandas._config import using_string_dtype from pandas.compat import PYPY +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.common import ( is_dtype_equal, @@ -82,10 +83,10 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif( - PYPY or using_string_dtype(), + PYPY or using_string_dtype() or np_version_gt2, reason="not relevant for PyPy doesn't work properly for arrow strings", ) -def test_memory_usage(index_or_series_memory_obj): +def test_memory_usage(index_or_series_memory_obj, request): obj = index_or_series_memory_obj # Clear index caches so that len(obj) == 0 report 0 memory usage if isinstance(obj, Series): @@ -102,14 +103,14 @@ def test_memory_usage(index_or_series_memory_obj): is_categorical = isinstance(obj.dtype, pd.CategoricalDtype) or ( is_ser and isinstance(obj.index.dtype, pd.CategoricalDtype) ) - is_object_string = is_dtype_equal(obj, "string[python]") or ( + is_string_array = is_dtype_equal(obj, "string[python]") or ( is_ser and 
is_dtype_equal(obj.index.dtype, "string[python]") ) if len(obj) == 0: expected = 0 assert res_deep == res == expected - elif is_object or is_categorical or is_object_string: + elif is_object or is_categorical or is_string_array: # only deep will pick them up assert res_deep > res else: diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index bb238d08bd9bd..5fb4e5a2e8ab8 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas import ( DataFrame, Series, @@ -120,7 +122,11 @@ def test_dataframe_array_ea_dtypes(): def test_dataframe_array_string_dtype(): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) - assert np.shares_memory(arr, get_array(df, "a")) + if not np_version_gt2: + # Numpy 2.0 will return an object array in __array__ + # despite there actually being a StringArray backing the df + # for backwards compatibility reasons + assert np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is False diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index de56d5e4a07ee..cd0ec3e02b353 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -6,6 +6,7 @@ from pandas._config import using_string_dtype from pandas.compat import HAS_PYARROW +from pandas.compat.numpy import np_version_gt2 from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -85,6 +86,10 @@ def test_astype_numpy_to_ea(): assert np.shares_memory(get_array(ser), get_array(result)) +@pytest.mark.skipif( + np_version_gt2, + reason="When numpy 2.0 is available, StringArray is not backed by object array", +) @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -98,6 +103,10 @@ def test_astype_string_and_object(dtype, 
new_dtype): tm.assert_frame_equal(df, df_orig) +@pytest.mark.skipif( + np_version_gt2, + reason="When numpy 2.0 is available, StringArray is not backed by object array", +) @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -220,7 +229,11 @@ def test_convert_dtypes(): df_orig = df.copy() df2 = df.convert_dtypes() - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if not np_version_gt2: + # With numpy 2.0, StringArray will no longer be backed by an object array + # but a numpy StringDType backed array + # so this equivalence doesn't hold anymore + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0176a36fe78d7..676d63a1f1f1d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -3099,7 +3099,7 @@ def test_from_dict_with_columns_na_scalar(self): {"a": ["a", "b", "c"], "b": [1.0, 2.0, 3.0], "c": ["d", "e", "f"]}, ], ) - def test_np_string_array_object_cast(self, data): + def test_np_string_array(self, data): from numpy.dtypes import StringDType data["a"] = np.array(data["a"], dtype=StringDType()) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index ace78d77350cb..a28c286f025f1 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -124,6 +124,10 @@ def test_numpy_ufuncs_other(index, func): with tm.external_error_raised(TypeError): func(index) + elif index.dtype == "string[python]" and func is np.isnan: + with tm.external_error_raised(ValueError): + func(index) + elif is_numeric_dtype(index) and not ( is_complex_dtype(index) and func is np.signbit ): 
diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 7bbb902e14a36..d6c10c6e7eca2 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -201,10 +201,10 @@ def test_mean_dont_convert_j_to_complex(): with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric|does not support" + msg = "Could not convert string 'J' to numeric|does not support|Cannot pass" with pytest.raises(TypeError, match=msg): df["db"].mean() - msg = "Could not convert string 'J' to numeric|ufunc 'divide'" + msg = "Could not convert string 'J' to numeric|ufunc 'divide'|Cannot pass" with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array)