diff --git a/arkouda/numpy/_typing/_typing.py b/arkouda/numpy/_typing/_typing.py index 0965c6e6511..a0f6748c5b7 100644 --- a/arkouda/numpy/_typing/_typing.py +++ b/arkouda/numpy/_typing/_typing.py @@ -28,7 +28,9 @@ None, ] -StringDTypeTypes: TypeAlias = _Union[Literal["str", "str_"], type[str_], type[str], type[Strings]] +StringDTypeTypes: TypeAlias = _Union[ + Literal["str", "str_", "string"], type[str_], type[str], type[Strings] +] _ArrayLikeNum: TypeAlias = _Union[ np.ndarray, # keeps it simple; or list your NDArray[...] @@ -48,6 +50,7 @@ type[Strings], ] + _NumericLikeDType: TypeAlias = _Union[ # string literals for common names Literal[ @@ -88,4 +91,4 @@ def is_string_dtype_hint(x: object) -> TypeGuard["_StringDType"]: # accept the spellings you want to map to Arkouda Strings - return x in ("str", "str_") or x is str_ or x is str_ or x is Strings + return x in ("str", "str_", "string") or x is str_ or x is str or x is Strings diff --git a/arkouda/numpy/dtypes.py b/arkouda/numpy/dtypes.py index 8eebcfd71a7..426f389853a 100644 --- a/arkouda/numpy/dtypes.py +++ b/arkouda/numpy/dtypes.py @@ -246,7 +246,7 @@ def dtype(x): return bigint() # ---- String dtype spellings ---- - if isinstance(x, str) and x.lower() in {"str", "str_", "Strings", "strings"}: + if isinstance(x, str) and x.lower() in {"str", "str_", "Strings", "strings", "string"}: return np.dtype(np.str_) if x in (str, np.str_): return np.dtype(np.str_) diff --git a/arkouda/pandas/extension/_arkouda_array.py b/arkouda/pandas/extension/_arkouda_array.py index 56eabafc226..17c0b75782f 100644 --- a/arkouda/pandas/extension/_arkouda_array.py +++ b/arkouda/pandas/extension/_arkouda_array.py @@ -1,14 +1,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Sequence, TypeVar +from typing import TYPE_CHECKING, Any, Sequence, TypeVar, Union, cast, overload from typing import cast as type_cast import numpy as np from numpy import ndarray -from pandas.api.extensions import ExtensionArray - 
-from arkouda.numpy.dtypes import dtype as ak_dtype +from numpy.typing import NDArray +from pandas.core.arrays.base import ExtensionArray +from pandas.core.dtypes.dtypes import ExtensionDtype from ._arkouda_extension_array import ArkoudaExtensionArray from ._dtypes import ( @@ -166,27 +166,111 @@ def __setitem__(self, key, value): self._data[key] = value - def astype(self, dtype, copy: bool = False): - # Always hand back a real object-dtype ndarray when object is requested - if dtype in (object, np.object_, "object", np.dtype("O")): - return self.to_ndarray().astype(object, copy=copy) + @overload + def astype(self, dtype: np.dtype[Any], copy: bool = True) -> NDArray[Any]: ... - if isinstance(dtype, _ArkoudaBaseDtype): - dtype = dtype.numpy_dtype + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = True) -> ExtensionArray: ... + + @overload + def astype(self, dtype: Any, copy: bool = True) -> Union[ExtensionArray, NDArray[Any]]: ... + + def astype( + self, + dtype: Any, + copy: bool = True, + ) -> Union[ExtensionArray, NDArray[Any]]: + """ + Cast the array to a specified dtype. + + Casting rules: + + * If ``dtype`` requests ``object``, returns a NumPy ``NDArray[Any]`` of + dtype ``object`` containing the array values. + * Otherwise, the target dtype is normalized using Arkouda's dtype + resolution rules. + * If the normalized dtype matches the current dtype and ``copy=False``, + returns ``self``. + * In all other cases, casts the underlying Arkouda array to the target + dtype and returns an Arkouda-backed ``ArkoudaExtensionArray``. + + Parameters + ---------- + dtype : Any + Target dtype. May be a NumPy dtype, pandas dtype, Arkouda dtype, + or any dtype-like object accepted by Arkouda. + copy : bool + Whether to force a copy when the target dtype matches the current dtype. + Default is True. + + Returns + ------- + Union[ExtensionArray, NDArray[Any]] + The cast result. 
Returns a NumPy array only when casting to ``object``; + otherwise returns an Arkouda-backed ExtensionArray. + + Examples + -------- + Basic numeric casting returns an Arkouda-backed array: + + >>> import arkouda as ak + >>> from arkouda.pandas.extension import ArkoudaArray + >>> a = ArkoudaArray(ak.array([1, 2, 3], dtype="int64")) + >>> a.astype("float64").to_ndarray() + array([1., 2., 3.]) + + Casting to the same dtype with ``copy=False`` returns the original object: + + >>> b = a.astype("int64", copy=False) + >>> b is a + True + + Forcing a copy when the dtype is unchanged returns a new array: + + >>> c = a.astype("int64", copy=True) + >>> c is a + False + >>> c.to_ndarray() + array([1, 2, 3]) + + Casting to ``object`` materializes the data to a NumPy array: + + >>> a.astype(object) + array([1, 2, 3], dtype=object) + + NumPy and pandas dtype objects are also accepted: + + >>> import numpy as np + >>> a.astype(np.dtype("bool")).to_ndarray() + array([ True, True, True]) + """ + from arkouda.numpy.dtypes import dtype as ak_dtype + + # --- 1) ExtensionDtype branch (satisfies overload #2) --- + if isinstance(dtype, ExtensionDtype): + # pandas extension dtypes typically have .numpy_dtype + if hasattr(dtype, "numpy_dtype"): + dtype = dtype.numpy_dtype + + dtype = ak_dtype(dtype) - # Server-side cast for numeric/bool - try: - npdt = np.dtype(dtype) - except Exception: - return self.to_ndarray().astype(dtype, copy=copy) + if copy is False and self.dtype.numpy_dtype == dtype: + return self + + casted = self._data.astype(dtype) + return cast(ExtensionArray, ArkoudaExtensionArray._from_sequence(casted)) + + # --- 2) object -> numpy (satisfies overload #1 / general) --- + if dtype in (object, np.object_, "object", np.dtype("O")): + return self.to_ndarray().astype(object, copy=copy) - from arkouda.numpy.numeric import cast as ak_cast + dtype = ak_dtype(dtype) - if npdt.kind in {"i", "u", "f", "b"}: - return type(self)(ak_cast(self._data, ak_dtype(npdt.name))) + if copy is 
False and self.dtype.numpy_dtype == dtype: + return self - # Fallback: local cast - return self.to_ndarray().astype(npdt, copy=copy) + casted = self._data.astype(dtype) + return ArkoudaExtensionArray._from_sequence(casted) def isna(self) -> ExtensionArray | ndarray[Any, Any]: from arkouda.numpy import isnan diff --git a/arkouda/pandas/extension/_arkouda_categorical_array.py b/arkouda/pandas/extension/_arkouda_categorical_array.py index 8e10e427a67..63469012171 100644 --- a/arkouda/pandas/extension/_arkouda_categorical_array.py +++ b/arkouda/pandas/extension/_arkouda_categorical_array.py @@ -1,16 +1,20 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Sequence, TypeVar +from typing import TYPE_CHECKING, Any, Sequence, TypeVar, Union, cast, overload import numpy as np # new from numpy import ndarray -from pandas.api.extensions import ExtensionArray +from numpy.typing import NDArray +from pandas import CategoricalDtype as pd_CategoricalDtype +from pandas.core.arrays.base import ExtensionArray +from pandas.core.dtypes.dtypes import ExtensionDtype import arkouda as ak from ._arkouda_array import ArkoudaArray from ._arkouda_extension_array import ArkoudaExtensionArray +from ._arkouda_string_array import ArkoudaStringArray from ._dtypes import ArkoudaCategoricalDtype @@ -84,8 +88,125 @@ def __getitem__(self, idx): return self._data[idx] return ArkoudaCategoricalArray(self._data[idx]) - def astype(self, x, dtype): - raise NotImplementedError("array_api.astype is not implemented in Arkouda yet") + @overload + def astype(self, dtype: np.dtype[Any], copy: bool = True) -> NDArray[Any]: ... + + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = True) -> ExtensionArray: ... + + @overload + def astype(self, dtype: Any, copy: bool = True) -> Union[ExtensionArray, NDArray[Any]]: ... + + def astype( + self, + dtype: Any, + copy: bool = False, + ) -> Union[ExtensionArray, NDArray[Any]]: + """ + Cast to a specified dtype. 
+ + * If ``dtype`` is categorical (pandas ``category`` / ``CategoricalDtype`` / + ``ArkoudaCategoricalDtype``), returns an Arkouda-backed + ``ArkoudaCategoricalArray`` (optionally copied). + * If ``dtype`` requests ``object``, returns a NumPy ``ndarray`` of dtype object + containing the category labels (materialized to the client). + * If ``dtype`` requests a string dtype, returns an Arkouda-backed + ``ArkoudaStringArray`` containing the labels as strings. + * Otherwise, casts the labels (as strings) to the requested dtype and returns an + Arkouda-backed ExtensionArray. + + Parameters + ---------- + dtype : Any + Target dtype. + copy : bool + Whether to force a copy when possible. If categorical-to-categorical and + ``copy=True``, attempts to copy the underlying Arkouda ``Categorical`` (if + supported). Default is False. + + Returns + ------- + Union[ExtensionArray, NDArray[Any]] + The cast result. Returns a NumPy array only when casting to ``object``; + otherwise returns an Arkouda-backed ExtensionArray. 
+ + Examples + -------- + Casting to ``category`` returns an Arkouda-backed categorical array: + + >>> import arkouda as ak + >>> from arkouda.pandas.extension import ArkoudaCategoricalArray + >>> c = ArkoudaCategoricalArray(ak.Categorical(ak.array(["x", "y", "x"]))) + >>> out = c.astype("category") + >>> out is c + True + + Forcing a copy when casting to the same categorical dtype returns a new array: + + >>> out2 = c.astype("category", copy=True) + >>> out2 is c + False + >>> out2.to_ndarray() + array(['x', 'y', 'x'], dtype='<U1') + + Casting to ``object`` materializes the category labels to a NumPy array: + + >>> c.astype(object) + array(['x', 'y', 'x'], dtype=object) + + Casting to a string dtype returns an Arkouda-backed string array of labels: + + >>> s = c.astype("string") + >>> s.to_ndarray() + array(['x', 'y', 'x'], dtype='<U1') + + Casting to any other dtype casts the labels (as strings) to the requested dtype: + + >>> c_num = ArkoudaCategoricalArray(ak.Categorical(ak.array(["1", "2", "3"]))) + >>> a = c_num.astype("int64") + >>> a.to_ndarray() + array([1, 2, 3]) + """ + from arkouda.numpy._typing._typing import is_string_dtype_hint + + # --- 1) ExtensionDtype branch first: proves overload #2 returns ExtensionArray --- + if isinstance(dtype, ExtensionDtype): + if hasattr(dtype, "numpy_dtype"): + dtype = dtype.numpy_dtype + + if isinstance(dtype, (ArkoudaCategoricalDtype, pd_CategoricalDtype)) or dtype in ( + "category", + ): + if not copy: + return self + data = self._data.copy() if hasattr(self._data, "copy") else self._data + return cast(ExtensionArray, type(self)(data)) + + data = self._data.to_strings() + + if is_string_dtype_hint(dtype): + return cast(ExtensionArray, ArkoudaStringArray._from_sequence(data)) + + casted = data.astype(dtype) + return cast(ExtensionArray, ArkoudaExtensionArray._from_sequence(casted)) + + # --- 2) object -> numpy --- + if dtype in (object, np.object_, "object", np.dtype("O")): + return self.to_ndarray().astype(object, copy=copy) + + if isinstance(dtype, (ArkoudaCategoricalDtype, pd_CategoricalDtype)) or dtype in ("category",): + if not copy: + return self + data = self._data.copy() if 
hasattr(self._data, "copy") else self._data + return type(self)(data) + + data = self._data.to_strings() + + if is_string_dtype_hint(dtype): + return ArkoudaStringArray._from_sequence(data) + + casted = data.astype(dtype) + return ArkoudaExtensionArray._from_sequence(casted) def isna(self): return ak.zeros(self._data.size, dtype=ak.bool) diff --git a/arkouda/pandas/extension/_arkouda_extension_array.py b/arkouda/pandas/extension/_arkouda_extension_array.py index 5cda4787a36..da554a9d501 100644 --- a/arkouda/pandas/extension/_arkouda_extension_array.py +++ b/arkouda/pandas/extension/_arkouda_extension_array.py @@ -261,9 +261,7 @@ def _from_sequence( from arkouda.numpy.pdarraycreation import array as ak_array from arkouda.numpy.strings import Strings from arkouda.pandas.categorical import Categorical - from arkouda.pandas.extension._arkouda_array import ArkoudaArray - from arkouda.pandas.extension._arkouda_categorical_array import ArkoudaCategoricalArray - from arkouda.pandas.extension._arkouda_string_array import ArkoudaStringArray + from arkouda.pandas.extension import ArkoudaArray, ArkoudaCategoricalArray, ArkoudaStringArray # Fast path: already an Arkouda column. Pick the matching subclass. 
if isinstance(scalars, pdarray): diff --git a/arkouda/pandas/extension/_arkouda_string_array.py b/arkouda/pandas/extension/_arkouda_string_array.py index 1bdf4bef020..9995703dfa2 100644 --- a/arkouda/pandas/extension/_arkouda_string_array.py +++ b/arkouda/pandas/extension/_arkouda_string_array.py @@ -1,12 +1,15 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Sequence, TypeVar +from typing import TYPE_CHECKING, Any, Sequence, TypeVar, Union, cast, overload from typing import cast as type_cast import numpy as np from numpy import ndarray -from pandas.api.extensions import ExtensionArray +from numpy.typing import NDArray +from pandas import StringDtype as pd_StringDtype +from pandas.core.arrays.base import ExtensionArray +from pandas.core.dtypes.dtypes import ExtensionDtype from arkouda.numpy.dtypes import str_ from arkouda.pandas.extension import ArkoudaArray @@ -86,11 +89,119 @@ def __getitem__(self, key): return result return ArkoudaStringArray(result) - def astype(self, dtype, copy: bool = False): + @overload + def astype(self, dtype: np.dtype[Any], copy: bool = True) -> NDArray[Any]: ... + + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = True) -> ExtensionArray: ... + + @overload + def astype(self, dtype: Any, copy: bool = True) -> Union[ExtensionArray, NDArray[Any]]: ... + + def astype( + self, + dtype: Any, + copy: bool = False, + ) -> Union[ExtensionArray, NDArray[Any]]: + """ + Cast to a specified dtype. + + Casting rules: + + * If ``dtype`` requests ``object``, returns a NumPy ``NDArray[Any]`` of dtype + ``object`` containing the string values. + * If ``dtype`` is a string dtype (e.g. pandas ``StringDtype``, NumPy unicode, + or Arkouda string dtype), returns an ``ArkoudaStringArray``. If ``copy=True``, + attempts to copy the underlying Arkouda ``Strings`` data. 
+ * For all other dtypes, casts the underlying Arkouda ``Strings`` using + ``Strings.astype`` and returns an Arkouda-backed ``ArkoudaExtensionArray`` + constructed from the result. + + Parameters + ---------- + dtype : Any + Target dtype. May be a NumPy dtype, pandas dtype, or Arkouda dtype. + copy : bool + Whether to force a copy when the result is an ``ArkoudaStringArray``. + Default is False. + + Returns + ------- + Union[ExtensionArray, NDArray[Any]] + The cast result. Returns a NumPy array only when casting to ``object``; + otherwise returns an Arkouda-backed ExtensionArray. + + Examples + -------- + Casting to a string dtype returns an Arkouda-backed string array: + + >>> import arkouda as ak + >>> from arkouda.pandas.extension import ArkoudaStringArray + >>> s = ArkoudaStringArray(ak.array(["a", "b", "c"])) + >>> out = s.astype("string") + >>> out is s + True + + Forcing a copy when casting to a string dtype returns a new array: + + >>> out2 = s.astype("string", copy=True) + >>> out2 is s + False + >>> out2.to_ndarray() + array(['a', 'b', 'c'], dtype='<U1') + + Casting to ``object`` materializes the string values to a NumPy array: + + >>> s.astype(object) + array(['a', 'b', 'c'], dtype=object) + + Casting to a non-string dtype uses Arkouda to cast the underlying strings + and returns an Arkouda-backed ExtensionArray: + + >>> s_num = ArkoudaStringArray(ak.array(["1", "2", "3"])) + >>> a = s_num.astype("int64") + >>> a.to_ndarray() + array([1, 2, 3]) + + NumPy and pandas dtype objects are also accepted: + + >>> import numpy as np + >>> a = s_num.astype(np.dtype("float64")) + >>> a.to_ndarray() + array([1., 2., 3.]) + """ + from arkouda.numpy._typing._typing import is_string_dtype_hint + from arkouda.numpy.dtypes import dtype as ak_dtype + + # --- 1) ExtensionDtype branch first (satisfies overload #2) --- + if isinstance(dtype, ExtensionDtype): + if hasattr(dtype, "numpy_dtype"): + dtype = dtype.numpy_dtype + + if isinstance(dtype, pd_StringDtype) or is_string_dtype_hint(dtype): + if not copy: + return self + data = self._data.copy() if 
hasattr(self._data, "copy") else self._data + return cast(ExtensionArray, type(self)(data)) + + dtype = ak_dtype(dtype) + casted = self._data.astype(dtype) + return cast(ExtensionArray, ArkoudaExtensionArray._from_sequence(casted)) + + # --- 2) object -> numpy (satisfies overload #1 / general) --- if dtype in (object, np.object_, "object", np.dtype("O")): return self.to_ndarray().astype(object, copy=copy) - # Let pandas do the rest locally - return self.to_ndarray().astype(dtype, copy=copy) + + # string targets -> stay string EA + if isinstance(dtype, pd_StringDtype) or is_string_dtype_hint(dtype): + if not copy: + return self + data = self._data.copy() if hasattr(self._data, "copy") else self._data + return type(self)(data) + + dtype = ak_dtype(dtype) + casted = self._data.astype(dtype) + return ArkoudaExtensionArray._from_sequence(casted) def isna(self): from arkouda.numpy.pdarraycreation import zeros diff --git a/arkouda/pandas/extension/_dtypes.py b/arkouda/pandas/extension/_dtypes.py index 2c6881bc819..e881ab61718 100644 --- a/arkouda/pandas/extension/_dtypes.py +++ b/arkouda/pandas/extension/_dtypes.py @@ -41,7 +41,7 @@ from __future__ import annotations -from typing import Any +from typing import Any, TypeAlias, Union import numpy as np @@ -589,3 +589,13 @@ def construct_array_type(cls): from ._arkouda_categorical_array import ArkoudaCategoricalArray return ArkoudaCategoricalArray + + +arkouda_numeric_extension_dtypes: TypeAlias = Union[ + ArkoudaInt64Dtype, + ArkoudaFloat64Dtype, + ArkoudaBigintDtype, + ArkoudaUint8Dtype, + ArkoudaUint64Dtype, + ArkoudaUint8Dtype, +] diff --git a/tests/pandas/extension/arkouda_array_extension.py b/tests/pandas/extension/arkouda_array_extension.py index 0de5df06d12..31f6cc3ebe1 100644 --- a/tests/pandas/extension/arkouda_array_extension.py +++ b/tests/pandas/extension/arkouda_array_extension.py @@ -7,9 +7,15 @@ from arkouda import numeric_and_bool_scalars from arkouda.numpy.pdarrayclass import pdarray from 
arkouda.numpy.pdarraycreation import array as ak_array -from arkouda.pandas.extension import ArkoudaCategoricalArray, ArkoudaStringArray -from arkouda.pandas.extension._arkouda_array import ArkoudaArray -from arkouda.pandas.extension._dtypes import ArkoudaBoolDtype, ArkoudaFloat64Dtype, ArkoudaInt64Dtype +from arkouda.pandas.extension import ( + ArkoudaArray, + ArkoudaBoolDtype, + ArkoudaCategoricalArray, + ArkoudaExtensionArray, + ArkoudaFloat64Dtype, + ArkoudaInt64Dtype, + ArkoudaStringArray, +) from arkouda.testing import assert_equivalent @@ -23,6 +29,16 @@ def base_arr(self): data = ak.array([10, 20, 30, 40, 50]) return ArkoudaArray(data) + def test_array_extension_docstrings(self): + import doctest + + from arkouda.pandas.extension import _arkouda_array + + result = doctest.testmod( + _arkouda_array, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE + ) + assert result.failed == 0, f"Doctest failed: {result.failed} failures" + def test_copy_shallow_creates_new_wrapper_but_shares_data(self, ea): """ deep=False should: @@ -41,16 +57,6 @@ def test_copy_shallow_creates_new_wrapper_but_shares_data(self, ea): # Values are equal np.testing.assert_array_equal(shallow.to_numpy(), ea.to_numpy()) - def test_array_extension_docstrings(self): - import doctest - - from arkouda.pandas.extension import _arkouda_array - - result = doctest.testmod( - _arkouda_array, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE - ) - assert result.failed == 0, f"Doctest failed: {result.failed} failures" - def test_constructor_from_pdarray(self): arr = ArkoudaArray(ak.arange(5)) assert isinstance(arr, ArkoudaArray) @@ -206,14 +212,6 @@ def test_to_numpy(self): assert isinstance(np_arr, np.ndarray) assert np_arr.tolist() == [0, 1, 2, 3, 4] - def test_astype(self): - ak_data = ak.arange(10) - arr = ArkoudaArray(ak_data) - casted = arr.astype(np.float64) - assert isinstance(casted, ArkoudaArray) - assert isinstance(casted._data, pdarray) - assert casted._data.dtype == 
np.float64 - def test_equals_true(self): ak_data = ak.arange(10) arr1 = ArkoudaArray(ak_data) @@ -428,6 +426,78 @@ def test_copy_default_behaves_like_deep_true(self, ea): np.testing.assert_array_equal(default_copy.to_numpy(), ea.to_numpy()) +class TestArkoudaArrayAsType: + def test_arkouda_array_astype_object_returns_numpy_object_array(self): + a = ArkoudaArray(ak.array([1, 2, 3], dtype="int64")) + out = a.astype(object) + + assert isinstance(out, np.ndarray) + assert out.dtype == object + assert out.tolist() == [1, 2, 3] + + def test_arkouda_array_astype_same_dtype_copy_false_returns_self(self): + a = ArkoudaArray(ak.array([1, 2, 3], dtype="int64")) + out = a.astype("int64", copy=False) + + assert out is a + + def test_arkouda_array_astype_same_dtype_copy_true_returns_new_array(self): + a = ArkoudaArray(ak.array([1, 2, 3], dtype="int64")) + out = a.astype("int64", copy=True) + + assert isinstance(out, ArkoudaExtensionArray) + assert out is not a + np.testing.assert_array_equal(out.to_ndarray(), np.array([1, 2, 3], dtype=np.int64)) + + @pytest.mark.parametrize( + "src_dtype, target_dtype, values", + [ + ("int64", "float64", [1, 2, 3]), + ("float64", "int64", [1.2, 2.0, 3.7]), + ("int64", "bool", [0, 1, 2]), + ("bool", "int64", [True, False, True]), + ], + ) + def test_arkouda_array_astype_casts_and_returns_extension_array( + self, src_dtype, target_dtype, values + ): + a = ArkoudaArray(ak.array(values, dtype=src_dtype)) + out = a.astype(target_dtype) + + # Should return an Arkouda-backed EA, not NumPy (unless object) + assert isinstance(out, ArkoudaExtensionArray) + assert not isinstance(out, np.ndarray) + + expected = np.array(values, dtype=np.dtype(src_dtype)).astype(np.dtype(target_dtype)) + np.testing.assert_array_equal(out.to_ndarray(), expected) + + @pytest.mark.parametrize( + "src_dtype, target_ext_dtype, values, expected_dtype", + [ + ("int64", pd.Int64Dtype(), [1, 2, 3], np.int64), + ("int64", pd.BooleanDtype(), [0, 1, 2], np.bool_), + ], + ) + def 
test_arkouda_array_astype_extensiondtype_casts_and_returns_extension_array( + self, src_dtype, target_ext_dtype, values, expected_dtype + ): + a = ArkoudaArray(ak.array(values, dtype=src_dtype)) + out = a.astype(target_ext_dtype) + + assert isinstance(out, ArkoudaExtensionArray) + assert not isinstance(out, np.ndarray) + + expected = np.array(values, dtype=np.dtype(src_dtype)).astype(np.dtype(expected_dtype)) + np.testing.assert_array_equal(out.to_ndarray(), expected) + + def test_arkouda_array_astype_accepts_numpy_dtype_objects(self): + a = ArkoudaArray(ak.array([1, 2, 3], dtype="int64")) + out = a.astype(np.dtype("float64")) + + assert isinstance(out, ArkoudaExtensionArray) + np.testing.assert_array_equal(out.to_ndarray(), np.array([1, 2, 3], dtype=np.float64)) + + class TestArkoudaArrayEq: def test_eq_arkouda_array_same_length_all_equal(self): left = ArkoudaArray(ak.arange(5)) diff --git a/tests/pandas/extension/arkouda_categorical_extension.py b/tests/pandas/extension/arkouda_categorical_extension.py index 6a5c05eee76..0b8a6fdaa0f 100644 --- a/tests/pandas/extension/arkouda_categorical_extension.py +++ b/tests/pandas/extension/arkouda_categorical_extension.py @@ -6,8 +6,14 @@ from arkouda.numpy.pdarraycreation import array as ak_array from arkouda.pandas.categorical import Categorical -from arkouda.pandas.extension import ArkoudaArray, ArkoudaCategoricalArray, ArkoudaCategoricalDtype -from arkouda.testing import assert_equivalent +from arkouda.pandas.extension import ( + ArkoudaArray, + ArkoudaCategoricalArray, + ArkoudaCategoricalDtype, + ArkoudaExtensionArray, + ArkoudaStringArray, +) +from arkouda.testing import assert_equal, assert_equivalent class TestArkoudaCategoricalExtension: @@ -139,6 +145,80 @@ def test_take_categorical_scaling(self, prob_size): assert_equivalent(arr.take(idx1)._data.to_strings(), s.take(idx1.to_ndarray()).to_numpy()) +class TestArkoudaCategoricalArrayAsType: + def test_categorical_array_astype_category_stays_extension( + self, + ): 
+ c = ArkoudaCategoricalArray(ak.Categorical(ak.array(["x", "y", "x"]))) + out = c.astype("category") + assert isinstance(out, ArkoudaCategoricalArray) + assert_equal(out._data, c._data) + + def test_categorical_array_astype_object_returns_numpy_labels( + self, + ): + c = ArkoudaCategoricalArray(ak.Categorical(ak.array(["x", "y", "x"]))) + out = c.astype(object) + assert isinstance(out, np.ndarray) + assert out.dtype == object + assert out.tolist() == ["x", "y", "x"] + + @pytest.mark.parametrize("dtype", ["string", "str", "str_"]) + def test_categorical_array_astype_string_targets_return_string_array(self, dtype): + c = ArkoudaCategoricalArray(ak.Categorical(ak.array(["x", "y", "x"]))) + out = c.astype(dtype) + assert isinstance(out, ArkoudaStringArray) + assert out.to_ndarray().tolist() == ["x", "y", "x"] + + def test_categorical_array_astype_other_returns_extension_array_not_numpy(self): + # New behavior: does NOT fall back to NumPy; returns an Arkouda-backed EA + c = ArkoudaCategoricalArray(ak.Categorical(ak.array(["1", "2", "3"]))) + out = c.astype("int64") + + assert isinstance(out, ArkoudaExtensionArray) + assert not isinstance(out, np.ndarray) + + # Values should match numeric cast of labels + np.testing.assert_array_equal(out.to_ndarray(), np.array([1, 2, 3], dtype=np.int64)) + + def test_categorical_array_astype_other_uses_labels_once(self): + # (Optional sanity) ensure it is casting labels, not codes/categories + c = ArkoudaCategoricalArray(ak.Categorical(ak.array(["10", "20", "10"]))) + out = c.astype("int64") + np.testing.assert_array_equal(out.to_ndarray(), np.array([10, 20, 10], dtype=np.int64)) + + +def test_categorical_array_astype_extensiondtype_categoricaldtype_copy_false_returns_self(): + c = ArkoudaCategoricalArray(ak.Categorical(ak.array(["x", "y", "x"]))) + out = c.astype(pd.CategoricalDtype(), copy=False) + assert out is c + + def test_categorical_array_astype_extensiondtype_categoricaldtype_copy_true_returns_new_array(self): + c = 
ArkoudaCategoricalArray(ak.Categorical(ak.array(["x", "y", "x"]))) + out = c.astype(pd.CategoricalDtype(), copy=True) + + assert isinstance(out, ArkoudaCategoricalArray) + assert out is not c + assert out.to_ndarray().tolist() == ["x", "y", "x"] + + def test_categorical_array_astype_extensiondtype_stringdtype_returns_string_array(self): + c = ArkoudaCategoricalArray(ak.Categorical(ak.array(["x", "y", "x"]))) + out = c.astype(pd.StringDtype()) # ExtensionDtype path + + assert isinstance(out, ArkoudaStringArray) + assert out.to_ndarray().tolist() == ["x", "y", "x"] + + def test_categorical_array_astype_extensiondtype_numeric_casts_labels_and_returns_extension_array( + self, + ): + c = ArkoudaCategoricalArray(ak.Categorical(ak.array(["1", "2", "3"]))) + out = c.astype(pd.Int64Dtype()) # ExtensionDtype path + + assert isinstance(out, ArkoudaExtensionArray) + assert not isinstance(out, np.ndarray) + np.testing.assert_array_equal(out.to_ndarray(), np.array([1, 2, 3], dtype=np.int64)) + + class TestArkoudaCategoricalArrayEq: def _make(self, values): """Helper to construct an ArkoudaCategoricalArray from Python/NumPy values.""" diff --git a/tests/pandas/extension/arkouda_strings_extension.py b/tests/pandas/extension/arkouda_strings_extension.py index 92a5466f171..d6f4a1bee55 100644 --- a/tests/pandas/extension/arkouda_strings_extension.py +++ b/tests/pandas/extension/arkouda_strings_extension.py @@ -5,7 +5,12 @@ import arkouda as ak from arkouda.numpy.pdarraycreation import array as ak_array -from arkouda.pandas.extension import ArkoudaArray, ArkoudaStringArray, ArkoudaStringDtype +from arkouda.pandas.extension import ( + ArkoudaArray, + ArkoudaExtensionArray, + ArkoudaStringArray, + ArkoudaStringDtype, +) from arkouda.testing import assert_equivalent @@ -117,6 +122,89 @@ def test_take_strings_scaling(self, prob_size): assert_equivalent(arr.take(idx1)._data, s.take(idx1.to_ndarray()).to_numpy()) +class TestArkoudaStringArrayAsType: + def 
test_string_array_astype_object_returns_numpy_object_array(self): + s = ArkoudaStringArray(ak.array(["a", "b", "c"])) + out = s.astype(object) + + assert isinstance(out, np.ndarray) + assert out.dtype == object + assert out.tolist() == ["a", "b", "c"] + + @pytest.mark.parametrize("dtype", ["string", "str", "str_", str, np.str_, pd.StringDtype()]) + def test_string_array_astype_string_targets_stay_string_array(self, dtype): + s = ArkoudaStringArray(ak.array(["a", "b", "c"])) + + out = s.astype(dtype, copy=False) + assert isinstance(out, ArkoudaStringArray) + # fast-path: should return the same object when copy=False + assert out is s + assert out.to_ndarray().tolist() == ["a", "b", "c"] + + def test_string_array_astype_string_copy_true_returns_new_array(self): + s = ArkoudaStringArray(ak.array(["a", "b", "c"])) + + out = s.astype("string", copy=True) + assert isinstance(out, ArkoudaStringArray) + assert out is not s + assert out.to_ndarray().tolist() == ["a", "b", "c"] + + @pytest.mark.parametrize( + "dtype, values, expected", + [ + ("int64", ["1", "2", "3"], np.array([1, 2, 3], dtype=np.int64)), + ("float64", ["1.5", "2.0", "3.25"], np.array([1.5, 2.0, 3.25], dtype=np.float64)), + ("bool", ["True", "False", "True"], np.array([True, False, True], dtype=bool)), + ], + ) + def test_string_array_astype_non_string_returns_extension_array(self, dtype, values, expected): + s = ArkoudaStringArray(ak.array(values)) + + out = s.astype(dtype) + + # must not fall back to numpy for non-object casts + assert isinstance(out, ArkoudaExtensionArray) + assert not isinstance(out, np.ndarray) + + np.testing.assert_array_equal(out.to_ndarray(), expected) + + def test_string_array_astype_non_string_dtype_object_uses_numpy_dtype_normalization(self): + # This checks the `hasattr(dtype, "numpy_dtype")` normalization path. + # We use pandas' numpy dtype wrapper as a proxy (pd.Int64Dtype has numpy_dtype). 
+ s = ArkoudaStringArray(ak.array(["1", "2", "3"])) + + out = s.astype(pd.Int64Dtype()) + assert isinstance(out, ArkoudaExtensionArray) + np.testing.assert_array_equal(out.to_ndarray(), np.array([1, 2, 3], dtype=np.int64)) + + def test_string_array_astype_invalid_parse_raises(self): + s = ArkoudaStringArray(ak.array(["x", "2", "3"])) + + # exact exception type depends on arkouda Strings.astype implementation + with pytest.raises(RuntimeError): + _ = s.astype("int64") + + def test_string_array_astype_extensiondtype_stringdtype_returns_self_when_copy_false(self): + s = ArkoudaStringArray(ak.array(["a", "b", "c"])) + out = s.astype(pd.StringDtype(), copy=False) + assert out is s + + def test_string_array_astype_extensiondtype_stringdtype_copy_true_returns_new_array(self): + s = ArkoudaStringArray(ak.array(["a", "b", "c"])) + out = s.astype(pd.StringDtype(), copy=True) + assert isinstance(out, ArkoudaStringArray) + assert out is not s + assert out.to_ndarray().tolist() == ["a", "b", "c"] + + def test_string_array_astype_extensiondtype_numeric_casts_and_returns_extension_array(self): + s = ArkoudaStringArray(ak.array(["1", "2", "3"])) + out = s.astype(pd.Int64Dtype()) # ExtensionDtype path + + assert isinstance(out, ArkoudaExtensionArray) + assert not isinstance(out, np.ndarray) + np.testing.assert_array_equal(out.to_ndarray(), np.array([1, 2, 3], dtype=np.int64)) + + class TestArkoudaStringArrayEq: def _make(self, values): """Helper to construct an ArkoudaStringArray from Python/NumPy values."""