Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions arkouda/numpy/_typing/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@
None,
]

# Spellings accepted as a request for the string dtype: the literal names
# "str", "str_" and "string", plus the ``str_``, ``str`` and ``Strings`` types.
# Defined exactly once; an earlier duplicate assignment without "string" was
# immediately overwritten and has been dropped.
StringDTypeTypes: TypeAlias = _Union[
    Literal["str", "str_", "string"], type[str_], type[str], type[Strings]
]

_ArrayLikeNum: TypeAlias = _Union[
np.ndarray, # keeps it simple; or list your NDArray[...]
Expand All @@ -48,6 +50,7 @@
type[Strings],
]


_NumericLikeDType: TypeAlias = _Union[
# string literals for common names
Literal[
Expand Down Expand Up @@ -88,4 +91,4 @@

def is_string_dtype_hint(x: object) -> TypeGuard["_StringDType"]:
# accept the spellings you want to map to Arkouda Strings
return x in ("str", "str_") or x is str_ or x is str_ or x is Strings
return x in ("str", "str_", "string") or x is str_ or x is str or x is Strings
2 changes: 1 addition & 1 deletion arkouda/numpy/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def dtype(x):
return bigint()

# ---- String dtype spellings ----
if isinstance(x, str) and x.lower() in {"str", "str_", "Strings", "strings"}:
if isinstance(x, str) and x.lower() in {"str", "str_", "Strings", "strings", "string"}:
return np.dtype(np.str_)
if x in (str, np.str_):
return np.dtype(np.str_)
Expand Down
124 changes: 104 additions & 20 deletions arkouda/pandas/extension/_arkouda_array.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Sequence, TypeVar
from typing import TYPE_CHECKING, Any, Sequence, TypeVar, Union, cast, overload
from typing import cast as type_cast

import numpy as np

from numpy import ndarray
from pandas.api.extensions import ExtensionArray

from arkouda.numpy.dtypes import dtype as ak_dtype
from numpy.typing import NDArray
from pandas.core.arrays.base import ExtensionArray
from pandas.core.dtypes.dtypes import ExtensionDtype

from ._arkouda_extension_array import ArkoudaExtensionArray
from ._dtypes import (
Expand Down Expand Up @@ -166,27 +166,111 @@ def __setitem__(self, key, value):

self._data[key] = value

@overload
def astype(self, dtype: np.dtype[Any], copy: bool = True) -> NDArray[Any]: ...

@overload
def astype(self, dtype: ExtensionDtype, copy: bool = True) -> ExtensionArray: ...

@overload
def astype(self, dtype: Any, copy: bool = True) -> Union[ExtensionArray, NDArray[Any]]: ...

def astype(
    self,
    dtype: Any,
    copy: bool = True,
) -> Union[ExtensionArray, NDArray[Any]]:
    """
    Cast the array to a specified dtype.

    Casting rules:

    * If ``dtype`` is a pandas ``ExtensionDtype`` that exposes ``numpy_dtype``,
      that NumPy dtype is used as the target.
    * If ``dtype`` (not an ``ExtensionDtype``) requests ``object``, returns a
      NumPy ``NDArray[Any]`` of dtype ``object`` containing the array values.
    * Otherwise, the target dtype is normalized using Arkouda's dtype
      resolution rules.
    * If the normalized dtype matches the current dtype and ``copy=False``,
      returns ``self``.
    * In all other cases, casts the underlying Arkouda array to the target
      dtype and returns an Arkouda-backed ``ArkoudaExtensionArray``.

    Parameters
    ----------
    dtype : Any
        Target dtype. May be a NumPy dtype, pandas dtype, Arkouda dtype,
        or any dtype-like object accepted by Arkouda.
    copy : bool
        Whether to force a copy when the target dtype matches the current
        dtype. Default is True.

    Returns
    -------
    Union[ExtensionArray, NDArray[Any]]
        The cast result. Returns a NumPy array only when casting to ``object``;
        otherwise returns an Arkouda-backed ExtensionArray.

    Examples
    --------
    Basic numeric casting returns an Arkouda-backed array:

    >>> import arkouda as ak
    >>> from arkouda.pandas.extension import ArkoudaArray
    >>> a = ArkoudaArray(ak.array([1, 2, 3], dtype="int64"))
    >>> a.astype("float64").to_ndarray()
    array([1., 2., 3.])

    Casting to the same dtype with ``copy=False`` returns the original object:

    >>> b = a.astype("int64", copy=False)
    >>> b is a
    True

    Forcing a copy when the dtype is unchanged returns a new array:

    >>> c = a.astype("int64", copy=True)
    >>> c is a
    False
    >>> c.to_ndarray()
    array([1, 2, 3])

    Casting to ``object`` materializes the data to a NumPy array:

    >>> a.astype(object)
    array([1, 2, 3], dtype=object)

    NumPy and pandas dtype objects are also accepted:

    >>> import numpy as np
    >>> a.astype(np.dtype("bool")).to_ndarray()
    array([ True,  True,  True])
    """
    if isinstance(dtype, ExtensionDtype):
        # pandas extension dtypes typically carry the equivalent NumPy dtype.
        # An ExtensionDtype always yields an ExtensionArray, so the object
        # shortcut below is deliberately not applied to it.
        dtype = getattr(dtype, "numpy_dtype", dtype)
    elif dtype in (object, np.object_, "object", np.dtype("O")):
        # Only an explicit object request materializes the data client-side.
        return self.to_ndarray().astype(object, copy=copy)

    # Kept function-local as in the original (presumably to avoid an import
    # cycle -- TODO confirm).
    from arkouda.numpy.dtypes import dtype as ak_dtype

    target = ak_dtype(dtype)

    # ``copy is False`` (not mere falsiness) is required to hand back ``self``.
    if copy is False and self.dtype.numpy_dtype == target:
        return self

    casted = self._data.astype(target)
    return cast(ExtensionArray, ArkoudaExtensionArray._from_sequence(casted))

def isna(self) -> ExtensionArray | ndarray[Any, Any]:
from arkouda.numpy import isnan
Expand Down
129 changes: 125 additions & 4 deletions arkouda/pandas/extension/_arkouda_categorical_array.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Sequence, TypeVar
from typing import TYPE_CHECKING, Any, Sequence, TypeVar, Union, cast, overload

import numpy as np # new

from numpy import ndarray
from pandas.api.extensions import ExtensionArray
from numpy.typing import NDArray
from pandas import CategoricalDtype as pd_CategoricalDtype
from pandas.core.arrays.base import ExtensionArray
from pandas.core.dtypes.dtypes import ExtensionDtype

import arkouda as ak

from ._arkouda_array import ArkoudaArray
from ._arkouda_extension_array import ArkoudaExtensionArray
from ._arkouda_string_array import ArkoudaStringArray
from ._dtypes import ArkoudaCategoricalDtype


Expand Down Expand Up @@ -84,8 +88,125 @@ def __getitem__(self, idx):
return self._data[idx]
return ArkoudaCategoricalArray(self._data[idx])

@overload
def astype(self, dtype: np.dtype[Any], copy: bool = False) -> NDArray[Any]: ...

@overload
def astype(self, dtype: ExtensionDtype, copy: bool = False) -> ExtensionArray: ...

@overload
def astype(self, dtype: Any, copy: bool = False) -> Union[ExtensionArray, NDArray[Any]]: ...

def astype(
    self,
    dtype: Any,
    copy: bool = False,
) -> Union[ExtensionArray, NDArray[Any]]:
    """
    Cast to a specified dtype.

    * If ``dtype`` is categorical (pandas ``category`` / ``CategoricalDtype`` /
      ``ArkoudaCategoricalDtype``), returns an Arkouda-backed
      ``ArkoudaCategoricalArray`` (optionally copied).
    * If ``dtype`` requests ``object``, returns a NumPy ``ndarray`` of dtype object
      containing the category labels (materialized to the client).
    * If ``dtype`` requests a string dtype, returns an Arkouda-backed
      ``ArkoudaStringArray`` containing the labels as strings.
    * Otherwise, casts the labels (as strings) to the requested dtype and returns an
      Arkouda-backed ExtensionArray.

    Parameters
    ----------
    dtype : Any
        Target dtype.
    copy : bool
        Whether to force a copy when possible. If categorical-to-categorical and
        ``copy=True``, attempts to copy the underlying Arkouda ``Categorical`` (if
        supported). Default is False.

    Returns
    -------
    Union[ExtensionArray, NDArray[Any]]
        The cast result. Returns a NumPy array only when casting to ``object``;
        otherwise returns an Arkouda-backed ExtensionArray.

    Examples
    --------
    Casting to ``category`` returns an Arkouda-backed categorical array:

    >>> import arkouda as ak
    >>> from arkouda.pandas.extension import ArkoudaCategoricalArray
    >>> c = ArkoudaCategoricalArray(ak.Categorical(ak.array(["x", "y", "x"])))
    >>> out = c.astype("category")
    >>> out is c
    True

    Forcing a copy when casting to the same categorical dtype returns a new array:

    >>> out2 = c.astype("category", copy=True)
    >>> out2 is c
    False
    >>> out2.to_ndarray()
    array(['x', 'y', 'x'], dtype='<U...')

    Casting to ``object`` materializes the category labels to a NumPy object array:

    >>> c.astype(object)
    array(['x', 'y', 'x'], dtype=object)

    Casting to a string dtype returns an Arkouda-backed string array of labels:

    >>> s = c.astype("string")
    >>> s.to_ndarray()
    array(['x', 'y', 'x'], dtype='<U1')

    Casting to another dtype casts the labels-as-strings and returns an Arkouda-backed array:

    >>> c_num = ArkoudaCategoricalArray(ak.Categorical(ak.array(["1", "2", "3"])))
    >>> a = c_num.astype("int64")
    >>> a.to_ndarray()
    array([1, 2, 3])
    """
    is_extension_dtype = isinstance(dtype, ExtensionDtype)

    # Only a plain (non-extension) object request materializes to NumPy; an
    # ExtensionDtype target always yields an ExtensionArray.
    if not is_extension_dtype and dtype in (object, np.object_, "object", np.dtype("O")):
        return self.to_ndarray().astype(object, copy=copy)

    # Detect a categorical target BEFORE unwrapping ``numpy_dtype``: unwrapping
    # first would discard the categorical dtype object if it exposes that
    # attribute (NOTE(review): whether ArkoudaCategoricalDtype does is not
    # visible here -- checking first is correct either way).
    if dtype == "category" or isinstance(dtype, (ArkoudaCategoricalDtype, pd_CategoricalDtype)):
        if not copy:
            return self
        # Copy the underlying data only when it supports ``copy``.
        data = self._data.copy() if hasattr(self._data, "copy") else self._data
        return cast(ExtensionArray, type(self)(data))

    if is_extension_dtype:
        # pandas extension dtypes typically carry the equivalent NumPy dtype.
        dtype = getattr(dtype, "numpy_dtype", dtype)

    # Kept function-local as in the original (presumably to avoid an import
    # cycle -- TODO confirm).
    from arkouda.numpy._typing._typing import is_string_dtype_hint

    # Every remaining target is produced from the labels rendered as strings.
    labels = self._data.to_strings()

    if is_string_dtype_hint(dtype):
        return cast(ExtensionArray, ArkoudaStringArray._from_sequence(labels))

    casted = labels.astype(dtype)
    return cast(ExtensionArray, ArkoudaExtensionArray._from_sequence(casted))

def isna(self):
    """
    Return the missing-value mask for this array.

    The mask is an Arkouda array of zeros with dtype ``ak.bool`` and one
    entry per element of the underlying data, so no element is flagged.
    """
    n_elements = self._data.size
    return ak.zeros(n_elements, dtype=ak.bool)
Expand Down
4 changes: 1 addition & 3 deletions arkouda/pandas/extension/_arkouda_extension_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,9 +261,7 @@ def _from_sequence(
from arkouda.numpy.pdarraycreation import array as ak_array
from arkouda.numpy.strings import Strings
from arkouda.pandas.categorical import Categorical
from arkouda.pandas.extension._arkouda_array import ArkoudaArray
from arkouda.pandas.extension._arkouda_categorical_array import ArkoudaCategoricalArray
from arkouda.pandas.extension._arkouda_string_array import ArkoudaStringArray
from arkouda.pandas.extension import ArkoudaArray, ArkoudaCategoricalArray, ArkoudaStringArray

# Fast path: already an Arkouda column. Pick the matching subclass.
if isinstance(scalars, pdarray):
Expand Down
Loading