From 0c940d81172745f938cbbcdbdbbd6593e7b96af8 Mon Sep 17 00:00:00 2001 From: ajpotts Date: Wed, 24 Dec 2025 07:29:14 -0500 Subject: [PATCH 1/2] working --- arkouda/pandas/extension/_arkouda_array.py | 198 ++++++++++++++-- .../extension/_arkouda_categorical_array.py | 210 ++++++++++------ .../pandas/extension/_arkouda_string_array.py | 224 ++++++++---------- 3 files changed, 409 insertions(+), 223 deletions(-) diff --git a/arkouda/pandas/extension/_arkouda_array.py b/arkouda/pandas/extension/_arkouda_array.py index 56eabafc226..3535f06670e 100644 --- a/arkouda/pandas/extension/_arkouda_array.py +++ b/arkouda/pandas/extension/_arkouda_array.py @@ -120,48 +120,206 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): # If scalars is already a numpy array, we can preserve its dtype return cls(ak_array(scalars, dtype=dtype, copy=copy)) - def __getitem__(self, key): + def __getitem__(self, key: Any) -> Any: + """ + Retrieve one or more values using a pandas/NumPy-style indexer. + + Parameters + ---------- + key : Any + A valid indexer for 1D array-like data. This may be: + - A scalar integer position (e.g. ``1``) + - A Python ``slice`` (e.g. ``1:3``) + - A list-like of integer positions + - A boolean mask (NumPy array, pandas Series, or Arkouda ``pdarray``) + - A NumPy array, pandas Index/Series, or Arkouda ``pdarray``/``Strings``. + + Returns + ------- + Any + A scalar value for scalar indexers, or an ``ArkoudaArray`` for sequence-like + indexers. + + Raises + ------ + TypeError + If ``key`` is not a supported indexer type, or if a NumPy array or + list-like indexer has an unsupported dtype. + NotImplementedError + If a list-like indexer contains mixed element dtypes (e.g. a mixture + of booleans and integers), which is not supported. 
+ + Examples + -------- + >>> import arkouda as ak + >>> from arkouda.pandas.extension import ArkoudaArray + >>> data = ak.arange(5) + >>> arr = ArkoudaArray(data) + + Scalar integer index returns a Python scalar: + + >>> arr[1] + np.int64(1) + + Slicing returns another ArkoudaArray: + + >>> arr[1:4] + ArkoudaArray([1 2 3]) + + List-like integer positions: + + >>> arr[[0, 2, 4]] + ArkoudaArray([0 2 4]) + + Boolean mask (NumPy array): + + >>> import numpy as np + >>> mask = np.array([True, False, True, False, True]) + >>> arr[mask] + ArkoudaArray([0 2 4]) + """ from arkouda.numpy.pdarrayclass import pdarray from arkouda.numpy.pdarraycreation import array as ak_array - # Convert numpy boolean mask to arkouda pdarray + # Normalize NumPy ndarray indexers if isinstance(key, np.ndarray): - if key.dtype == bool: - key = ak_array(key) - elif key.dtype.kind in {"i"}: + if key.dtype == bool or key.dtype == np.bool_: + key = ak_array(key, dtype=bool) + elif np.issubdtype(key.dtype, np.integer): key = ak_array(key, dtype="int64") - elif key.dtype.kind in {"u"}: + elif np.issubdtype(key.dtype, np.unsignedinteger): key = ak_array(key, dtype="uint64") else: - raise TypeError(f"Unsupported numpy index type {key.dtype}") + raise TypeError(f"Unsupported NumPy index type {key.dtype}") + + # Normalize Python lists + elif isinstance(key, list): + if len(key) == 0: + # Empty selection -> empty ArkoudaArray of same dtype + empty = ak_array([], dtype=self._data.dtype) + return self.__class__(empty) + + first = key[0] + first_dtype = ak_dtype(first) + for item in key: + item_dtype = ak_dtype(item) + if first_dtype != item_dtype: + raise NotImplementedError( + f"Mixed dtypes are not supported: {item_dtype} vs {first_dtype}" + ) + + if isinstance(first, (bool, np.bool_)): + key = ak_array(np.array(key, dtype=bool)) + elif isinstance(first, (int, np.integer)): + key = ak_array(np.array(key, dtype=np.int64)) + else: + raise TypeError(f"Unsupported list index type: {type(first)}") + # Perform 
the indexing operation result = self._data[key] + + # Scalar key → return Python scalar if np.isscalar(key): - if isinstance(result, pdarray): + # If server returned a pdarray of length 1, extract scalar + if isinstance(result, pdarray) and result.size == 1: return result[0] - else: - return result + return result + + # All other cases → wrap result in same class return self.__class__(result) - # TODO: Simplify to use underlying array setter - def __setitem__(self, key, value): - from arkouda.numpy.dtypes import isSupportedInt + def __setitem__(self, key: Any, value: Any) -> None: + """ + Assign one or more values to the underlying Arkouda array in-place. + + Parameters + ---------- + key : Any + A valid positional indexer for the array. This may be a scalar integer, + slice, list-like of integers, boolean mask, NumPy array, pandas Index/Series, + or Arkouda ``pdarray``. + value : Any + A scalar value broadcast to the selected positions, or an array-like + (NumPy array, Arkouda ``pdarray``, or ``ArkoudaArray``) that is + aligned with ``key``. + + Notes + ----- + This operation mutates the underlying server-side array in-place. 
+ + Examples + -------- + Basic scalar assignment by position: + + >>> import arkouda as ak + >>> from arkouda.pandas.extension import ArkoudaArray + >>> data = ak.arange(5) + >>> arr = ArkoudaArray(data) + >>> arr[0] = 42 + >>> arr + ArkoudaArray([42 1 2 3 4]) + + Using a NumPy boolean mask: + + >>> data = ak.arange(5) + >>> arr = ArkoudaArray(data) + >>> mask = arr.to_ndarray() % 2 == 0 # even positions + >>> arr[mask] = -1 + >>> arr + ArkoudaArray([-1 1 -1 3 -1]) + + Using a NumPy integer indexer: + + >>> data = ak.arange(5) + >>> arr = ArkoudaArray(data) + >>> idx = np.array([1, 3], dtype=np.int64) + >>> arr[idx] = 99 + >>> arr + ArkoudaArray([0 99 2 99 4]) + + Assigning from another ArkoudaArray: + + >>> data = ak.arange(5) + >>> arr = ArkoudaArray(data) + >>> other = ArkoudaArray(ak.arange(10, 15)) + >>> idx = [1, 3, 4] + >>> arr[idx] = other[idx] + >>> arr + ArkoudaArray([0 11 2 13 14]) + """ from arkouda.numpy.pdarrayclass import pdarray from arkouda.numpy.pdarraycreation import array as ak_array - # Convert numpy mask to pdarray if necessary - if isinstance(key, np.ndarray) and key.dtype == bool: - key = ak_array(key) - elif isinstance(key, np.ndarray) and isSupportedInt(key.dtype): - key = ak_array(key) + # Normalize NumPy / Python indexers into Arkouda pdarrays where needed + if isinstance(key, np.ndarray): + # NumPy bool mask or integer indexer + if key.dtype == bool or key.dtype == np.bool_ or np.issubdtype(key.dtype, np.integer): + key = ak_array(key) + elif isinstance(key, list): + # Python list of bools or ints - convert to NumPy then to pdarray + if key and isinstance(key[0], (bool, np.bool_)): + key = ak_array(np.array(key, dtype=bool)) + elif key and isinstance(key[0], (int, np.integer)): + key = ak_array(np.array(key, dtype=np.int64)) + + if isinstance(key, Sequence) and not isinstance(key, (str, bytes)): + # Cannot set empty index, nothing to do + return + + # Normalize the value into something the underlying pdarray understands if 
isinstance(value, ArkoudaArray): value = value._data elif isinstance(value, pdarray): + # already an Arkouda pdarray; nothing to do pass - elif isinstance(value, (int, float, bool)): # Add scalar check - self._data[key] = value # assign scalar to scalar position + elif np.isscalar(value): + # Fast path for scalar assignment + + self._data[key] = value return else: + # Convert generic array-likes (Python lists, NumPy arrays, etc.) + # into Arkouda pdarrays. value = ak_array(value) self._data[key] = value diff --git a/arkouda/pandas/extension/_arkouda_categorical_array.py b/arkouda/pandas/extension/_arkouda_categorical_array.py index 8e10e427a67..3a2f8153e6c 100644 --- a/arkouda/pandas/extension/_arkouda_categorical_array.py +++ b/arkouda/pandas/extension/_arkouda_categorical_array.py @@ -7,7 +7,8 @@ from numpy import ndarray from pandas.api.extensions import ExtensionArray -import arkouda as ak +from arkouda.numpy.dtypes import bool_ +from arkouda.numpy.pdarrayclass import pdarray from ._arkouda_array import ArkoudaArray from ._arkouda_extension_array import ArkoudaExtensionArray @@ -70,6 +71,134 @@ def __init__(self, data: Categorical | "ArkoudaCategoricalArray" | ndarray | Seq self._data = data + def __getitem__(self, key: Any) -> Any: + """ + Retrieve one or more categorical values. + + Parameters + ---------- + key : Any + Location(s) to retrieve. Supported forms include: + + * scalar integer index + * slice objects (e.g. ``1:3``) + * NumPy integer array (any integer dtype) + * NumPy boolean mask with the same length as the array + * Python list of integers or booleans + * Arkouda ``pdarray`` of integers or booleans + + Returns + ------- + Any + A Python scalar for scalar access, or a new + :class:`ArkoudaCategoricalArray` for non-scalar indexers. + + Raises + ------ + TypeError + If a NumPy indexer with an unsupported dtype is provided. 
+ + Examples + -------- + >>> import numpy as np + >>> import arkouda as ak + >>> from arkouda.pandas.extension import ArkoudaCategoricalArray + >>> data = ak.Categorical(ak.array(["a", "b", "c", "d"])) + >>> arr = ArkoudaCategoricalArray(data) + + Scalar access returns a Python string-like scalar: + + >>> arr[1] + np.str_('b') + + Negative indexing: + + >>> arr[-1] + np.str_('d') + + Slice indexing returns a new ArkoudaCategoricalArray: + + >>> result = arr[1:3] + >>> type(result) + + + NumPy integer array indexing: + + >>> idx = np.array([0, 2], dtype=np.int64) + >>> sliced = arr[idx] + >>> isinstance(sliced, ArkoudaCategoricalArray) + True + + NumPy boolean mask: + + >>> mask = np.array([True, False, True, False]) + >>> masked = arr[mask] + >>> isinstance(masked, ArkoudaCategoricalArray) + True + + Empty integer indexer returns an empty ArkoudaCategoricalArray: + + >>> empty_idx = np.array([], dtype=np.int64) + >>> empty = arr[empty_idx] + >>> len(empty) + 0 + """ + import numpy as np + + from arkouda.numpy.pdarraycreation import array as ak_array + from arkouda.pandas.categorical import Categorical + + # Handle empty indexer (list / tuple / ndarray of length 0) + if isinstance(key, (list, tuple, np.ndarray)) and len(key) == 0: + empty_strings = ak_array([], dtype="str_") + return ArkoudaCategoricalArray(Categorical(empty_strings)) + + # Scalar integers and slices: delegate directly to the underlying Categorical + if isinstance(key, (int, np.integer, slice)): + result = self._data[key] + # For scalar keys, just return the underlying scalar + if isinstance(key, (int, np.integer)): + return result + # For slices, underlying arkouda.Categorical returns a Categorical + return ArkoudaCategoricalArray(result) + + # NumPy array indexers: normalize to Arkouda pdarrays + if isinstance(key, np.ndarray): + if key.dtype == bool: + key = ak_array(key) + elif np.issubdtype(key.dtype, np.signedinteger): + key = ak_array(key, dtype="int64") + elif np.issubdtype(key.dtype, 
np.unsignedinteger): + key = ak_array(key, dtype="uint64") + else: + raise TypeError(f"Unsupported numpy index type {key.dtype}") + elif not isinstance(key, (pdarray, Categorical)): + # Convert generic indexers (e.g. Python lists of ints/bools) to an Arkouda pdarray + key = ak_array(key) + + # Delegate to underlying arkouda.Categorical + result = self._data[key] + + # Scalar result: just return the underlying scalar + if isinstance(key, pdarray) and key.size == 1: + # Categorical.__getitem__ will generally still give a Categorical here; + # we normalize to a Python scalar by going through categories[codes]. + + codes = result.codes if isinstance(result, Categorical) else result + cats = self._data.categories + # codes is length-1, so this is length-1 Strings + labels = cats[codes] + # Return a Python scalar string + return labels[0] + + # Non-scalar: wrap Categorical in ArkoudaCategoricalArray + if isinstance(result, Categorical): + return ArkoudaCategoricalArray(result) + + # Fallback: if Categorical returned something array-like but not Categorical, + # rebuild a Categorical from it. 
+ return ArkoudaCategoricalArray(Categorical(result)) + @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): from arkouda import Categorical, array @@ -79,16 +208,13 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): scalars = Categorical(array(scalars)) return cls(scalars) - def __getitem__(self, idx): - if isinstance(idx, int): - return self._data[idx] - return ArkoudaCategoricalArray(self._data[idx]) - def astype(self, x, dtype): raise NotImplementedError("array_api.astype is not implemented in Arkouda yet") def isna(self): - return ak.zeros(self._data.size, dtype=ak.bool) + from arkouda.numpy.pdarraycreation import zeros + + return zeros(self._data.size, dtype=bool_) @property def dtype(self): @@ -130,73 +256,3 @@ def __eq__(self, other): def __repr__(self): return f"ArkoudaCategoricalArray({self._data})" - - def _not_implemented(self, name: str): - raise NotImplementedError(f"`{name}` is not implemented for ArkoudaCategoricalArray yet.") - - def add_categories(self, *args, **kwargs): - self._not_implemented("add_categories") - - def as_ordered(self, *args, **kwargs): - self._not_implemented("as_ordered") - - def as_unordered(self, *args, **kwargs): - self._not_implemented("as_unordered") - - def check_for_ordered(self, *args, **kwargs): - self._not_implemented("check_for_ordered") - - def describe(self, *args, **kwargs): - self._not_implemented("describe") - - @classmethod - def from_codes(cls, *args, **kwargs): - raise NotImplementedError("`from_codes` is not implemented for ArkoudaCategoricalArray yet.") - - def isnull(self, *args, **kwargs): - self._not_implemented("isnull") - - def max(self, *args, **kwargs): - self._not_implemented("max") - - def memory_usage(self, *args, **kwargs): - self._not_implemented("memory_usage") - - def min(self, *args, **kwargs): - self._not_implemented("min") - - def notna(self, *args, **kwargs): - self._not_implemented("notna") - - def notnull(self, *args, **kwargs): - 
self._not_implemented("notnull") - - def remove_categories(self, *args, **kwargs): - self._not_implemented("remove_categories") - - def remove_unused_categories(self, *args, **kwargs): - self._not_implemented("remove_unused_categories") - - def rename_categories(self, *args, **kwargs): - self._not_implemented("rename_categories") - - def reorder_categories(self, *args, **kwargs): - self._not_implemented("reorder_categories") - - def set_categories(self, *args, **kwargs): - self._not_implemented("set_categories") - - def set_ordered(self, *args, **kwargs): - self._not_implemented("set_ordered") - - def sort_values(self, *args, **kwargs): - self._not_implemented("sort_values") - - def swapaxes(self, *args, **kwargs): - self._not_implemented("swapaxes") - - def to_list(self, *args, **kwargs): - self._not_implemented("to_list") - - def value_counts(self, *args, **kwargs): - self._not_implemented("value_counts") diff --git a/arkouda/pandas/extension/_arkouda_string_array.py b/arkouda/pandas/extension/_arkouda_string_array.py index 1bdf4bef020..e80138dda8b 100644 --- a/arkouda/pandas/extension/_arkouda_string_array.py +++ b/arkouda/pandas/extension/_arkouda_string_array.py @@ -77,14 +77,106 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(ak_array(scalars)) - def __getitem__(self, key): + def __getitem__(self, key: Any) -> Any: + """ + Retrieve one or more string values. + + Parameters + ---------- + key : Any + Positional indexer. Supports: + * scalar integer positions + * slice objects + * NumPy integer arrays (signed/unsigned) + * NumPy boolean masks + * Python lists of integers / booleans + * Arkouda pdarray indexers (int / uint / bool) + + Returns + ------- + Any + A Python string for scalar access, or a new ArkoudaStringArray + for non-scalar indexers. + + Raises + ------ + TypeError + If ``key`` is a NumPy array with an unsupported dtype (for example, + a floating point or object dtype). 
+ + Examples + -------- + Basic scalar access: + + >>> import arkouda as ak + >>> from arkouda.pandas.extension import ArkoudaStringArray + >>> arr = ArkoudaStringArray(ak.array(["a", "b", "c", "d"])) + >>> arr[1] + np.str_('b') + + Negative indexing: + + >>> arr[-1] + np.str_('d') + + Slice indexing (returns a new ArkoudaStringArray): + + >>> arr[1:3] + ArkoudaStringArray(['b', 'c']) + + NumPy integer array indexing: + + >>> idx = np.array([0, 2], dtype=np.int64) + >>> arr[idx] + ArkoudaStringArray(['a', 'c']) + + NumPy boolean mask: + + >>> mask = np.array([True, False, True, False]) + >>> arr[mask] + ArkoudaStringArray(['a', 'c']) + + Arkouda integer indexer: + + >>> ak_idx = ak.array([3, 1]) + >>> arr[ak_idx] + ArkoudaStringArray(['d', 'b']) + + Empty indexer returns an empty ArkoudaStringArray: + + >>> empty_idx = np.array([], dtype=np.int64) + >>> arr[empty_idx] + ArkoudaStringArray([]) + """ + from arkouda.numpy.pdarraycreation import array as ak_array + from arkouda.numpy.strings import Strings + + # Normalize NumPy indexers to Arkouda pdarrays, mirroring ArkoudaArray.__getitem__ + if isinstance(key, np.ndarray): + if key.dtype == bool: + key = ak_array(key) + elif key.dtype.kind in {"i"}: + # signed integer + key = ak_array(key, dtype="int64") + elif key.dtype.kind in {"u"}: + # unsigned integer + key = ak_array(key, dtype="uint64") + else: + raise TypeError(f"Unsupported numpy index type {key.dtype}") + result = self._data[key] + + # Scalar access: return a plain Python str (or scalar) instead of a Strings object if np.isscalar(key): - if hasattr(result, "to_ndarray"): - return result.to_ndarray()[()] - else: - return result - return ArkoudaStringArray(result) + return result + + # Non-scalar: expect an Arkouda Strings, wrap it + if isinstance(result, Strings): + return ArkoudaStringArray(result) + + # Fallback: if Arkouda returned something array-like but not Strings, + # materialize via ak.array and wrap again as Strings. 
+ return ArkoudaStringArray(ak_array(result)) def astype(self, dtype, copy: bool = False): if dtype in (object, np.object_, "object", np.dtype("O")): @@ -135,123 +227,3 @@ def __eq__(self, other): def __repr__(self): return f"ArkoudaStringArray({self._data})" - - def _not_implemented(self, name: str): - raise NotImplementedError(f"`{name}` is not implemented for Arkouda-backed arrays yet.") - - def all(self, *args, **kwargs): - self._not_implemented("all") - - def any(self, *args, **kwargs): - self._not_implemented("any") - - def argpartition(self, *args, **kwargs): - self._not_implemented("argpartition") - - def byteswap(self, *args, **kwargs): - self._not_implemented("byteswap") - - def choose(self, *args, **kwargs): - self._not_implemented("choose") - - def clip(self, *args, **kwargs): - self._not_implemented("clip") - - def compress(self, *args, **kwargs): - self._not_implemented("compress") - - def conj(self, *args, **kwargs): - self._not_implemented("conj") - - def conjugate(self, *args, **kwargs): - self._not_implemented("conjugate") - - def cumprod(self, *args, **kwargs): - self._not_implemented("cumprod") - - def cumsum(self, *args, **kwargs): - self._not_implemented("cumsum") - - def diagonal(self, *args, **kwargs): - self._not_implemented("diagonal") - - def dot(self, *args, **kwargs): - self._not_implemented("dot") - - def dump(self, *args, **kwargs): - self._not_implemented("dump") - - def dumps(self, *args, **kwargs): - self._not_implemented("dumps") - - def fill(self, *args, **kwargs): - self._not_implemented("fill") - - def flatten(self, *args, **kwargs): - self._not_implemented("flatten") - - def getfield(self, *args, **kwargs): - self._not_implemented("getfield") - - def item(self, *args, **kwargs): - self._not_implemented("item") - - def max(self, *args, **kwargs): - self._not_implemented("max") - - def mean(self, *args, **kwargs): - self._not_implemented("mean") - - def min(self, *args, **kwargs): - self._not_implemented("min") - - def 
nonzero(self, *args, **kwargs): - self._not_implemented("nonzero") - - def partition(self, *args, **kwargs): - self._not_implemented("partition") - - def prod(self, *args, **kwargs): - self._not_implemented("prod") - - def put(self, *args, **kwargs): - self._not_implemented("put") - - def resize(self, *args, **kwargs): - self._not_implemented("resize") - - def round(self, *args, **kwargs): - self._not_implemented("round") - - def setfield(self, *args, **kwargs): - self._not_implemented("setfield") - - def setflags(self, *args, **kwargs): - self._not_implemented("setflags") - - def sort(self, *args, **kwargs): - self._not_implemented("sort") - - def std(self, *args, **kwargs): - self._not_implemented("std") - - def sum(self, *args, **kwargs): - self._not_implemented("sum") - - def swapaxes(self, *args, **kwargs): - self._not_implemented("swapaxes") - - def to_device(self, *args, **kwargs): - self._not_implemented("to_device") - - def tobytes(self, *args, **kwargs): - self._not_implemented("tobytes") - - def tofile(self, *args, **kwargs): - self._not_implemented("tofile") - - def trace(self, *args, **kwargs): - self._not_implemented("trace") - - def var(self, *args, **kwargs): - self._not_implemented("var") From 937d368e03cc1b58e1730ba2de2113dae2e06721 Mon Sep 17 00:00:00 2001 From: ajpotts Date: Mon, 5 Jan 2026 14:03:04 -0500 Subject: [PATCH 2/2] Closes #5228: remove type: ignore from factorize in extension module --- arkouda/pandas/extension/_arkouda_array.py | 198 ++-------------- .../extension/_arkouda_categorical_array.py | 210 ++++++---------- .../extension/_arkouda_extension_array.py | 153 ++++++------ .../pandas/extension/_arkouda_string_array.py | 224 ++++++++++-------- tests/pandas/extension/arkouda_extension.py | 129 ++++------ tests/pandas/extension/dataframe_accessor.py | 2 +- 6 files changed, 351 insertions(+), 565 deletions(-) diff --git a/arkouda/pandas/extension/_arkouda_array.py b/arkouda/pandas/extension/_arkouda_array.py index 
3535f06670e..56eabafc226 100644 --- a/arkouda/pandas/extension/_arkouda_array.py +++ b/arkouda/pandas/extension/_arkouda_array.py @@ -120,206 +120,48 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): # If scalars is already a numpy array, we can preserve its dtype return cls(ak_array(scalars, dtype=dtype, copy=copy)) - def __getitem__(self, key: Any) -> Any: - """ - Retrieve one or more values using a pandas/NumPy-style indexer. - - Parameters - ---------- - key : Any - A valid indexer for 1D array-like data. This may be: - - A scalar integer position (e.g. ``1``) - - A Python ``slice`` (e.g. ``1:3``) - - A list-like of integer positions - - A boolean mask (NumPy array, pandas Series, or Arkouda ``pdarray``) - - A NumPy array, pandas Index/Series, or Arkouda ``pdarray``/``Strings``. - - Returns - ------- - Any - A scalar value for scalar indexers, or an ``ArkoudaArray`` for sequence-like - indexers. - - Raises - ------ - TypeError - If ``key`` is not a supported indexer type, or if a NumPy array or - list-like indexer has an unsupported dtype. - NotImplementedError - If a list-like indexer contains mixed element dtypes (e.g. a mixture - of booleans and integers), which is not supported. 
- - Examples - -------- - >>> import arkouda as ak - >>> from arkouda.pandas.extension import ArkoudaArray - >>> data = ak.arange(5) - >>> arr = ArkoudaArray(data) - - Scalar integer index returns a Python scalar: - - >>> arr[1] - np.int64(1) - - Slicing returns another ArkoudaArray: - - >>> arr[1:4] - ArkoudaArray([1 2 3]) - - List-like integer positions: - - >>> arr[[0, 2, 4]] - ArkoudaArray([0 2 4]) - - Boolean mask (NumPy array): - - >>> import numpy as np - >>> mask = np.array([True, False, True, False, True]) - >>> arr[mask] - ArkoudaArray([0 2 4]) - """ + def __getitem__(self, key): from arkouda.numpy.pdarrayclass import pdarray from arkouda.numpy.pdarraycreation import array as ak_array - # Normalize NumPy ndarray indexers + # Convert numpy boolean mask to arkouda pdarray if isinstance(key, np.ndarray): - if key.dtype == bool or key.dtype == np.bool_: - key = ak_array(key, dtype=bool) - elif np.issubdtype(key.dtype, np.integer): + if key.dtype == bool: + key = ak_array(key) + elif key.dtype.kind in {"i"}: key = ak_array(key, dtype="int64") - elif np.issubdtype(key.dtype, np.unsignedinteger): + elif key.dtype.kind in {"u"}: key = ak_array(key, dtype="uint64") else: - raise TypeError(f"Unsupported NumPy index type {key.dtype}") - - # Normalize Python lists - elif isinstance(key, list): - if len(key) == 0: - # Empty selection -> empty ArkoudaArray of same dtype - empty = ak_array([], dtype=self._data.dtype) - return self.__class__(empty) - - first = key[0] - first_dtype = ak_dtype(first) - for item in key: - item_dtype = ak_dtype(item) - if first_dtype != item_dtype: - raise NotImplementedError( - f"Mixed dtypes are not supported: {item_dtype} vs {first_dtype}" - ) - - if isinstance(first, (bool, np.bool_)): - key = ak_array(np.array(key, dtype=bool)) - elif isinstance(first, (int, np.integer)): - key = ak_array(np.array(key, dtype=np.int64)) - else: - raise TypeError(f"Unsupported list index type: {type(first)}") + raise TypeError(f"Unsupported numpy index 
type {key.dtype}") - # Perform the indexing operation result = self._data[key] - - # Scalar key → return Python scalar if np.isscalar(key): - # If server returned a pdarray of length 1, extract scalar - if isinstance(result, pdarray) and result.size == 1: + if isinstance(result, pdarray): return result[0] - return result - - # All other cases → wrap result in same class + else: + return result return self.__class__(result) - def __setitem__(self, key: Any, value: Any) -> None: - """ - Assign one or more values to the underlying Arkouda array in-place. - - Parameters - ---------- - key : Any - A valid positional indexer for the array. This may be a scalar integer, - slice, list-like of integers, boolean mask, NumPy array, pandas Index/Series, - or Arkouda ``pdarray``. - value : Any - A scalar value broadcast to the selected positions, or an array-like - (NumPy array, Arkouda ``pdarray``, or ``ArkoudaArray``) that is - aligned with ``key``. - - Notes - ----- - This operation mutates the underlying server-side array in-place. 
- - Examples - -------- - Basic scalar assignment by position: - - >>> import arkouda as ak - >>> from arkouda.pandas.extension import ArkoudaArray - >>> data = ak.arange(5) - >>> arr = ArkoudaArray(data) - >>> arr[0] = 42 - >>> arr - ArkoudaArray([42 1 2 3 4]) - - Using a NumPy boolean mask: - - >>> data = ak.arange(5) - >>> arr = ArkoudaArray(data) - >>> mask = arr.to_ndarray() % 2 == 0 # even positions - >>> arr[mask] = -1 - >>> arr - ArkoudaArray([-1 1 -1 3 -1]) - - Using a NumPy integer indexer: - - >>> data = ak.arange(5) - >>> arr = ArkoudaArray(data) - >>> idx = np.array([1, 3], dtype=np.int64) - >>> arr[idx] = 99 - >>> arr - ArkoudaArray([0 99 2 99 4]) - - Assigning from another ArkoudaArray: - - >>> data = ak.arange(5) - >>> arr = ArkoudaArray(data) - >>> other = ArkoudaArray(ak.arange(10, 15)) - >>> idx = [1, 3, 4] - >>> arr[idx] = other[idx] - >>> arr - ArkoudaArray([0 11 2 13 14]) - """ + # TODO: Simplify to use underlying array setter + def __setitem__(self, key, value): + from arkouda.numpy.dtypes import isSupportedInt from arkouda.numpy.pdarrayclass import pdarray from arkouda.numpy.pdarraycreation import array as ak_array - # Normalize NumPy / Python indexers into Arkouda pdarrays where needed - if isinstance(key, np.ndarray): - # NumPy bool mask or integer indexer - if key.dtype == bool or key.dtype == np.bool_ or np.issubdtype(key.dtype, np.integer): - key = ak_array(key) - elif isinstance(key, list): - # Python list of bools or ints - convert to NumPy then to pdarray - if key and isinstance(key[0], (bool, np.bool_)): - key = ak_array(np.array(key, dtype=bool)) - elif key and isinstance(key[0], (int, np.integer)): - key = ak_array(np.array(key, dtype=np.int64)) - - if isinstance(key, Sequence) and not isinstance(key, (str, bytes)): - # Cannot set empty index, nothing to do - return - - # Normalize the value into something the underlying pdarray understands + # Convert numpy mask to pdarray if necessary + if isinstance(key, np.ndarray) and 
key.dtype == bool: + key = ak_array(key) + elif isinstance(key, np.ndarray) and isSupportedInt(key.dtype): + key = ak_array(key) if isinstance(value, ArkoudaArray): value = value._data elif isinstance(value, pdarray): - # already an Arkouda pdarray; nothing to do pass - elif np.isscalar(value): - # Fast path for scalar assignment - - self._data[key] = value + elif isinstance(value, (int, float, bool)): # Add scalar check + self._data[key] = value # assign scalar to scalar position return else: - # Convert generic array-likes (Python lists, NumPy arrays, etc.) - # into Arkouda pdarrays. value = ak_array(value) self._data[key] = value diff --git a/arkouda/pandas/extension/_arkouda_categorical_array.py b/arkouda/pandas/extension/_arkouda_categorical_array.py index 3a2f8153e6c..8e10e427a67 100644 --- a/arkouda/pandas/extension/_arkouda_categorical_array.py +++ b/arkouda/pandas/extension/_arkouda_categorical_array.py @@ -7,8 +7,7 @@ from numpy import ndarray from pandas.api.extensions import ExtensionArray -from arkouda.numpy.dtypes import bool_ -from arkouda.numpy.pdarrayclass import pdarray +import arkouda as ak from ._arkouda_array import ArkoudaArray from ._arkouda_extension_array import ArkoudaExtensionArray @@ -71,134 +70,6 @@ def __init__(self, data: Categorical | "ArkoudaCategoricalArray" | ndarray | Seq self._data = data - def __getitem__(self, key: Any) -> Any: - """ - Retrieve one or more categorical values. - - Parameters - ---------- - key : Any - Location(s) to retrieve. Supported forms include: - - * scalar integer index - * slice objects (e.g. ``1:3``) - * NumPy integer array (any integer dtype) - * NumPy boolean mask with the same length as the array - * Python list of integers or booleans - * Arkouda ``pdarray`` of integers or booleans - - Returns - ------- - Any - A Python scalar for scalar access, or a new - :class:`ArkoudaCategoricalArray` for non-scalar indexers. 
- - Raises - ------ - TypeError - If a NumPy indexer with an unsupported dtype is provided. - - Examples - -------- - >>> import numpy as np - >>> import arkouda as ak - >>> from arkouda.pandas.extension import ArkoudaCategoricalArray - >>> data = ak.Categorical(ak.array(["a", "b", "c", "d"])) - >>> arr = ArkoudaCategoricalArray(data) - - Scalar access returns a Python string-like scalar: - - >>> arr[1] - np.str_('b') - - Negative indexing: - - >>> arr[-1] - np.str_('d') - - Slice indexing returns a new ArkoudaCategoricalArray: - - >>> result = arr[1:3] - >>> type(result) - - - NumPy integer array indexing: - - >>> idx = np.array([0, 2], dtype=np.int64) - >>> sliced = arr[idx] - >>> isinstance(sliced, ArkoudaCategoricalArray) - True - - NumPy boolean mask: - - >>> mask = np.array([True, False, True, False]) - >>> masked = arr[mask] - >>> isinstance(masked, ArkoudaCategoricalArray) - True - - Empty integer indexer returns an empty ArkoudaCategoricalArray: - - >>> empty_idx = np.array([], dtype=np.int64) - >>> empty = arr[empty_idx] - >>> len(empty) - 0 - """ - import numpy as np - - from arkouda.numpy.pdarraycreation import array as ak_array - from arkouda.pandas.categorical import Categorical - - # Handle empty indexer (list / tuple / ndarray of length 0) - if isinstance(key, (list, tuple, np.ndarray)) and len(key) == 0: - empty_strings = ak_array([], dtype="str_") - return ArkoudaCategoricalArray(Categorical(empty_strings)) - - # Scalar integers and slices: delegate directly to the underlying Categorical - if isinstance(key, (int, np.integer, slice)): - result = self._data[key] - # For scalar keys, just return the underlying scalar - if isinstance(key, (int, np.integer)): - return result - # For slices, underlying arkouda.Categorical returns a Categorical - return ArkoudaCategoricalArray(result) - - # NumPy array indexers: normalize to Arkouda pdarrays - if isinstance(key, np.ndarray): - if key.dtype == bool: - key = ak_array(key) - elif np.issubdtype(key.dtype, 
np.signedinteger): - key = ak_array(key, dtype="int64") - elif np.issubdtype(key.dtype, np.unsignedinteger): - key = ak_array(key, dtype="uint64") - else: - raise TypeError(f"Unsupported numpy index type {key.dtype}") - elif not isinstance(key, (pdarray, Categorical)): - # Convert generic indexers (e.g. Python lists of ints/bools) to an Arkouda pdarray - key = ak_array(key) - - # Delegate to underlying arkouda.Categorical - result = self._data[key] - - # Scalar result: just return the underlying scalar - if isinstance(key, pdarray) and key.size == 1: - # Categorical.__getitem__ will generally still give a Categorical here; - # we normalize to a Python scalar by going through categories[codes]. - - codes = result.codes if isinstance(result, Categorical) else result - cats = self._data.categories - # codes is length-1, so this is length-1 Strings - labels = cats[codes] - # Return a Python scalar string - return labels[0] - - # Non-scalar: wrap Categorical in ArkoudaCategoricalArray - if isinstance(result, Categorical): - return ArkoudaCategoricalArray(result) - - # Fallback: if Categorical returned something array-like but not Categorical, - # rebuild a Categorical from it. 
- return ArkoudaCategoricalArray(Categorical(result)) - @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): from arkouda import Categorical, array @@ -208,13 +79,16 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): scalars = Categorical(array(scalars)) return cls(scalars) + def __getitem__(self, idx): + if isinstance(idx, int): + return self._data[idx] + return ArkoudaCategoricalArray(self._data[idx]) + def astype(self, x, dtype): raise NotImplementedError("array_api.astype is not implemented in Arkouda yet") def isna(self): - from arkouda.numpy.pdarraycreation import zeros - - return zeros(self._data.size, dtype=bool_) + return ak.zeros(self._data.size, dtype=ak.bool) @property def dtype(self): @@ -256,3 +130,73 @@ def __eq__(self, other): def __repr__(self): return f"ArkoudaCategoricalArray({self._data})" + + def _not_implemented(self, name: str): + raise NotImplementedError(f"`{name}` is not implemented for ArkoudaCategoricalArray yet.") + + def add_categories(self, *args, **kwargs): + self._not_implemented("add_categories") + + def as_ordered(self, *args, **kwargs): + self._not_implemented("as_ordered") + + def as_unordered(self, *args, **kwargs): + self._not_implemented("as_unordered") + + def check_for_ordered(self, *args, **kwargs): + self._not_implemented("check_for_ordered") + + def describe(self, *args, **kwargs): + self._not_implemented("describe") + + @classmethod + def from_codes(cls, *args, **kwargs): + raise NotImplementedError("`from_codes` is not implemented for ArkoudaCategoricalArray yet.") + + def isnull(self, *args, **kwargs): + self._not_implemented("isnull") + + def max(self, *args, **kwargs): + self._not_implemented("max") + + def memory_usage(self, *args, **kwargs): + self._not_implemented("memory_usage") + + def min(self, *args, **kwargs): + self._not_implemented("min") + + def notna(self, *args, **kwargs): + self._not_implemented("notna") + + def notnull(self, *args, **kwargs): + 
self._not_implemented("notnull") + + def remove_categories(self, *args, **kwargs): + self._not_implemented("remove_categories") + + def remove_unused_categories(self, *args, **kwargs): + self._not_implemented("remove_unused_categories") + + def rename_categories(self, *args, **kwargs): + self._not_implemented("rename_categories") + + def reorder_categories(self, *args, **kwargs): + self._not_implemented("reorder_categories") + + def set_categories(self, *args, **kwargs): + self._not_implemented("set_categories") + + def set_ordered(self, *args, **kwargs): + self._not_implemented("set_ordered") + + def sort_values(self, *args, **kwargs): + self._not_implemented("sort_values") + + def swapaxes(self, *args, **kwargs): + self._not_implemented("swapaxes") + + def to_list(self, *args, **kwargs): + self._not_implemented("to_list") + + def value_counts(self, *args, **kwargs): + self._not_implemented("value_counts") diff --git a/arkouda/pandas/extension/_arkouda_extension_array.py b/arkouda/pandas/extension/_arkouda_extension_array.py index 5cda4787a36..313095b4a55 100644 --- a/arkouda/pandas/extension/_arkouda_extension_array.py +++ b/arkouda/pandas/extension/_arkouda_extension_array.py @@ -48,6 +48,7 @@ import numpy as np +from numpy.typing import NDArray from pandas.api.extensions import ExtensionArray from arkouda.numpy.dtypes import all_scalars @@ -349,45 +350,47 @@ def take(self, indexer, fill_value=None, allow_fill=False): gathered = ak.where(mask, fv, self._data[idx_fix]) return type(self)(gathered) - def factorize( # type: ignore[override] - self, use_na_sentinel=True, sort=False, **kwargs - ) -> Tuple["ArkoudaExtensionArray", "ArkoudaExtensionArray"]: + def factorize(self, use_na_sentinel=True) -> Tuple[NDArray[np.intp], "ArkoudaExtensionArray"]: """ - Encode the values of this array as integer codes and uniques, - similar to :func:`pandas.factorize`, but implemented with Arkouda. + Encode the values of this array as integer codes and unique values. 
+ + This is similar to :func:`pandas.factorize`, but the grouping/factorization + work is performed in Arkouda. The returned ``codes`` are a NumPy array for + pandas compatibility, while ``uniques`` are returned as an ExtensionArray + of the same type as ``self``. Each distinct non-missing value is assigned a unique integer code. - Missing values (NaN in floating dtypes) are encoded as -1 by default. + For floating dtypes, ``NaN`` is treated as missing; for all other dtypes, + no values are considered missing. Parameters ---------- use_na_sentinel : bool, default True - If True, missing values are encoded as -1 in the codes array. - If False, missing values are assigned a valid code equal to - ``len(uniques)``. - sort : bool, default False - Whether to sort the unique values. If False, the unique values - appear in the order of first appearance in the array. If True, - the unique values are sorted, and codes are assigned accordingly. - **kwargs - Ignored for compatibility. + If True, missing values are encoded as ``-1`` in the returned codes. + If False, missing values are assigned the code ``len(uniques)``. + (Missingness is only detected for floating dtypes via ``NaN``.) Returns ------- - Tuple[pdarray, ArkoudaExtensionArray] + (numpy.ndarray, ExtensionArray) A pair ``(codes, uniques)`` where: - - ``codes`` is a NumPy ``int64`` array of factor labels, one per element. - Missing values are ``-1`` if ``use_na_sentinel=True``; otherwise they - receive the code ``len(uniques)``. - - ``uniques`` is a NumPy array of the unique values. + + * ``codes`` is a 1D NumPy array of dtype ``np.intp`` with the same length + as this array, containing the factor codes for each element. + * ``uniques`` is an ExtensionArray containing the unique (non-missing) + values, with the same extension type as ``self``. + + If ``use_na_sentinel=True``, missing values in ``codes`` are ``-1``. + Otherwise they receive the code ``len(uniques)``. 
Notes ----- * Only floating-point dtypes treat ``NaN`` as missing; for other dtypes, - no values are considered missing. - * This method executes all grouping and factorization in Arkouda, - returning results as NumPy arrays for compatibility with pandas. - * Unlike pandas, string/None/null handling is not yet unified. + all values are treated as non-missing. + * ``uniques`` are constructed from Arkouda's unique keys and returned as + ``type(self)(uniques_ak)`` so that pandas internals (e.g. ``groupby``) + can treat them as an ExtensionArray. + * String/None/null missing-value behavior is not yet unified with pandas. Examples -------- @@ -396,7 +399,7 @@ def factorize( # type: ignore[override] >>> arr = ArkoudaArray(ak.array([1, 2, 1, 3])) >>> codes, uniques = arr.factorize() >>> codes - ArkoudaArray([0 1 0 2]) + array([0, 1, 0, 2]) >>> uniques ArkoudaArray([1 2 3]) """ @@ -407,7 +410,6 @@ def factorize( # type: ignore[override] from arkouda.numpy.pdarraycreation import array as ak_array from arkouda.numpy.sorting import argsort from arkouda.numpy.strings import Strings - from arkouda.pandas.extension import ArkoudaArray from arkouda.pandas.groupbyclass import GroupBy # Arkouda array backing @@ -425,7 +427,7 @@ def factorize( # type: ignore[override] sent = -1 if use_na_sentinel else 0 from arkouda.numpy.pdarraycreation import full as ak_full - return ArkoudaArray(ak_full(n, sent, dtype=int64)), type(self)( + return ak_full(n, sent, dtype=int64).to_ndarray(), type(self)( ak_array([], dtype=self.to_numpy().dtype) ) @@ -437,28 +439,16 @@ def factorize( # type: ignore[override] uniques_ak = concatenate(uniques_ak) - if sort: - # Keys already sorted; group id -> 0..k-1 - groupid_to_code = arange(uniques_ak.size, dtype=int64) - - # Work around to account GroupBy not sorting Categorical properly - if isinstance(arr, Categorical): - perm = uniques_ak.argsort() - # Inverse argsort: - groupid_to_code[perm] = arange(uniques_ak.size, dtype=int64) - uniques_ak = 
uniques_ak[perm] - - else: - # First-appearance order - _keys, first_idx_per_group = g.min(arange(arr_nn.size, dtype=int64)) - order = argsort(first_idx_per_group) + # First-appearance order + _keys, first_idx_per_group = g.min(arange(arr_nn.size, dtype=int64)) + order = argsort(first_idx_per_group) - # Reorder uniques by first appearance - uniques_ak = uniques_ak[order] + # Reorder uniques by first appearance + uniques_ak = uniques_ak[order] - # Map group_id -> code in first-appearance order - groupid_to_code = zeros(order.size, dtype=int64) - groupid_to_code[order] = arange(order.size, dtype=int64) + # Map group_id -> code in first-appearance order + groupid_to_code = zeros(order.size, dtype=int64) + groupid_to_code[order] = arange(order.size, dtype=int64) # Per-element codes on the non-NA slice codes_nn = g.broadcast(groupid_to_code) @@ -468,7 +458,9 @@ def factorize( # type: ignore[override] codes_ak = full(n, sentinel, dtype=int64) codes_ak[non_na] = codes_nn - return ArkoudaArray(codes_ak), type(self)(uniques_ak) + codes_np = codes_ak.to_ndarray().astype(np.intp, copy=False) + + return codes_np, type(self)(uniques_ak) # In each EA def _values_for_factorize(self): @@ -527,42 +519,45 @@ def to_ndarray(self) -> np.ndarray: """ return self._data.to_ndarray() - def argsort( # type: ignore[override] + def argsort( self, *, ascending: bool = True, - kind="quicksort", - na_position: str = "last", - **kwargs, - ) -> pdarray: + kind: str = "quicksort", + **kwargs: object, + ) -> NDArray[np.intp]: """ Return the indices that would sort the array. - This method computes the permutation indices that would sort the - underlying Arkouda data. It aligns with the pandas ``ExtensionArray`` - contract, returning a 1-D ``pdarray`` of integer indices suitable for - reordering the array via ``take`` or ``iloc``. NaN values are placed - either at the beginning or end of the result depending on - ``na_position``. 
+ This method computes the permutation indices that would sort the underlying + Arkouda data and returns them as a NumPy array, in accordance with the + pandas ``ExtensionArray`` contract. The indices can be used to reorder the + array via ``take`` or ``iloc``. + + For floating-point data, ``NaN`` values are handled according to the + ``na_position`` keyword argument. Parameters ---------- ascending : bool, default True - If True, sort values in ascending order. If False, sort in - descending order. + If True, sort values in ascending order. If False, sort in descending + order. kind : str, default "quicksort" - Sorting algorithm. Present for API compatibility with NumPy and - pandas but currently ignored. - na_position : {"first", "last"}, default "last" - Where to place NaN values in the sorted result. Currently only implemented for pdarray. - For Strings and Categorical will have no effect. - **kwargs : Any - Additional keyword arguments for compatibility; ignored. + Sorting algorithm. Present for API compatibility with NumPy and pandas + but currently ignored. + **kwargs + Additional keyword arguments for compatibility. Supported keyword: + + * ``na_position`` : {"first", "last"}, default "last" + Where to place ``NaN`` values in the sorted result. This option is + currently only applied for floating-point ``pdarray`` data; for + ``Strings`` and ``Categorical`` data it has no effect. Returns ------- - pdarray - Integer indices (``int64``) that would sort the array. + numpy.ndarray + A 1D NumPy array of dtype ``np.intp`` containing the indices that would + sort the array. Raises ------ @@ -573,11 +568,12 @@ def argsort( # type: ignore[override] Notes ----- - - Supports Arkouda ``pdarray``, ``Strings``, and ``Categorical`` data. - - Floating-point arrays have NaNs repositioned according to + * Supports Arkouda ``pdarray``, ``Strings``, and ``Categorical`` data. + * For floating-point arrays, ``NaN`` values are repositioned according to ``na_position``. 
- - This method does not move data to the client; the computation - occurs on the Arkouda server. + * The sorting computation occurs on the Arkouda server, but the resulting + permutation indices are materialized on the client as a NumPy array, as + required by pandas internals. Examples -------- @@ -585,9 +581,9 @@ def argsort( # type: ignore[override] >>> from arkouda.pandas.extension import ArkoudaArray >>> a = ArkoudaArray(ak.array([3.0, float("nan"), 1.0])) >>> a.argsort() # NA last by default - array([2 0 1]) + array([2, 0, 1]) >>> a.argsort(na_position="first") - array([1 2 0]) + array([1, 2, 0]) """ from arkouda.numpy import argsort from arkouda.numpy.numeric import isnan as ak_isnan @@ -596,6 +592,9 @@ def argsort( # type: ignore[override] from arkouda.numpy.util import is_float from arkouda.pandas.categorical import Categorical + # Extract na_position from kwargs + na_position = kwargs.pop("na_position", "last") + if na_position not in {"first", "last"}: raise ValueError("na_position must be 'first' or 'last'.") @@ -613,7 +612,7 @@ def argsort( # type: ignore[override] else: raise TypeError(f"Unsupported argsort dtype: {type(self._data)}") - return perm + return perm.to_ndarray() def broadcast_arrays(self, *arrays): raise NotImplementedError( diff --git a/arkouda/pandas/extension/_arkouda_string_array.py b/arkouda/pandas/extension/_arkouda_string_array.py index e80138dda8b..1bdf4bef020 100644 --- a/arkouda/pandas/extension/_arkouda_string_array.py +++ b/arkouda/pandas/extension/_arkouda_string_array.py @@ -77,106 +77,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(ak_array(scalars)) - def __getitem__(self, key: Any) -> Any: - """ - Retrieve one or more string values. - - Parameters - ---------- - key : Any - Positional indexer. 
Supports: - * scalar integer positions - * slice objects - * NumPy integer arrays (signed/unsigned) - * NumPy boolean masks - * Python lists of integers / booleans - * Arkouda pdarray indexers (int / uint / bool) - - Returns - ------- - Any - A Python string for scalar access, or a new ArkoudaStringArray - for non-scalar indexers. - - Raises - ------ - TypeError - If ``key`` is a NumPy array with an unsupported dtype (for example, - a floating point or object dtype). - - Examples - -------- - Basic scalar access: - - >>> import arkouda as ak - >>> from arkouda.pandas.extension import ArkoudaStringArray - >>> arr = ArkoudaStringArray(ak.array(["a", "b", "c", "d"])) - >>> arr[1] - np.str_('b') - - Negative indexing: - - >>> arr[-1] - np.str_('d') - - Slice indexing (returns a new ArkoudaStringArray): - - >>> arr[1:3] - ArkoudaStringArray(['b', 'c']) - - NumPy integer array indexing: - - >>> idx = np.array([0, 2], dtype=np.int64) - >>> arr[idx] - ArkoudaStringArray(['a', 'c']) - - NumPy boolean mask: - - >>> mask = np.array([True, False, True, False]) - >>> arr[mask] - ArkoudaStringArray(['a', 'c']) - - Arkouda integer indexer: - - >>> ak_idx = ak.array([3, 1]) - >>> arr[ak_idx] - ArkoudaStringArray(['d', 'b']) - - Empty indexer returns an empty ArkoudaStringArray: - - >>> empty_idx = np.array([], dtype=np.int64) - >>> arr[empty_idx] - ArkoudaStringArray([]) - """ - from arkouda.numpy.pdarraycreation import array as ak_array - from arkouda.numpy.strings import Strings - - # Normalize NumPy indexers to Arkouda pdarrays, mirroring ArkoudaArray.__getitem__ - if isinstance(key, np.ndarray): - if key.dtype == bool: - key = ak_array(key) - elif key.dtype.kind in {"i"}: - # signed integer - key = ak_array(key, dtype="int64") - elif key.dtype.kind in {"u"}: - # unsigned integer - key = ak_array(key, dtype="uint64") - else: - raise TypeError(f"Unsupported numpy index type {key.dtype}") - + def __getitem__(self, key): result = self._data[key] - - # Scalar access: return a plain 
Python str (or scalar) instead of a Strings object if np.isscalar(key): - return result - - # Non-scalar: expect an Arkouda Strings, wrap it - if isinstance(result, Strings): - return ArkoudaStringArray(result) - - # Fallback: if Arkouda returned something array-like but not Strings, - # materialize via ak.array and wrap again as Strings. - return ArkoudaStringArray(ak_array(result)) + if hasattr(result, "to_ndarray"): + return result.to_ndarray()[()] + else: + return result + return ArkoudaStringArray(result) def astype(self, dtype, copy: bool = False): if dtype in (object, np.object_, "object", np.dtype("O")): @@ -227,3 +135,123 @@ def __eq__(self, other): def __repr__(self): return f"ArkoudaStringArray({self._data})" + + def _not_implemented(self, name: str): + raise NotImplementedError(f"`{name}` is not implemented for Arkouda-backed arrays yet.") + + def all(self, *args, **kwargs): + self._not_implemented("all") + + def any(self, *args, **kwargs): + self._not_implemented("any") + + def argpartition(self, *args, **kwargs): + self._not_implemented("argpartition") + + def byteswap(self, *args, **kwargs): + self._not_implemented("byteswap") + + def choose(self, *args, **kwargs): + self._not_implemented("choose") + + def clip(self, *args, **kwargs): + self._not_implemented("clip") + + def compress(self, *args, **kwargs): + self._not_implemented("compress") + + def conj(self, *args, **kwargs): + self._not_implemented("conj") + + def conjugate(self, *args, **kwargs): + self._not_implemented("conjugate") + + def cumprod(self, *args, **kwargs): + self._not_implemented("cumprod") + + def cumsum(self, *args, **kwargs): + self._not_implemented("cumsum") + + def diagonal(self, *args, **kwargs): + self._not_implemented("diagonal") + + def dot(self, *args, **kwargs): + self._not_implemented("dot") + + def dump(self, *args, **kwargs): + self._not_implemented("dump") + + def dumps(self, *args, **kwargs): + self._not_implemented("dumps") + + def fill(self, *args, **kwargs): + 
self._not_implemented("fill") + + def flatten(self, *args, **kwargs): + self._not_implemented("flatten") + + def getfield(self, *args, **kwargs): + self._not_implemented("getfield") + + def item(self, *args, **kwargs): + self._not_implemented("item") + + def max(self, *args, **kwargs): + self._not_implemented("max") + + def mean(self, *args, **kwargs): + self._not_implemented("mean") + + def min(self, *args, **kwargs): + self._not_implemented("min") + + def nonzero(self, *args, **kwargs): + self._not_implemented("nonzero") + + def partition(self, *args, **kwargs): + self._not_implemented("partition") + + def prod(self, *args, **kwargs): + self._not_implemented("prod") + + def put(self, *args, **kwargs): + self._not_implemented("put") + + def resize(self, *args, **kwargs): + self._not_implemented("resize") + + def round(self, *args, **kwargs): + self._not_implemented("round") + + def setfield(self, *args, **kwargs): + self._not_implemented("setfield") + + def setflags(self, *args, **kwargs): + self._not_implemented("setflags") + + def sort(self, *args, **kwargs): + self._not_implemented("sort") + + def std(self, *args, **kwargs): + self._not_implemented("std") + + def sum(self, *args, **kwargs): + self._not_implemented("sum") + + def swapaxes(self, *args, **kwargs): + self._not_implemented("swapaxes") + + def to_device(self, *args, **kwargs): + self._not_implemented("to_device") + + def tobytes(self, *args, **kwargs): + self._not_implemented("tobytes") + + def tofile(self, *args, **kwargs): + self._not_implemented("tofile") + + def trace(self, *args, **kwargs): + self._not_implemented("trace") + + def var(self, *args, **kwargs): + self._not_implemented("var") diff --git a/tests/pandas/extension/arkouda_extension.py b/tests/pandas/extension/arkouda_extension.py index 4e643595608..1503938eef0 100644 --- a/tests/pandas/extension/arkouda_extension.py +++ b/tests/pandas/extension/arkouda_extension.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from numpy.testing 
import assert_equal as np_assert_equal + import arkouda as ak from arkouda.numpy.strings import Strings @@ -16,6 +18,16 @@ class TestArkoudaExtensionArray: + def test_extension_docstrings(self): + import doctest + + from arkouda.pandas.extension import _arkouda_extension_array + + result = doctest.testmod( + _arkouda_extension_array, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE + ) + assert result.failed == 0, f"Doctest failed: {result.failed} failures" + @pytest.fixture(params=["numeric", "strings", "categorical"]) def ea(self, request): """ @@ -47,16 +59,6 @@ def ea(self, request): arr._test_kind = kind return arr - def test_extension_docstrings(self): - import doctest - - from arkouda.pandas.extension import _arkouda_extension_array - - result = doctest.testmod( - _arkouda_extension_array, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE - ) - assert result.failed == 0, f"Doctest failed: {result.failed} failures" - def base_objs(self): """Provide canonical Arkouda objects for reuse in tests.""" nums = ak.array([1, 2, 3, 4]) # pdarray[int64] @@ -358,13 +360,13 @@ def test_concat_largeish_segments_length_only_smoke(self): assert out[1000] == 1000 assert out[-1] == 1499 - def assert_indices(self, perm: pdarray, expected_py_indices): + def assert_indices(self, perm: np.ndarray, expected_py_indices): """Compare returned indices to expected Python list, staying server-side where possible.""" - assert isinstance(perm, pdarray) + assert isinstance(perm, np.ndarray) # Arkouda uses int64 for indices; accept any int dtype but verify it's integer assert np.issubdtype(perm.dtype, np.integer) - exp = ak.array(expected_py_indices) - assert ak.all(perm == exp) + exp = np.array(expected_py_indices) + assert np.all(perm == exp) # ---------- pdarray (float) with NaN handling ---------- @@ -450,116 +452,87 @@ class Dummy: # noinspection PyUnresolvedReferences ea.argsort() - @pytest.mark.parametrize("sort", [False, True]) 
@pytest.mark.parametrize("use_na_sentinel", [True, False]) - def test_factorize_int_basic(self, sort, use_na_sentinel): + def test_factorize_int_basic(self, use_na_sentinel): """ Int array has no NAs; first-appearance order vs sorted uniques; NA sentinel only affects behavior if there are NAs (there aren't here). """ a = ArkoudaArray(ak.array([1, 2, 1, 3])) - codes, uniques = a.factorize(sort=sort, use_na_sentinel=use_na_sentinel) - - if not sort: - # First appearance: uniques [1, 2, 3] - assert_equal(uniques._data, ak.array([1, 2, 3])) - assert_equal(codes, ArkoudaArray(ak.array([0, 1, 0, 2]))) - else: - # Sorted: uniques [1, 2, 3] (same here, but codes recomputed from sorted order) - assert_equal(uniques._data, ak.array([1, 2, 3])) - assert_equal(codes, ArkoudaArray(ak.array([0, 1, 0, 2]))) - - @pytest.mark.parametrize("sort", [False, True]) - def test_factorize_float_with_nan_default_sentinel(self, sort): + codes, uniques = a.factorize(use_na_sentinel=use_na_sentinel) + + # First appearance: uniques [1, 2, 3] + assert_equal(uniques._data, ak.array([1, 2, 3])) + np_assert_equal(codes, np.array([0, 1, 0, 2])) + + def test_factorize_float_with_nan_default_sentinel(self): """Float array treats NaN as missing -> -1 sentinel by default.""" a = ArkoudaArray(ak.array([1.0, np.nan, 1.0, 2.0])) - codes, uniques = a.factorize(sort=sort) + codes, uniques = a.factorize() - if not sort: - # First appearance uniques: [1.0, 2.0] - assert_arkouda_array_equal(uniques._data, ak.array([1.0, 2.0])) - assert_arkouda_array_equal(codes._data, ak.array([0, -1, 0, 1])) - else: - # Sorted uniques: [1.0, 2.0] (same set) - assert_arkouda_array_equal(uniques._data, ak.array([1.0, 2.0])) - assert_arkouda_array_equal(codes._data, ak.array([0, -1, 0, 1])) + # First appearance uniques: [1.0, 2.0] + assert_arkouda_array_equal(uniques._data, ak.array([1.0, 2.0])) + np_assert_equal(codes, np.array([0, -1, 0, 1])) def test_factorize_float_with_nan_no_sentinel(self): """With use_na_sentinel=False, 
NaNs get a valid code == len(uniques).""" a = ArkoudaArray(ak.array([1.0, np.nan, 1.0, 2.0])) - codes, uniques = a.factorize(sort=False, use_na_sentinel=False) + codes, uniques = a.factorize(use_na_sentinel=False) # uniques from first appearance: [1.0, 2.0]; NaN code == 2 assert_arkouda_array_equal(uniques._data, ak.array([1.0, 2.0])) - assert_arkouda_array_equal(codes._data, ak.array([0, 2, 0, 1])) + np_assert_equal(codes, np.array([0, 2, 0, 1])) def test_factorize_float_all_nan(self): """Edge case: all values are NaN -> codes all sentinel, uniques empty.""" a = ArkoudaArray(ak.array([np.nan, np.nan])) codes, uniques = a.factorize() assert_arkouda_array_equal(uniques._data, ak.array([], dtype=float)) - assert_arkouda_array_equal(codes._data, ak.array([-1, -1], dtype=np.int64)) + np_assert_equal(codes, np.array([-1, -1], dtype=np.int64)) - @pytest.mark.parametrize("sort", [False, True]) - def test_factorize_strings_basic(self, sort): + def test_factorize_strings_basic(self): """Strings: no NA handling; empty strings are treated as normal values.""" s = ak.array(["a", "b", "a", "c"]) a = ArkoudaStringArray(s) - codes, uniques = a.factorize(sort=sort) + codes, uniques = a.factorize() - if not sort: - assert_arkouda_array_equal(uniques._data, ak.array(["a", "b", "c"])) - assert_arkouda_array_equal(codes._data, ak.array([0, 1, 0, 2])) - else: - # Sorted: ["a", "b", "c"] -> same result for this set - assert_arkouda_array_equal(uniques._data, ak.array(["a", "b", "c"])) - assert_arkouda_array_equal(codes._data, ak.array([0, 1, 0, 2])) + # First-appearance order of uniques: ["a", "b", "c"] + assert_arkouda_array_equal(uniques._data, ak.array(["a", "b", "c"])) + np_assert_equal(codes, np.array([0, 1, 0, 2])) def test_factorize_strings_with_empty_string(self): """Explicitly ensure "" is treated as a normal value (not missing).""" s = ak.array(["", "x", "", "y"]) a = ArkoudaStringArray(s) - codes, uniques = a.factorize(sort=False) + codes, uniques = a.factorize() 
assert_arkouda_array_equal(uniques._data, ak.array(["", "x", "y"])) - assert_arkouda_array_equal(codes._data, ak.array([0, 1, 0, 2])) + np_assert_equal(codes, np.array([0, 1, 0, 2])) - @pytest.mark.parametrize("sort", [False, True]) - def test_factorize_categorical_basic(self, sort): + def test_factorize_categorical_basic(self): """ Categorical: factorization operates over observed values (not categories table), - honoring first-appearance vs sorted order semantics of the observed data. + honoring first-appearance semantics of the observed data. """ s = ak.array(["red", "blue", "red", "green"]) cat = ak.Categorical(s) # construct from Strings a = ArkoudaCategoricalArray(cat) - codes, uniques = a.factorize(sort=sort) - - if not sort: - # first appearance uniques: ["red", "blue", "green"] - assert_arkouda_array_equal(uniques._data, ak.Categorical(ak.array(["red", "blue", "green"]))) - assert_arkouda_array_equal(codes._data, ak.array([0, 1, 0, 2])) - else: - # sorted uniques: ["blue", "green", "red"] - assert_equal(uniques._data, ak.Categorical(ak.array(["blue", "green", "red"]))) - # remapped codes according to sorted order: - # red->2, blue->0, green->1 - assert_arkouda_array_equal(codes._data, ak.array([2, 0, 2, 1])) - - def test_factorize_stability_first_appearance_vs_sorted(self): + codes, uniques = a.factorize() + + # order of first-appearance: ["red", "blue", "green"] + assert_equal(uniques._data, ak.Categorical(ak.array(["red", "blue", "green"]))) + # codes follow first-appearance order: + # red->0, blue->1, green->2 + np_assert_equal(codes, np.array([0, 1, 0, 2])) + + def test_factorize_stability_first_appearance(self): """Sanity check that switching sort changes code assignments consistently.""" x = ak.array([2, 1, 3, 2]) a = ArkoudaArray(x) - codes_unsorted, uniques_unsorted = a.factorize(sort=False) - codes_sorted, uniques_sorted = a.factorize(sort=True) + codes_unsorted, uniques_unsorted = a.factorize() # First appearance uniques: [2, 1, 3] 
assert_arkouda_array_equal(uniques_unsorted._data, ak.array([2, 1, 3])) - assert_arkouda_array_equal(codes_unsorted._data, ak.array([0, 1, 2, 0])) - - # Sorted uniques: [1, 2, 3] - assert_arkouda_array_equal(uniques_sorted._data, ak.array([1, 2, 3])) - # mapping old->new: 2->1, 1->0, 3->2 => [1,0,2,1] - assert_arkouda_array_equal(codes_sorted._data, ak.array([1, 0, 2, 1])) + np_assert_equal(codes_unsorted, np.array([0, 1, 2, 0])) def test_from_sequence_dispatches_to_correct_subclass(self): """ diff --git a/tests/pandas/extension/dataframe_accessor.py b/tests/pandas/extension/dataframe_accessor.py index 7b7bc46d177..be5469d0606 100644 --- a/tests/pandas/extension/dataframe_accessor.py +++ b/tests/pandas/extension/dataframe_accessor.py @@ -28,7 +28,7 @@ class TestDataFrameAccessorInternals: - def test_extension_docstrings(self): + def test_dataframe_extension_docstrings(self): import doctest from arkouda.pandas.extension import _dataframe_accessor