diff --git a/arkouda/pandas/extension/_arkouda_extension_array.py b/arkouda/pandas/extension/_arkouda_extension_array.py index 5cda4787a36..313095b4a55 100644 --- a/arkouda/pandas/extension/_arkouda_extension_array.py +++ b/arkouda/pandas/extension/_arkouda_extension_array.py @@ -48,6 +48,7 @@ import numpy as np +from numpy.typing import NDArray from pandas.api.extensions import ExtensionArray from arkouda.numpy.dtypes import all_scalars @@ -349,45 +350,47 @@ def take(self, indexer, fill_value=None, allow_fill=False): gathered = ak.where(mask, fv, self._data[idx_fix]) return type(self)(gathered) - def factorize( # type: ignore[override] - self, use_na_sentinel=True, sort=False, **kwargs - ) -> Tuple["ArkoudaExtensionArray", "ArkoudaExtensionArray"]: + def factorize(self, use_na_sentinel=True) -> Tuple[NDArray[np.intp], "ArkoudaExtensionArray"]: """ - Encode the values of this array as integer codes and uniques, - similar to :func:`pandas.factorize`, but implemented with Arkouda. + Encode the values of this array as integer codes and unique values. + + This is similar to :func:`pandas.factorize`, but the grouping/factorization + work is performed in Arkouda. The returned ``codes`` are a NumPy array for + pandas compatibility, while ``uniques`` are returned as an ExtensionArray + of the same type as ``self``. Each distinct non-missing value is assigned a unique integer code. - Missing values (NaN in floating dtypes) are encoded as -1 by default. + For floating dtypes, ``NaN`` is treated as missing; for all other dtypes, + no values are considered missing. Parameters ---------- use_na_sentinel : bool, default True - If True, missing values are encoded as -1 in the codes array. - If False, missing values are assigned a valid code equal to - ``len(uniques)``. - sort : bool, default False - Whether to sort the unique values. If False, the unique values - appear in the order of first appearance in the array. 
If True, - the unique values are sorted, and codes are assigned accordingly. - **kwargs - Ignored for compatibility. + If True, missing values are encoded as ``-1`` in the returned codes. + If False, missing values are assigned the code ``len(uniques)``. + (Missingness is only detected for floating dtypes via ``NaN``.) Returns ------- - Tuple[pdarray, ArkoudaExtensionArray] + (numpy.ndarray, ExtensionArray) A pair ``(codes, uniques)`` where: - - ``codes`` is a NumPy ``int64`` array of factor labels, one per element. - Missing values are ``-1`` if ``use_na_sentinel=True``; otherwise they - receive the code ``len(uniques)``. - - ``uniques`` is a NumPy array of the unique values. + + * ``codes`` is a 1D NumPy array of dtype ``np.intp`` with the same length + as this array, containing the factor codes for each element. + * ``uniques`` is an ExtensionArray containing the unique (non-missing) + values, with the same extension type as ``self``. + + If ``use_na_sentinel=True``, missing values in ``codes`` are ``-1``. + Otherwise they receive the code ``len(uniques)``. Notes ----- * Only floating-point dtypes treat ``NaN`` as missing; for other dtypes, - no values are considered missing. - * This method executes all grouping and factorization in Arkouda, - returning results as NumPy arrays for compatibility with pandas. - * Unlike pandas, string/None/null handling is not yet unified. + all values are treated as non-missing. + * ``uniques`` are constructed from Arkouda's unique keys and returned as + ``type(self)(uniques_ak)`` so that pandas internals (e.g. ``groupby``) + can treat them as an ExtensionArray. + * String/None/null missing-value behavior is not yet unified with pandas. 
Examples -------- @@ -396,7 +399,7 @@ def factorize( # type: ignore[override] >>> arr = ArkoudaArray(ak.array([1, 2, 1, 3])) >>> codes, uniques = arr.factorize() >>> codes - ArkoudaArray([0 1 0 2]) + array([0, 1, 0, 2]) >>> uniques ArkoudaArray([1 2 3]) """ @@ -407,7 +410,6 @@ def factorize( # type: ignore[override] from arkouda.numpy.pdarraycreation import array as ak_array from arkouda.numpy.sorting import argsort from arkouda.numpy.strings import Strings - from arkouda.pandas.extension import ArkoudaArray from arkouda.pandas.groupbyclass import GroupBy # Arkouda array backing @@ -425,7 +427,7 @@ def factorize( # type: ignore[override] sent = -1 if use_na_sentinel else 0 from arkouda.numpy.pdarraycreation import full as ak_full - return ArkoudaArray(ak_full(n, sent, dtype=int64)), type(self)( + return ak_full(n, sent, dtype=int64).to_ndarray(), type(self)( ak_array([], dtype=self.to_numpy().dtype) ) @@ -437,28 +439,16 @@ def factorize( # type: ignore[override] uniques_ak = concatenate(uniques_ak) - if sort: - # Keys already sorted; group id -> 0..k-1 - groupid_to_code = arange(uniques_ak.size, dtype=int64) - - # Work around to account GroupBy not sorting Categorical properly - if isinstance(arr, Categorical): - perm = uniques_ak.argsort() - # Inverse argsort: - groupid_to_code[perm] = arange(uniques_ak.size, dtype=int64) - uniques_ak = uniques_ak[perm] - - else: - # First-appearance order - _keys, first_idx_per_group = g.min(arange(arr_nn.size, dtype=int64)) - order = argsort(first_idx_per_group) + # First-appearance order + _keys, first_idx_per_group = g.min(arange(arr_nn.size, dtype=int64)) + order = argsort(first_idx_per_group) - # Reorder uniques by first appearance - uniques_ak = uniques_ak[order] + # Reorder uniques by first appearance + uniques_ak = uniques_ak[order] - # Map group_id -> code in first-appearance order - groupid_to_code = zeros(order.size, dtype=int64) - groupid_to_code[order] = arange(order.size, dtype=int64) + # Map group_id -> code in 
first-appearance order + groupid_to_code = zeros(order.size, dtype=int64) + groupid_to_code[order] = arange(order.size, dtype=int64) # Per-element codes on the non-NA slice codes_nn = g.broadcast(groupid_to_code) @@ -468,7 +458,9 @@ def factorize( # type: ignore[override] codes_ak = full(n, sentinel, dtype=int64) codes_ak[non_na] = codes_nn - return ArkoudaArray(codes_ak), type(self)(uniques_ak) + codes_np = codes_ak.to_ndarray().astype(np.intp, copy=False) + + return codes_np, type(self)(uniques_ak) # In each EA def _values_for_factorize(self): @@ -527,42 +519,45 @@ def to_ndarray(self) -> np.ndarray: """ return self._data.to_ndarray() - def argsort( # type: ignore[override] + def argsort( self, *, ascending: bool = True, - kind="quicksort", - na_position: str = "last", - **kwargs, - ) -> pdarray: + kind: str = "quicksort", + **kwargs: object, + ) -> NDArray[np.intp]: """ Return the indices that would sort the array. - This method computes the permutation indices that would sort the - underlying Arkouda data. It aligns with the pandas ``ExtensionArray`` - contract, returning a 1-D ``pdarray`` of integer indices suitable for - reordering the array via ``take`` or ``iloc``. NaN values are placed - either at the beginning or end of the result depending on - ``na_position``. + This method computes the permutation indices that would sort the underlying + Arkouda data and returns them as a NumPy array, in accordance with the + pandas ``ExtensionArray`` contract. The indices can be used to reorder the + array via ``take`` or ``iloc``. + + For floating-point data, ``NaN`` values are handled according to the + ``na_position`` keyword argument. Parameters ---------- ascending : bool, default True - If True, sort values in ascending order. If False, sort in - descending order. + If True, sort values in ascending order. If False, sort in descending + order. kind : str, default "quicksort" - Sorting algorithm. 
Present for API compatibility with NumPy and - pandas but currently ignored. - na_position : {"first", "last"}, default "last" - Where to place NaN values in the sorted result. Currently only implemented for pdarray. - For Strings and Categorical will have no effect. - **kwargs : Any - Additional keyword arguments for compatibility; ignored. + Sorting algorithm. Present for API compatibility with NumPy and pandas + but currently ignored. + **kwargs + Additional keyword arguments for compatibility. Supported keyword: + + * ``na_position`` : {"first", "last"}, default "last" + Where to place ``NaN`` values in the sorted result. This option is + currently only applied for floating-point ``pdarray`` data; for + ``Strings`` and ``Categorical`` data it has no effect. Returns ------- - pdarray - Integer indices (``int64``) that would sort the array. + numpy.ndarray + A 1D NumPy array of dtype ``np.intp`` containing the indices that would + sort the array. Raises ------ @@ -573,11 +568,12 @@ def argsort( # type: ignore[override] Notes ----- - - Supports Arkouda ``pdarray``, ``Strings``, and ``Categorical`` data. - - Floating-point arrays have NaNs repositioned according to + * Supports Arkouda ``pdarray``, ``Strings``, and ``Categorical`` data. + * For floating-point arrays, ``NaN`` values are repositioned according to ``na_position``. - - This method does not move data to the client; the computation - occurs on the Arkouda server. + * The sorting computation occurs on the Arkouda server, but the resulting + permutation indices are materialized on the client as a NumPy array, as + required by pandas internals. 
Examples -------- @@ -585,9 +581,9 @@ def argsort( # type: ignore[override] >>> from arkouda.pandas.extension import ArkoudaArray >>> a = ArkoudaArray(ak.array([3.0, float("nan"), 1.0])) >>> a.argsort() # NA last by default - array([2 0 1]) + array([2, 0, 1]) >>> a.argsort(na_position="first") - array([1 2 0]) + array([1, 2, 0]) """ from arkouda.numpy import argsort from arkouda.numpy.numeric import isnan as ak_isnan @@ -596,6 +592,9 @@ def argsort( # type: ignore[override] from arkouda.numpy.util import is_float from arkouda.pandas.categorical import Categorical + # Extract na_position from kwargs + na_position = kwargs.pop("na_position", "last") + if na_position not in {"first", "last"}: raise ValueError("na_position must be 'first' or 'last'.") @@ -613,7 +612,7 @@ def argsort( # type: ignore[override] else: raise TypeError(f"Unsupported argsort dtype: {type(self._data)}") - return perm + return perm.to_ndarray() def broadcast_arrays(self, *arrays): raise NotImplementedError( diff --git a/tests/pandas/extension/arkouda_extension.py b/tests/pandas/extension/arkouda_extension.py index 4e643595608..1503938eef0 100644 --- a/tests/pandas/extension/arkouda_extension.py +++ b/tests/pandas/extension/arkouda_extension.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from numpy.testing import assert_equal as np_assert_equal + import arkouda as ak from arkouda.numpy.strings import Strings @@ -16,6 +18,16 @@ class TestArkoudaExtensionArray: + def test_extension_docstrings(self): + import doctest + + from arkouda.pandas.extension import _arkouda_extension_array + + result = doctest.testmod( + _arkouda_extension_array, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE + ) + assert result.failed == 0, f"Doctest failed: {result.failed} failures" + @pytest.fixture(params=["numeric", "strings", "categorical"]) def ea(self, request): """ @@ -47,16 +59,6 @@ def ea(self, request): arr._test_kind = kind return arr - def test_extension_docstrings(self): - import doctest - 
- from arkouda.pandas.extension import _arkouda_extension_array - - result = doctest.testmod( - _arkouda_extension_array, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE - ) - assert result.failed == 0, f"Doctest failed: {result.failed} failures" - def base_objs(self): """Provide canonical Arkouda objects for reuse in tests.""" nums = ak.array([1, 2, 3, 4]) # pdarray[int64] @@ -358,13 +360,13 @@ def test_concat_largeish_segments_length_only_smoke(self): assert out[1000] == 1000 assert out[-1] == 1499 - def assert_indices(self, perm: pdarray, expected_py_indices): + def assert_indices(self, perm: np.ndarray, expected_py_indices): """Compare returned indices to expected Python list, staying server-side where possible.""" - assert isinstance(perm, pdarray) + assert isinstance(perm, np.ndarray) # Arkouda uses int64 for indices; accept any int dtype but verify it's integer assert np.issubdtype(perm.dtype, np.integer) - exp = ak.array(expected_py_indices) - assert ak.all(perm == exp) + exp = np.array(expected_py_indices) + assert np.all(perm == exp) # ---------- pdarray (float) with NaN handling ---------- @@ -450,116 +452,87 @@ class Dummy: # noinspection PyUnresolvedReferences ea.argsort() - @pytest.mark.parametrize("sort", [False, True]) @pytest.mark.parametrize("use_na_sentinel", [True, False]) - def test_factorize_int_basic(self, sort, use_na_sentinel): + def test_factorize_int_basic(self, use_na_sentinel): """ Int array has no NAs; first-appearance order vs sorted uniques; NA sentinel only affects behavior if there are NAs (there aren't here). 
""" a = ArkoudaArray(ak.array([1, 2, 1, 3])) - codes, uniques = a.factorize(sort=sort, use_na_sentinel=use_na_sentinel) - - if not sort: - # First appearance: uniques [1, 2, 3] - assert_equal(uniques._data, ak.array([1, 2, 3])) - assert_equal(codes, ArkoudaArray(ak.array([0, 1, 0, 2]))) - else: - # Sorted: uniques [1, 2, 3] (same here, but codes recomputed from sorted order) - assert_equal(uniques._data, ak.array([1, 2, 3])) - assert_equal(codes, ArkoudaArray(ak.array([0, 1, 0, 2]))) - - @pytest.mark.parametrize("sort", [False, True]) - def test_factorize_float_with_nan_default_sentinel(self, sort): + codes, uniques = a.factorize(use_na_sentinel=use_na_sentinel) + + # First appearance: uniques [1, 2, 3] + assert_equal(uniques._data, ak.array([1, 2, 3])) + np_assert_equal(codes, np.array([0, 1, 0, 2])) + + def test_factorize_float_with_nan_default_sentinel(self): """Float array treats NaN as missing -> -1 sentinel by default.""" a = ArkoudaArray(ak.array([1.0, np.nan, 1.0, 2.0])) - codes, uniques = a.factorize(sort=sort) + codes, uniques = a.factorize() - if not sort: - # First appearance uniques: [1.0, 2.0] - assert_arkouda_array_equal(uniques._data, ak.array([1.0, 2.0])) - assert_arkouda_array_equal(codes._data, ak.array([0, -1, 0, 1])) - else: - # Sorted uniques: [1.0, 2.0] (same set) - assert_arkouda_array_equal(uniques._data, ak.array([1.0, 2.0])) - assert_arkouda_array_equal(codes._data, ak.array([0, -1, 0, 1])) + # First appearance uniques: [1.0, 2.0] + assert_arkouda_array_equal(uniques._data, ak.array([1.0, 2.0])) + np_assert_equal(codes, np.array([0, -1, 0, 1])) def test_factorize_float_with_nan_no_sentinel(self): """With use_na_sentinel=False, NaNs get a valid code == len(uniques).""" a = ArkoudaArray(ak.array([1.0, np.nan, 1.0, 2.0])) - codes, uniques = a.factorize(sort=False, use_na_sentinel=False) + codes, uniques = a.factorize(use_na_sentinel=False) # uniques from first appearance: [1.0, 2.0]; NaN code == 2 assert_arkouda_array_equal(uniques._data, 
ak.array([1.0, 2.0])) - assert_arkouda_array_equal(codes._data, ak.array([0, 2, 0, 1])) + np_assert_equal(codes, np.array([0, 2, 0, 1])) def test_factorize_float_all_nan(self): """Edge case: all values are NaN -> codes all sentinel, uniques empty.""" a = ArkoudaArray(ak.array([np.nan, np.nan])) codes, uniques = a.factorize() assert_arkouda_array_equal(uniques._data, ak.array([], dtype=float)) - assert_arkouda_array_equal(codes._data, ak.array([-1, -1], dtype=np.int64)) + np_assert_equal(codes, np.array([-1, -1], dtype=np.int64)) - @pytest.mark.parametrize("sort", [False, True]) - def test_factorize_strings_basic(self, sort): + def test_factorize_strings_basic(self): """Strings: no NA handling; empty strings are treated as normal values.""" s = ak.array(["a", "b", "a", "c"]) a = ArkoudaStringArray(s) - codes, uniques = a.factorize(sort=sort) + codes, uniques = a.factorize() - if not sort: - assert_arkouda_array_equal(uniques._data, ak.array(["a", "b", "c"])) - assert_arkouda_array_equal(codes._data, ak.array([0, 1, 0, 2])) - else: - # Sorted: ["a", "b", "c"] -> same result for this set - assert_arkouda_array_equal(uniques._data, ak.array(["a", "b", "c"])) - assert_arkouda_array_equal(codes._data, ak.array([0, 1, 0, 2])) + # First-appearance order of uniques: ["a", "b", "c"] + assert_arkouda_array_equal(uniques._data, ak.array(["a", "b", "c"])) + np_assert_equal(codes, np.array([0, 1, 0, 2])) def test_factorize_strings_with_empty_string(self): """Explicitly ensure "" is treated as a normal value (not missing).""" s = ak.array(["", "x", "", "y"]) a = ArkoudaStringArray(s) - codes, uniques = a.factorize(sort=False) + codes, uniques = a.factorize() assert_arkouda_array_equal(uniques._data, ak.array(["", "x", "y"])) - assert_arkouda_array_equal(codes._data, ak.array([0, 1, 0, 2])) + np_assert_equal(codes, np.array([0, 1, 0, 2])) - @pytest.mark.parametrize("sort", [False, True]) - def test_factorize_categorical_basic(self, sort): + def 
test_factorize_categorical_basic(self): """ Categorical: factorization operates over observed values (not categories table), - honoring first-appearance vs sorted order semantics of the observed data. + honoring first-appearance semantics of the observed data. """ s = ak.array(["red", "blue", "red", "green"]) cat = ak.Categorical(s) # construct from Strings a = ArkoudaCategoricalArray(cat) - codes, uniques = a.factorize(sort=sort) - - if not sort: - # first appearance uniques: ["red", "blue", "green"] - assert_arkouda_array_equal(uniques._data, ak.Categorical(ak.array(["red", "blue", "green"]))) - assert_arkouda_array_equal(codes._data, ak.array([0, 1, 0, 2])) - else: - # sorted uniques: ["blue", "green", "red"] - assert_equal(uniques._data, ak.Categorical(ak.array(["blue", "green", "red"]))) - # remapped codes according to sorted order: - # red->2, blue->0, green->1 - assert_arkouda_array_equal(codes._data, ak.array([2, 0, 2, 1])) - - def test_factorize_stability_first_appearance_vs_sorted(self): + codes, uniques = a.factorize() + + # order of first-appearance: ["red", "blue", "green"] + assert_equal(uniques._data, ak.Categorical(ak.array(["red", "blue", "green"]))) + # codes assigned in first-appearance order: + # red->0, blue->1, green->2 + np_assert_equal(codes, np.array([0, 1, 0, 2])) + + def test_factorize_stability_first_appearance(self): """Sanity check that switching sort changes code assignments consistently.""" x = ak.array([2, 1, 3, 2]) a = ArkoudaArray(x) - codes_unsorted, uniques_unsorted = a.factorize(sort=False) - codes_sorted, uniques_sorted = a.factorize(sort=True) + codes_unsorted, uniques_unsorted = a.factorize() # First appearance uniques: [2, 1, 3] assert_arkouda_array_equal(uniques_unsorted._data, ak.array([2, 1, 3])) - assert_arkouda_array_equal(codes_unsorted._data, ak.array([0, 1, 2, 0])) - - # Sorted uniques: [1, 2, 3] - assert_arkouda_array_equal(uniques_sorted._data, ak.array([1, 2, 3])) - # mapping old->new: 2->1, 1->0, 3->2 => 
[1,0,2,1] - assert_arkouda_array_equal(codes_sorted._data, ak.array([1, 0, 2, 1])) + np_assert_equal(codes_unsorted, np.array([0, 1, 2, 0])) def test_from_sequence_dispatches_to_correct_subclass(self): """ diff --git a/tests/pandas/extension/dataframe_accessor.py b/tests/pandas/extension/dataframe_accessor.py index 7b7bc46d177..be5469d0606 100644 --- a/tests/pandas/extension/dataframe_accessor.py +++ b/tests/pandas/extension/dataframe_accessor.py @@ -28,7 +28,7 @@ class TestDataFrameAccessorInternals: - def test_extension_docstrings(self): + def test_dataframe_extension_docstrings(self): import doctest from arkouda.pandas.extension import _dataframe_accessor