Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 76 additions & 77 deletions arkouda/pandas/extension/_arkouda_extension_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@

import numpy as np

from numpy.typing import NDArray
from pandas.api.extensions import ExtensionArray

from arkouda.numpy.dtypes import all_scalars
Expand Down Expand Up @@ -349,45 +350,47 @@ def take(self, indexer, fill_value=None, allow_fill=False):
gathered = ak.where(mask, fv, self._data[idx_fix])
return type(self)(gathered)

def factorize( # type: ignore[override]
self, use_na_sentinel=True, sort=False, **kwargs
) -> Tuple["ArkoudaExtensionArray", "ArkoudaExtensionArray"]:
def factorize(self, use_na_sentinel=True) -> Tuple[NDArray[np.intp], "ArkoudaExtensionArray"]:
"""
Encode the values of this array as integer codes and uniques,
similar to :func:`pandas.factorize`, but implemented with Arkouda.
Encode the values of this array as integer codes and unique values.

This is similar to :func:`pandas.factorize`, but the grouping/factorization
work is performed in Arkouda. The returned ``codes`` are a NumPy array for
pandas compatibility, while ``uniques`` are returned as an ExtensionArray
of the same type as ``self``.

Each distinct non-missing value is assigned a unique integer code.
Missing values (NaN in floating dtypes) are encoded as -1 by default.
For floating dtypes, ``NaN`` is treated as missing; for all other dtypes,
no values are considered missing.

Parameters
----------
use_na_sentinel : bool, default True
If True, missing values are encoded as -1 in the codes array.
If False, missing values are assigned a valid code equal to
``len(uniques)``.
sort : bool, default False
Whether to sort the unique values. If False, the unique values
appear in the order of first appearance in the array. If True,
the unique values are sorted, and codes are assigned accordingly.
**kwargs
Ignored for compatibility.
If True, missing values are encoded as ``-1`` in the returned codes.
If False, missing values are assigned the code ``len(uniques)``.
(Missingness is only detected for floating dtypes via ``NaN``.)

Returns
-------
Tuple[pdarray, ArkoudaExtensionArray]
(numpy.ndarray, ExtensionArray)
A pair ``(codes, uniques)`` where:
- ``codes`` is a NumPy ``int64`` array of factor labels, one per element.
Missing values are ``-1`` if ``use_na_sentinel=True``; otherwise they
receive the code ``len(uniques)``.
- ``uniques`` is a NumPy array of the unique values.

* ``codes`` is a 1D NumPy array of dtype ``np.intp`` with the same length
as this array, containing the factor codes for each element.
* ``uniques`` is an ExtensionArray containing the unique (non-missing)
values, with the same extension type as ``self``.

If ``use_na_sentinel=True``, missing values in ``codes`` are ``-1``.
Otherwise they receive the code ``len(uniques)``.

Notes
-----
* Only floating-point dtypes treat ``NaN`` as missing; for other dtypes,
no values are considered missing.
* This method executes all grouping and factorization in Arkouda,
returning results as NumPy arrays for compatibility with pandas.
* Unlike pandas, string/None/null handling is not yet unified.
all values are treated as non-missing.
* ``uniques`` are constructed from Arkouda's unique keys and returned as
``type(self)(uniques_ak)`` so that pandas internals (e.g. ``groupby``)
can treat them as an ExtensionArray.
* String/None/null missing-value behavior is not yet unified with pandas.

Examples
--------
Expand All @@ -396,7 +399,7 @@ def factorize( # type: ignore[override]
>>> arr = ArkoudaArray(ak.array([1, 2, 1, 3]))
>>> codes, uniques = arr.factorize()
>>> codes
ArkoudaArray([0 1 0 2])
array([0, 1, 0, 2])
>>> uniques
ArkoudaArray([1 2 3])
"""
Expand All @@ -407,7 +410,6 @@ def factorize( # type: ignore[override]
from arkouda.numpy.pdarraycreation import array as ak_array
from arkouda.numpy.sorting import argsort
from arkouda.numpy.strings import Strings
from arkouda.pandas.extension import ArkoudaArray
from arkouda.pandas.groupbyclass import GroupBy

# Arkouda array backing
Expand All @@ -425,7 +427,7 @@ def factorize( # type: ignore[override]
sent = -1 if use_na_sentinel else 0
from arkouda.numpy.pdarraycreation import full as ak_full

return ArkoudaArray(ak_full(n, sent, dtype=int64)), type(self)(
return ak_full(n, sent, dtype=int64).to_ndarray(), type(self)(
ak_array([], dtype=self.to_numpy().dtype)
)

Expand All @@ -437,28 +439,16 @@ def factorize( # type: ignore[override]

uniques_ak = concatenate(uniques_ak)

if sort:
# Keys already sorted; group id -> 0..k-1
groupid_to_code = arange(uniques_ak.size, dtype=int64)

# Workaround to account for GroupBy not sorting Categorical properly
if isinstance(arr, Categorical):
perm = uniques_ak.argsort()
# Inverse argsort:
groupid_to_code[perm] = arange(uniques_ak.size, dtype=int64)
uniques_ak = uniques_ak[perm]

else:
# First-appearance order
_keys, first_idx_per_group = g.min(arange(arr_nn.size, dtype=int64))
order = argsort(first_idx_per_group)
# First-appearance order
_keys, first_idx_per_group = g.min(arange(arr_nn.size, dtype=int64))
order = argsort(first_idx_per_group)

# Reorder uniques by first appearance
uniques_ak = uniques_ak[order]
# Reorder uniques by first appearance
uniques_ak = uniques_ak[order]

# Map group_id -> code in first-appearance order
groupid_to_code = zeros(order.size, dtype=int64)
groupid_to_code[order] = arange(order.size, dtype=int64)
# Map group_id -> code in first-appearance order
groupid_to_code = zeros(order.size, dtype=int64)
groupid_to_code[order] = arange(order.size, dtype=int64)

# Per-element codes on the non-NA slice
codes_nn = g.broadcast(groupid_to_code)
Expand All @@ -468,7 +458,9 @@ def factorize( # type: ignore[override]
codes_ak = full(n, sentinel, dtype=int64)
codes_ak[non_na] = codes_nn

return ArkoudaArray(codes_ak), type(self)(uniques_ak)
codes_np = codes_ak.to_ndarray().astype(np.intp, copy=False)

return codes_np, type(self)(uniques_ak)

# In each EA
def _values_for_factorize(self):
Expand Down Expand Up @@ -527,42 +519,45 @@ def to_ndarray(self) -> np.ndarray:
"""
return self._data.to_ndarray()

def argsort( # type: ignore[override]
def argsort(
self,
*,
ascending: bool = True,
kind="quicksort",
na_position: str = "last",
**kwargs,
) -> pdarray:
kind: str = "quicksort",
**kwargs: object,
) -> NDArray[np.intp]:
"""
Return the indices that would sort the array.

This method computes the permutation indices that would sort the
underlying Arkouda data. It aligns with the pandas ``ExtensionArray``
contract, returning a 1-D ``pdarray`` of integer indices suitable for
reordering the array via ``take`` or ``iloc``. NaN values are placed
either at the beginning or end of the result depending on
``na_position``.
This method computes the permutation indices that would sort the underlying
Arkouda data and returns them as a NumPy array, in accordance with the
pandas ``ExtensionArray`` contract. The indices can be used to reorder the
array via ``take`` or ``iloc``.

For floating-point data, ``NaN`` values are handled according to the
``na_position`` keyword argument.

Parameters
----------
ascending : bool, default True
If True, sort values in ascending order. If False, sort in
descending order.
If True, sort values in ascending order. If False, sort in descending
order.
kind : str, default "quicksort"
Sorting algorithm. Present for API compatibility with NumPy and
pandas but currently ignored.
na_position : {"first", "last"}, default "last"
Where to place NaN values in the sorted result. Currently only implemented for pdarray;
for Strings and Categorical it has no effect.
**kwargs : Any
Additional keyword arguments for compatibility; ignored.
Sorting algorithm. Present for API compatibility with NumPy and pandas
but currently ignored.
**kwargs
Additional keyword arguments for compatibility. Supported keyword:

* ``na_position`` : {"first", "last"}, default "last"
Where to place ``NaN`` values in the sorted result. This option is
currently only applied for floating-point ``pdarray`` data; for
``Strings`` and ``Categorical`` data it has no effect.

Returns
-------
pdarray
Integer indices (``int64``) that would sort the array.
numpy.ndarray
A 1D NumPy array of dtype ``np.intp`` containing the indices that would
sort the array.

Raises
------
Expand All @@ -573,21 +568,22 @@ def argsort( # type: ignore[override]

Notes
-----
- Supports Arkouda ``pdarray``, ``Strings``, and ``Categorical`` data.
- Floating-point arrays have NaNs repositioned according to
* Supports Arkouda ``pdarray``, ``Strings``, and ``Categorical`` data.
* For floating-point arrays, ``NaN`` values are repositioned according to
``na_position``.
- This method does not move data to the client; the computation
occurs on the Arkouda server.
* The sorting computation occurs on the Arkouda server, but the resulting
permutation indices are materialized on the client as a NumPy array, as
required by pandas internals.

Examples
--------
>>> import arkouda as ak
>>> from arkouda.pandas.extension import ArkoudaArray
>>> a = ArkoudaArray(ak.array([3.0, float("nan"), 1.0]))
>>> a.argsort() # NA last by default
array([2 0 1])
array([2, 0, 1])
>>> a.argsort(na_position="first")
array([1 2 0])
array([1, 2, 0])
"""
from arkouda.numpy import argsort
from arkouda.numpy.numeric import isnan as ak_isnan
Expand All @@ -596,6 +592,9 @@ def argsort( # type: ignore[override]
from arkouda.numpy.util import is_float
from arkouda.pandas.categorical import Categorical

# Extract na_position from kwargs
na_position = kwargs.pop("na_position", "last")

if na_position not in {"first", "last"}:
raise ValueError("na_position must be 'first' or 'last'.")

Expand All @@ -613,7 +612,7 @@ def argsort( # type: ignore[override]
else:
raise TypeError(f"Unsupported argsort dtype: {type(self._data)}")

return perm
return perm.to_ndarray()

def broadcast_arrays(self, *arrays):
raise NotImplementedError(
Expand Down
Loading