diff --git a/arkouda/__init__.py b/arkouda/__init__.py index 81c351d9913..26b6c4154b0 100644 --- a/arkouda/__init__.py +++ b/arkouda/__init__.py @@ -189,7 +189,6 @@ format_float_positional, format_float_scientific, format_parser, - from_series, full, full_like, get_byteorder, @@ -364,6 +363,7 @@ Row, Series, compute_join_size, + from_series, gen_ranges, join, join_on_eq_with_dt, diff --git a/arkouda/numpy/__init__.py b/arkouda/numpy/__init__.py index 2545bf69a61..da733789a41 100644 --- a/arkouda/numpy/__init__.py +++ b/arkouda/numpy/__init__.py @@ -274,7 +274,6 @@ arange, array, bigint_from_uint_arrays, - from_series, full, full_like, linspace, diff --git a/arkouda/numpy/pdarraycreation.py b/arkouda/numpy/pdarraycreation.py index 5339677a822..dbba185ec40 100644 --- a/arkouda/numpy/pdarraycreation.py +++ b/arkouda/numpy/pdarraycreation.py @@ -25,7 +25,6 @@ NUMBER_FORMAT_STRINGS, DTypes, NumericDTypes, - SeriesDTypes, bigint, bool_scalars, float64, @@ -65,109 +64,12 @@ "standard_normal", "random_strings_uniform", "random_strings_lognormal", - "from_series", "bigint_from_uint_arrays", "promote_to_common_dtype", "scalar_array", ] -@typechecked -def from_series(series: pd.Series, dtype: Optional[Union[type, str]] = None) -> Union[pdarray, Strings]: - """ - Converts a Pandas Series to an Arkouda pdarray or Strings object. If - dtype is None, the dtype is inferred from the Pandas Series. Otherwise, - the dtype parameter is set if the dtype of the Pandas Series is to be - overridden or is unknown (for example, in situations where the Series - dtype is object). 
- - Parameters - ---------- - series : Pandas Series - The Pandas Series with a dtype of bool, float64, int64, or string - dtype : Optional[type] - The valid dtype types are np.bool, np.float64, np.int64, and np.str - - Returns - ------- - Union[pdarray,Strings] - - Raises - ------ - TypeError - Raised if series is not a Pandas Series object - ValueError - Raised if the Series dtype is not bool, float64, int64, string, datetime, or timedelta - - Examples - -------- - >>> import arkouda as ak - >>> np.random.seed(1701) - >>> ak.from_series(pd.Series(np.random.randint(0,10,5))) - array([4 3 3 5 0]) - - >>> ak.from_series(pd.Series(['1', '2', '3', '4', '5']),dtype=np.int64) - array([1 2 3 4 5]) - - >>> np.random.seed(1701) - >>> ak.from_series(pd.Series(np.random.uniform(low=0.0,high=1.0,size=3))) - array([0.089433234324597599 0.1153776854774361 0.51874393620990389]) - - >>> ak.from_series( - ... pd.Series([ - ... '0.57600036956445599', - ... '0.41619265571741659', - ... '0.6615356693784662', - ... ]), - ... dtype=np.float64, - ... ) - array([0.57600036956445599 0.41619265571741659 0.6615356693784662]) - - >>> np.random.seed(1864) - >>> ak.from_series(pd.Series(np.random.choice([True, False],size=5))) - array([True True True False False]) - - >>> ak.from_series(pd.Series(['True', 'False', 'False', 'True', 'True']), dtype=bool) - array([True True True True True]) - - >>> ak.from_series(pd.Series(['a', 'b', 'c', 'd', 'e'], dtype="string")) - array(['a', 'b', 'c', 'd', 'e']) - - >>> ak.from_series(pd.Series(pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01')]))) - array([1514764800000000000 1514764800000000000]) - - Notes - ----- - The supported datatypes are bool, float64, int64, string, and datetime64[ns]. The - data type is either inferred from the the Series or is set via the dtype parameter. - - Series of datetime or timedelta are converted to Arkouda arrays of dtype int64 (nanoseconds) - - A Pandas Series containing strings has a dtype of object. 
Arkouda assumes the Series - contains strings and sets the dtype to str - """ - if not dtype: - dt = series.dtype.name - else: - dt = str(dtype) - try: - """ - If the Series has a object dtype, set dtype to string to comply with method - signature that does not require a dtype; this is required because Pandas can infer - non-str dtypes from the input np or Python array. - """ - if dt == "object": - dt = "string" - - n_array = series.to_numpy(dtype=SeriesDTypes[dt]) # type: ignore - except KeyError: - raise ValueError( - f"dtype {dt} is unsupported. Supported dtypes are bool, float64, int64, string, " - f"datetime64[ns], and timedelta64[ns]" - ) - return array(n_array) - - def _deepcopy(a: pdarray) -> pdarray: from arkouda.client import generic_msg from arkouda.numpy.pdarrayclass import create_pdarray diff --git a/arkouda/numpy/timeclass.py b/arkouda/numpy/timeclass.py index 565251119e8..d2bb33e463b 100644 --- a/arkouda/numpy/timeclass.py +++ b/arkouda/numpy/timeclass.py @@ -95,7 +95,7 @@ class _AbstractBaseTime(pdarray): def __init__(self, pda, unit: str = _BASE_UNIT): from arkouda.numpy import cast as akcast - from arkouda.numpy.pdarraycreation import from_series + from arkouda.pandas.conversion import from_series if isinstance(pda, Datetime) or isinstance(pda, Timedelta): self.unit: str = pda.unit diff --git a/arkouda/pandas/__init__.py b/arkouda/pandas/__init__.py index 4ae52095b31..3cc7aae1108 100644 --- a/arkouda/pandas/__init__.py +++ b/arkouda/pandas/__init__.py @@ -17,3 +17,4 @@ from arkouda.pandas.row import Row from arkouda.pandas.series import Series from arkouda.pandas.typing import ArkoudaArrayLike +from arkouda.pandas.conversion import from_series diff --git a/arkouda/pandas/conversion.py b/arkouda/pandas/conversion.py new file mode 100644 index 00000000000..d27f5ebe73c --- /dev/null +++ b/arkouda/pandas/conversion.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional, TypeVar, Union + +import pandas as 
pd + +from typeguard import typechecked + +from arkouda.numpy.dtypes import SeriesDTypes +from arkouda.numpy.pdarrayclass import pdarray + + +if TYPE_CHECKING: + from arkouda.numpy.strings import Strings +else: + Strings = TypeVar("Strings") + +__all__ = ["from_series"] + + +@typechecked +def from_series( + series: pd.Series, dtype: Optional[Union[type, str]] = None +) -> Union[pdarray, "Strings"]: + """ + Converts a Pandas Series to an Arkouda pdarray or Strings object. If + dtype is None, the dtype is inferred from the Pandas Series. Otherwise, + the dtype parameter is set if the dtype of the Pandas Series is to be + overridden or is unknown (for example, in situations where the Series + dtype is object). + + Parameters + ---------- + series : pd.Series + The Pandas Series with a dtype of bool, float64, int64, or string + dtype : Optional[Union[type, str]] + The valid dtype types are np.bool, np.float64, np.int64, and np.str + + Returns + ------- + Union[pdarray, Strings] + + Raises + ------ + ValueError + Raised if the Series dtype is not bool, float64, int64, string, datetime, or timedelta + + Examples + -------- + >>> import arkouda as ak + >>> np.random.seed(1701) + >>> ak.from_series(pd.Series(np.random.randint(0,10,5))) + array([4 3 3 5 0]) + + >>> ak.from_series(pd.Series(['1', '2', '3', '4', '5']),dtype=np.int64) + array([1 2 3 4 5]) + + >>> np.random.seed(1701) + >>> ak.from_series(pd.Series(np.random.uniform(low=0.0,high=1.0,size=3))) + array([0.089433234324597599 0.1153776854774361 0.51874393620990389]) + + >>> ak.from_series( + ... pd.Series([ + ... '0.57600036956445599', + ... '0.41619265571741659', + ... '0.6615356693784662', + ... ]), + ... dtype=np.float64, + ... 
) + array([0.57600036956445599 0.41619265571741659 0.6615356693784662]) + + >>> np.random.seed(1864) + >>> ak.from_series(pd.Series(np.random.choice([True, False],size=5))) + array([True True True False False]) + + >>> ak.from_series(pd.Series(['True', 'False', 'False', 'True', 'True']), dtype=bool) + array([True True True True True]) + + >>> ak.from_series(pd.Series(['a', 'b', 'c', 'd', 'e'], dtype="string")) + array(['a', 'b', 'c', 'd', 'e']) + + >>> ak.from_series(pd.Series(pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01')]))) + array([1514764800000000000 1514764800000000000]) + + Notes + ----- + The supported datatypes are bool, float64, int64, string, and datetime64[ns]. The + data type is either inferred from the Series or is set via the dtype parameter. + + Series of datetime or timedelta are converted to Arkouda arrays of dtype int64 (nanoseconds) + + A Pandas Series containing strings has a dtype of object. Arkouda assumes the Series + contains strings and sets the dtype to str + """ + from arkouda.numpy.pdarraycreation import array + + if not dtype: + dt = series.dtype.name + else: + dt = str(dtype) + try: + """ + If the Series has a object dtype, set dtype to string to comply with method + signature that does not require a dtype; this is required because Pandas can infer + non-str dtypes from the input np or Python array. + """ + if dt == "object": + dt = "string" + + n_array = series.to_numpy(dtype=SeriesDTypes[dt]) # type: ignore + except KeyError: + raise ValueError( + f"dtype {dt} is unsupported. 
Supported dtypes are bool, float64, int64, string, " + f"datetime64[ns], and timedelta64[ns]" + ) + return array(n_array) diff --git a/arkouda/pdarraycreation/__init__.py b/arkouda/pdarraycreation/__init__.py index 19340f2811e..e95cf7f53bb 100644 --- a/arkouda/pdarraycreation/__init__.py +++ b/arkouda/pdarraycreation/__init__.py @@ -4,7 +4,6 @@ arange, array, bigint_from_uint_arrays, - from_series, full, full_like, linspace, diff --git a/pytest.ini b/pytest.ini index 8f0f5574427..e1d0bb3358b 100644 --- a/pytest.ini +++ b/pytest.ini @@ -65,6 +65,7 @@ testpaths = tests/numpy/utils_test.py tests/operator_test.py tests/pandas/categorical_test.py + tests/pandas/conversion_test.py tests/pandas/dataframe_test.py tests/pandas/extension/arkouda_array_extension.py tests/pandas/extension/arkouda_categorical_extension.py diff --git a/tests/numpy/pdarray_creation_test.py b/tests/numpy/pdarray_creation_test.py index 32104152a6c..a39c9ae4477 100644 --- a/tests/numpy/pdarray_creation_test.py +++ b/tests/numpy/pdarray_creation_test.py @@ -1,4 +1,3 @@ -import datetime as dt import math import statistics @@ -1128,52 +1127,6 @@ def test_random_strings_lognormal_with_seed(self): ) assert printable_randoms == pda.tolist() - @pytest.mark.parametrize("size", pytest.prob_size) - @pytest.mark.parametrize("dtype", [bool, np.float64, np.int64, str]) - def test_from_series_dtypes(self, size, dtype): - p_array = ak.from_series(pd.Series(np.random.randint(0, 10, size)), dtype) - assert isinstance(p_array, ak.pdarray if dtype is not str else ak.Strings) - assert dtype == p_array.dtype - - p_objects_array = ak.from_series( - pd.Series(np.random.randint(0, 10, size), dtype="object"), dtype=dtype - ) - assert isinstance(p_objects_array, ak.pdarray if dtype is not str else ak.Strings) - assert dtype == p_objects_array.dtype - - def test_from_series_misc(self): - p_array = ak.from_series(pd.Series(["a", "b", "c", "d", "e"])) - assert isinstance(p_array, ak.Strings) - assert str == p_array.dtype - - 
p_array = ak.from_series(pd.Series(np.random.choice([True, False], size=10))) - - assert isinstance(p_array, ak.pdarray) - assert bool == p_array.dtype - - p_array = ak.from_series(pd.Series([dt.datetime(2016, 1, 1, 0, 0, 1)])) - - assert isinstance(p_array, ak.pdarray) - assert np.int64 == p_array.dtype - - p_array = ak.from_series(pd.Series([np.datetime64("2018-01-01")])) - - assert isinstance(p_array, ak.pdarray) - assert np.int64 == p_array.dtype - - p_array = ak.from_series( - pd.Series(pd.to_datetime(["1/1/2018", np.datetime64("2018-01-01"), dt.datetime(2018, 1, 1)])) - ) - - assert isinstance(p_array, ak.pdarray) - assert np.int64 == p_array.dtype - - with pytest.raises(TypeError): - ak.from_series(np.ones(10)) - - with pytest.raises(ValueError): - ak.from_series(pd.Series(np.random.randint(0, 10, 10), dtype=np.int8)) - @pytest.mark.parametrize("dtype", NUMERIC_SCALARS) @pytest.mark.parametrize("size", pytest.prob_size) def test_fill(self, size, dtype): diff --git a/tests/pandas/conversion_test.py b/tests/pandas/conversion_test.py new file mode 100644 index 00000000000..9bccc08f4b0 --- /dev/null +++ b/tests/pandas/conversion_test.py @@ -0,0 +1,55 @@ +import datetime as dt + +import numpy as np +import pandas as pd +import pytest + +import arkouda as ak + + +class TestPandasConversion: + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("dtype", [bool, np.float64, np.int64, str]) + def test_from_series_dtypes(self, size, dtype): + p_array = ak.from_series(pd.Series(np.random.randint(0, 10, size)), dtype) + assert isinstance(p_array, ak.pdarray if dtype is not str else ak.Strings) + assert dtype == p_array.dtype + + p_objects_array = ak.from_series( + pd.Series(np.random.randint(0, 10, size), dtype="object"), dtype=dtype + ) + assert isinstance(p_objects_array, ak.pdarray if dtype is not str else ak.Strings) + assert dtype == p_objects_array.dtype + + def test_from_series_misc(self): + p_array = ak.from_series(pd.Series(["a", "b", 
"c", "d", "e"])) + assert isinstance(p_array, ak.Strings) + assert str == p_array.dtype + + p_array = ak.from_series(pd.Series(np.random.choice([True, False], size=10))) + + assert isinstance(p_array, ak.pdarray) + assert bool == p_array.dtype + + p_array = ak.from_series(pd.Series([dt.datetime(2016, 1, 1, 0, 0, 1)])) + + assert isinstance(p_array, ak.pdarray) + assert np.int64 == p_array.dtype + + p_array = ak.from_series(pd.Series([np.datetime64("2018-01-01")])) + + assert isinstance(p_array, ak.pdarray) + assert np.int64 == p_array.dtype + + p_array = ak.from_series( + pd.Series(pd.to_datetime(["1/1/2018", np.datetime64("2018-01-01"), dt.datetime(2018, 1, 1)])) + ) + + assert isinstance(p_array, ak.pdarray) + assert np.int64 == p_array.dtype + + with pytest.raises(TypeError): + ak.from_series(np.ones(10)) + + with pytest.raises(ValueError): + ak.from_series(pd.Series(np.random.randint(0, 10, 10), dtype=np.int8))