diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 89bc942cb7250..0f2a7696cb792 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -38,6 +38,7 @@ Other enhancements
 - :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`)
 - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
 - Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`)
+- Added a ``"keep"`` option to the ``normalize`` parameter of ``value_counts``. With ``normalize="keep"``, each entry of the result contains both the raw count and the normalized proportion, formatted as a string such as ``"3(0.375)"`` (contributed by Keramatfar)
 - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
 - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
 - Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 56f8adda93251..1dc44dc5a9684 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -2,7 +2,6 @@
 Generic data algorithms. This module is experimental at the moment and not
 intended for public consumption
 """
-
 from __future__ import annotations
 
 import decimal
@@ -11,6 +10,7 @@
 from typing import (
     TYPE_CHECKING,
     Literal,
+    Union,
     cast,
 )
 import warnings
@@ -26,7 +26,6 @@
 from pandas._typing import (
     AnyArrayLike,
     ArrayLike,
-    ArrayLikeT,
     AxisInt,
     DtypeObj,
     TakeIndexer,
@@ -43,11 +42,11 @@
     ensure_float64,
     ensure_object,
     ensure_platform_int,
+    is_array_like,
     is_bool_dtype,
     is_complex_dtype,
     is_dict_like,
     is_extension_array_dtype,
-    is_float,
     is_float_dtype,
     is_integer,
     is_integer_dtype,
@@ -68,7 +67,6 @@
     ABCExtensionArray,
     ABCIndex,
     ABCMultiIndex,
-    ABCNumpyExtensionArray,
     ABCSeries,
     ABCTimedeltaArray,
 )
@@ -185,8 +183,8 @@ def _ensure_data(values: ArrayLike) -> np.ndarray:
 
 
 def _reconstruct_data(
-    values: ArrayLikeT, dtype: DtypeObj, original: AnyArrayLike
-) -> ArrayLikeT:
+    values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
+) -> ArrayLike:
     """
     reverse of _ensure_data
 
@@ -209,9 +207,7 @@ def _reconstruct_data(
         #  that values.dtype == dtype
         cls = dtype.construct_array_type()
 
-        # error: Incompatible types in assignment (expression has type
-        # "ExtensionArray", variable has type "ndarray[Any, Any]")
-        values = cls._from_sequence(values, dtype=dtype)  # type: ignore[assignment]
+        values = cls._from_sequence(values, dtype=dtype)
 
     else:
         values = values.astype(dtype, copy=False)
@@ -223,17 +219,16 @@ def _ensure_arraylike(values, func_name: str) -> ArrayLike:
     """
     ensure that we are arraylike if not already
     """
-    if not isinstance(
-        values,
-        (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray, ABCNumpyExtensionArray),
-    ):
+    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
         # GH#52986
         if func_name != "isin-targets":
             # Make an exception for the comps argument in isin.
-            raise TypeError(
-                f"{func_name} requires a Series, Index, "
-                f"ExtensionArray, np.ndarray or NumpyExtensionArray "
-                f"got {type(values).__name__}."
+ warnings.warn( + f"{func_name} with argument that is not not a Series, Index, " + "ExtensionArray, or np.ndarray is deprecated and will raise in a " + "future version.", + FutureWarning, + stacklevel=find_stack_level(), ) inferred = lib.infer_dtype(values, skipna=False) @@ -265,9 +260,7 @@ def _ensure_arraylike(values, func_name: str) -> ArrayLike: } -def _get_hashtable_algo( - values: np.ndarray, -) -> tuple[type[htable.HashTable], np.ndarray]: +def _get_hashtable_algo(values: np.ndarray): """ Parameters ---------- @@ -324,12 +317,10 @@ def unique(values): Parameters ---------- values : 1d array-like - The input array-like object containing values from which to extract - unique values. Returns ------- - numpy.ndarray, ExtensionArray or NumpyExtensionArray + numpy.ndarray or ExtensionArray The return can be: @@ -337,7 +328,7 @@ def unique(values): * Categorical : when the input is a Categorical dtype * ndarray : when the input is a Series/ndarray - Return numpy.ndarray, ExtensionArray or NumpyExtensionArray. + Return numpy.ndarray or ExtensionArray. See Also -------- @@ -353,15 +344,14 @@ def unique(values): array([2, 1]) >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) - array(['2016-01-01T00:00:00'], dtype='datetime64[s]') + array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') >>> pd.unique( ... pd.Series( ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ], - ... dtype="M8[ns, US/Eastern]", + ... ] ... ) ... ) @@ -373,8 +363,7 @@ def unique(values): ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ], - ... dtype="M8[ns, US/Eastern]", + ... ] ... ) ... ) DatetimeIndex(['2016-01-01 00:00:00-05:00'], @@ -409,13 +398,6 @@ def unique(values): >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values) array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) - - An NumpyExtensionArray of complex - - >>> pd.unique(pd.array([1 + 1j, 2, 3])) - - [(1+1j), (2+0j), (3+0j)] - Length: 3, dtype: complex128 """ return unique_with_mask(values) @@ -451,10 +433,6 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): # Dispatch to extension dtype's unique. return values.unique() - if isinstance(values, ABCIndex): - # Dispatch to Index's unique. - return values.unique() - original = values hashtable, values = _get_hashtable_algo(values) @@ -835,11 +813,58 @@ def factorize( return codes, uniques +def value_counts( + values, + sort: bool = True, + ascending: bool = False, + normalize: bool | str = False, + bins=None, + dropna: bool = True, +) -> Series: + """ + Compute a histogram of the counts of non-null values. + + Parameters + ---------- + values : ndarray (1-d) + sort : bool, default True + Sort by values + ascending : bool, default False + Sort in ascending order + normalize: bool, default False + If True then compute a relative histogram + bins : integer, optional + Rather than count values, group them into half-open bins, + convenience for pd.cut, only works with numeric data + dropna : bool, default True + Don't include counts of NaN + + Returns + ------- + Series + """ + warnings.warn( + # GH#53493 + "pandas.value_counts is deprecated and will be removed in a " + "future version. 
Use pd.Series(obj).value_counts() instead.",
+        FutureWarning,
+        stacklevel=find_stack_level(),
+    )
+    return value_counts_internal(
+        values,
+        sort=sort,
+        ascending=ascending,
+        normalize=normalize,
+        bins=bins,
+        dropna=dropna,
+    )
+
+
 def value_counts_internal(
     values,
     sort: bool = True,
     ascending: bool = False,
-    normalize: bool = False,
+    normalize: bool | str = False,
     bins=None,
     dropna: bool = True,
 ) -> Series:
@@ -904,17 +929,36 @@ def value_counts_internal(
             if keys.dtype == np.float16:
                 keys = keys.astype(np.float32)
 
-            # Starting in 3.0, we no longer perform dtype inference on the
-            #  Index object we construct here, xref GH#56161
-            idx = Index(keys, dtype=keys.dtype, name=index_name)
+            # For backwards compatibility, we let Index do its normal type
+            #  inference, _except_ for if it infers from object to bool.
+            idx = Index(keys)
+            if idx.dtype == bool and keys.dtype == object:
+                idx = idx.astype(object)
+            elif (
+                idx.dtype != keys.dtype  # noqa: PLR1714  # # pylint: disable=R1714
+                and idx.dtype != "string[pyarrow_numpy]"
+            ):
+                warnings.warn(
+                    # GH#56161
+                    "The behavior of value_counts with object-dtype is deprecated. "
+                    "In a future version, this will *not* perform dtype inference "
+                    "on the resulting index. To retain the old behavior, use "
+                    "`result.index = result.index.infer_objects()`",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+            idx.name = index_name
+
             result = Series(counts, index=idx, name=name, copy=False)
 
     if sort:
         result = result.sort_values(ascending=ascending)
 
-    if normalize:
+    if normalize is True:
         result = result / counts.sum()
-
+    elif normalize == "keep":  # keep raw counts, append proportion, e.g. "3(0.375)"
+        proportions = (result / counts.sum()).round(6).astype(str)
+        result = result.astype(str) + "(" + proportions + ")"
     return result
 
 
@@ -1168,40 +1212,34 @@ def take(
     >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
     array([10., 10., nan])
 
-    >>> pd.api.extensions.take(
-    ...     np.array([10, 20, 30]), [0, 0, -1], allow_fill=True, fill_value=-10
-    ... )
+    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
+    ...     fill_value=-10)
     array([ 10,  10, -10])
     """
-    if not isinstance(
-        arr,
-        (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries, ABCNumpyExtensionArray),
-    ):
+    if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)):
        # GH#52981
-        raise TypeError(
-            "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, "
-            f"Index, Series, or NumpyExtensionArray got {type(arr).__name__}."
+        warnings.warn(
+            "pd.api.extensions.take accepting non-standard inputs is deprecated "
+            "and will raise in a future version.
Pass either a numpy.ndarray, " + "ExtensionArray, Index, or Series instead.", + FutureWarning, + stacklevel=find_stack_level(), ) + if not is_array_like(arr): + arr = np.asarray(arr) + indices = ensure_platform_int(indices) if allow_fill: # Pandas style, -1 means NA validate_indices(indices, arr.shape[axis]) - # error: Argument 1 to "take_nd" has incompatible type - # "ndarray[Any, Any] | ExtensionArray | Index | Series"; expected - # "ndarray[Any, Any]" result = take_nd( - arr, # type: ignore[arg-type] - indices, - axis=axis, - allow_fill=True, - fill_value=fill_value, + arr, indices, axis=axis, allow_fill=True, fill_value=fill_value ) else: # NumPy style - # error: Unexpected keyword argument "axis" for "take" of "ExtensionArray" - result = arr.take(indices, axis=axis) # type: ignore[call-arg,assignment] + result = arr.take(indices, axis=axis) return result @@ -1321,12 +1359,7 @@ def diff(arr, n: int, axis: AxisInt = 0): shifted """ - # added a check on the integer value of period - # see https://github.com/pandas-dev/pandas/issues/56607 - if not lib.is_integer(n): - if not (is_float(n) and n.is_integer()): - raise ValueError("periods must be an integer") - n = int(n) + n = int(n) na = np.nan dtype = arr.dtype @@ -1520,16 +1553,16 @@ def safe_sort( hash_klass, values = _get_hashtable_algo(values) # type: ignore[arg-type] t = hash_klass(len(values)) t.map_locations(values) - # error: Argument 1 to "lookup" of "HashTable" has incompatible type - # "ExtensionArray | ndarray[Any, Any] | Index | Series"; expected "ndarray" - sorter = ensure_platform_int(t.lookup(ordered)) # type: ignore[arg-type] + sorter = ensure_platform_int(t.lookup(ordered)) if use_na_sentinel: # take_nd is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() if verify: mask = (codes < -len(values)) | (codes >= len(values)) - codes[mask] = -1 + codes[mask] = 0 + else: + mask = None new_codes = take_nd(order2, codes, fill_value=-1) else: reverse_indexer = np.empty(len(sorter), dtype=int) @@ -1538,6 +1571,14 @@ def safe_sort( # may deal with them here without performance loss using `mode='wrap'` new_codes = reverse_indexer.take(codes, mode="wrap") + if use_na_sentinel: + mask = codes == -1 + if verify: + mask = mask | (codes < -len(values)) | (codes >= len(values)) + + if use_na_sentinel and mask is not None: + np.putmask(new_codes, mask, -1) + return ordered, ensure_platform_int(new_codes) @@ -1596,8 +1637,16 @@ def union_with_duplicates( """ from pandas import Series - l_count = value_counts_internal(lvals, dropna=False) - r_count = value_counts_internal(rvals, dropna=False) + with warnings.catch_warnings(): + # filter warning from object dtype inference; we will end up discarding + # the index here, so the deprecation does not affect the end result here. + warnings.filterwarnings( + "ignore", + "The behavior of value_counts with object-dtype is deprecated", + category=FutureWarning, + ) + l_count = value_counts_internal(lvals, dropna=False) + r_count = value_counts_internal(rvals, dropna=False) l_count, r_count = l_count.align(r_count, fill_value=0) final_count = np.maximum(l_count.values, r_count.values) final_count = Series(final_count, index=l_count.index, dtype="int", copy=False) @@ -1622,6 +1671,7 @@ def map_array( arr: ArrayLike, mapper, na_action: Literal["ignore"] | None = None, + convert: bool = True, ) -> np.ndarray | ExtensionArray | Index: """ Map values using an input mapping or function. 
@@ -1633,6 +1683,9 @@ def map_array( na_action : {None, 'ignore'}, default None If 'ignore', propagate NA values, without passing them to the mapping correspondence. + convert : bool, default True + Try to find better dtype for elementwise function results. If + False, leave as dtype=object. Returns ------- @@ -1690,6 +1743,8 @@ def map_array( # we must convert to python types values = arr.astype(object, copy=False) if na_action is None: - return lib.map_infer(values, mapper) + return lib.map_infer(values, mapper, convert=convert) else: - return lib.map_infer_mask(values, mapper, mask=isna(values).view(np.uint8)) + return lib.map_infer_mask( + values, mapper, mask=isna(values).view(np.uint8), convert=convert + ) diff --git a/pandas/core/base.py b/pandas/core/base.py index 61a7c079d87f8..82932f7aef19a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -10,13 +10,17 @@ Any, Generic, Literal, + Union, cast, final, overload, ) +import warnings import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas._typing import ( AxisInt, @@ -34,6 +38,7 @@ cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( @@ -44,7 +49,6 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndex, - ABCMultiIndex, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -86,6 +90,12 @@ _shared_docs: dict[str, str] = {} +_indexops_doc_kwargs = { + "klass": "IndexOpsMixin", + "inplace": "", + "unique": "IndexOpsMixin", + "duplicated": "IndexOpsMixin", +} class PandasObject(DirNamesMixin): @@ -97,7 +107,7 @@ class PandasObject(DirNamesMixin): _cache: dict[str, Any] @property - def _constructor(self) -> type[Self]: + def _constructor(self): """ Class constructor (for this class it's just `__class__`). """ @@ -128,7 +138,7 @@ def __sizeof__(self) -> int: """ memory_usage = getattr(self, "memory_usage", None) if memory_usage: - mem = memory_usage(deep=True) + mem = memory_usage(deep=True) # pylint: disable=not-callable return int(mem if is_scalar(mem) else mem.sum()) # no memory_usage attribute, so fall back to object's 'sizeof' @@ -209,7 +219,7 @@ def _obj_with_exclusions(self): return self.obj if self._selection is not None: - return self.obj[self._selection_list] + return self.obj._getitem_nocopy(self._selection_list) if len(self.exclusions) > 0: # equivalent to `self.obj.drop(self.exclusions, axis=1) @@ -310,10 +320,6 @@ def transpose(self, *args, **kwargs) -> Self: doc=""" Return the transpose, which is by definition self. - See Also - -------- - Index : Immutable sequence used for indexing and alignment. - Examples -------- For Series: @@ -343,12 +349,6 @@ def shape(self) -> Shape: """ Return a tuple of the shape of the underlying data. - See Also - -------- - Series.ndim : Number of dimensions of the underlying data. - Series.size : Return the number of elements in the underlying data. - Series.nbytes : Return the number of bytes in the underlying data. - Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -361,24 +361,14 @@ def __len__(self) -> int: # We need this defined here for mypy raise AbstractMethodError(self) - # Temporarily avoid using `-> Literal[1]:` because of an IPython (jedi) bug - # https://github.com/ipython/ipython/issues/14412 - # https://github.com/davidhalter/jedi/issues/1990 @property - def ndim(self) -> int: + def ndim(self) -> Literal[1]: """ Number of dimensions of the underlying data, by definition 1. 
- See Also - -------- - Series.size: Return the number of elements in the underlying data. - Series.shape: Return a tuple of the shape of the underlying data. - Series.dtype: Return the dtype object of the underlying data. - Series.values: Return Series as ndarray or ndarray-like depending on the dtype. - Examples -------- - >>> s = pd.Series(["Ant", "Bear", "Cow"]) + >>> s = pd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant 1 Bear @@ -412,11 +402,6 @@ def item(self): ValueError If the data is not length = 1. - See Also - -------- - Index.values : Returns an array representing the data in the Index. - Series.head : Returns the first `n` rows. - Examples -------- >>> s = pd.Series([1]) @@ -425,7 +410,7 @@ def item(self): For an index: - >>> s = pd.Series([1], index=["a"]) + >>> s = pd.Series([1], index=['a']) >>> s.index.item() 'a' """ @@ -438,16 +423,11 @@ def nbytes(self) -> int: """ Return the number of bytes in the underlying data. - See Also - -------- - Series.ndim : Number of dimensions of the underlying data. - Series.size : Return the number of elements in the underlying data. - Examples -------- For Series: - >>> s = pd.Series(["Ant", "Bear", "Cow"]) + >>> s = pd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant 1 Bear @@ -471,18 +451,11 @@ def size(self) -> int: """ Return the number of elements in the underlying data. - See Also - -------- - Series.ndim: Number of dimensions of the underlying data, by definition 1. - Series.shape: Return a tuple of the shape of the underlying data. - Series.dtype: Return the dtype object of the underlying data. - Series.values: Return Series as ndarray or ndarray-like depending on the dtype. - Examples -------- For Series: - >>> s = pd.Series(["Ant", "Bear", "Cow"]) + >>> s = pd.Series(['Ant', 'Bear', 'Cow']) >>> s 0 Ant 1 Bear @@ -559,13 +532,14 @@ def array(self) -> ExtensionArray: For extension types, like Categorical, the actual ExtensionArray is returned - >>> ser = pd.Series(pd.Categorical(["a", "b", "a"])) + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.array ['a', 'b', 'a'] Categories (2, object): ['a', 'b'] """ raise AbstractMethodError(self) + @final def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -595,8 +569,6 @@ def to_numpy( Returns ------- numpy.ndarray - The NumPy ndarray holding the values from this Series or Index. - The dtype of the array may differ. See Notes. See Also -------- @@ -639,7 +611,7 @@ def to_numpy( Examples -------- - >>> ser = pd.Series(pd.Categorical(["a", "b", "a"])) + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) @@ -647,7 +619,7 @@ def to_numpy( Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp` objects, each with the correct ``tz``. - >>> ser = pd.Series(pd.date_range("2000", periods=2, tz="CET")) + >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) >>> ser.to_numpy(dtype=object) array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), Timestamp('2000-01-02 00:00:00+0100', tz='CET')], @@ -677,7 +649,7 @@ def to_numpy( ) values = self._values - if fillna and self.hasnans: + if fillna: if not can_hold_element(values, na_value): # if we can't hold the na_value asarray either makes a copy or we # error before modifying values. 
The asarray later on thus won't make @@ -690,10 +662,10 @@ def to_numpy( result = np.asarray(values, dtype=dtype) - if (copy and not fillna) or not copy: + if (copy and not fillna) or (not copy and using_copy_on_write()): if np.shares_memory(self._values[:2], result[:2]): # Take slices to improve performance of check - if not copy: + if using_copy_on_write() and not copy: result = result.view() result.flags.writeable = False else: @@ -704,45 +676,6 @@ def to_numpy( @final @property def empty(self) -> bool: - """ - Indicator whether Index is empty. - - An Index is considered empty if it has no elements. This property can be - useful for quickly checking the state of an Index, especially in data - processing and analysis workflows where handling of empty datasets might - be required. - - Returns - ------- - bool - If Index is empty, return True, if not return False. - - See Also - -------- - Index.size : Return the number of elements in the underlying data. - - Examples - -------- - >>> idx = pd.Index([1, 2, 3]) - >>> idx - Index([1, 2, 3], dtype='int64') - >>> idx.empty - False - - >>> idx_empty = pd.Index([]) - >>> idx_empty - Index([], dtype='object') - >>> idx_empty.empty - True - - If we only have NaNs in our DataFrame, it is not considered empty! - - >>> idx = pd.Index([np.nan, np.nan]) - >>> idx - Index([nan, nan], dtype='float64') - >>> idx.empty - False - """ return not self.size @doc(op="max", oppose="min", value="largest") @@ -760,8 +693,7 @@ def argmax( axis : {{None}} Unused. Parameter needed for compatibility with DataFrame. skipna : bool, default True - Exclude NA/null values. If the entire Series is NA, or if ``skipna=False`` - and there is an NA value, this method will raise a ``ValueError``. + Exclude NA/null values when showing the result. *args, **kwargs Additional arguments and keywords for compatibility with NumPy. @@ -782,15 +714,8 @@ def argmax( -------- Consider dataset containing cereal calories - >>> s = pd.Series( - ... [100.0, 110.0, 120.0, 110.0], - ... index=[ - ... "Corn Flakes", - ... "Almond Delight", - ... "Cinnamon Toast Crunch", - ... "Cocoa Puff", - ... ], - ... ) + >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0, + ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}}) >>> s Corn Flakes 100.0 Almond Delight 110.0 @@ -812,9 +737,27 @@ def argmax( skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) if isinstance(delegate, ExtensionArray): - return delegate.argmax(skipna=skipna) + if not skipna and delegate.isna().any(): + warnings.warn( + f"The behavior of {type(self).__name__}.argmax/argmin " + "with skipna=False and NAs, or with all-NAs is deprecated. " + "In a future version this will raise ValueError.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return -1 + else: + return delegate.argmax() else: result = nanops.nanargmax(delegate, skipna=skipna) + if result == -1: + warnings.warn( + f"The behavior of {type(self).__name__}.argmax/argmin " + "with skipna=False and NAs, or with all-NAs is deprecated. 
" + "In a future version this will raise ValueError.", + FutureWarning, + stacklevel=find_stack_level(), + ) # error: Incompatible return value type (got "Union[int, ndarray]", expected # "int") return result # type: ignore[return-value] @@ -825,17 +768,35 @@ def argmin( ) -> int: delegate = self._values nv.validate_minmax_axis(axis) - skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) + skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs) if isinstance(delegate, ExtensionArray): - return delegate.argmin(skipna=skipna) + if not skipna and delegate.isna().any(): + warnings.warn( + f"The behavior of {type(self).__name__}.argmax/argmin " + "with skipna=False and NAs, or with all-NAs is deprecated. " + "In a future version this will raise ValueError.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return -1 + else: + return delegate.argmin() else: result = nanops.nanargmin(delegate, skipna=skipna) + if result == -1: + warnings.warn( + f"The behavior of {type(self).__name__}.argmax/argmin " + "with skipna=False and NAs, or with all-NAs is deprecated. " + "In a future version this will raise ValueError.", + FutureWarning, + stacklevel=find_stack_level(), + ) # error: Incompatible return value type (got "Union[int, ndarray]", expected # "int") return result # type: ignore[return-value] - def tolist(self) -> list: + def tolist(self): """ Return a list of the values. @@ -846,7 +807,6 @@ def tolist(self) -> list: Returns ------- list - List containing the values as Python or pandas scalers. See Also -------- @@ -885,11 +845,6 @@ def __iter__(self) -> Iterator: Returns ------- iterator - An iterator yielding scalar values from the Series. - - See Also - -------- - Series.items : Lazily iterate over (index, value) tuples. Examples -------- @@ -918,11 +873,6 @@ def hasnans(self) -> bool: ------- bool - See Also - -------- - Series.isna : Detect missing values. - Series.notna : Detect existing (non-missing) values. - Examples -------- >>> s = pd.Series([1, 2, 3, None]) @@ -940,7 +890,7 @@ def hasnans(self) -> bool: return bool(isna(self).any()) # type: ignore[union-attr] @final - def _map_values(self, mapper, na_action=None): + def _map_values(self, mapper, na_action=None, convert: bool = True): """ An internal function that maps values using the input correspondence (which can be a dict, Series, or function). @@ -952,6 +902,10 @@ def _map_values(self, mapper, na_action=None): na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function + convert : bool, default True + Try to find better dtype for elementwise function results. If + False, leave as dtype=object. Note that the dtype is always + preserved for some extension array dtypes, such as Categorical. Returns ------- @@ -965,11 +919,12 @@ def _map_values(self, mapper, na_action=None): if isinstance(arr, ExtensionArray): return arr.map(mapper, na_action=na_action) - return algorithms.map_array(arr, mapper, na_action=na_action) + return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert) + @final def value_counts( self, - normalize: bool = False, + normalize: bool | str = False, sort: bool = True, ascending: bool = False, bins=None, @@ -1000,7 +955,6 @@ def value_counts( Returns ------- Series - Series containing counts of unique values. 
See Also -------- @@ -1053,34 +1007,6 @@ def value_counts( 4.0 1 NaN 1 Name: count, dtype: int64 - - **Categorical Dtypes** - - Rows with categorical type will be counted as one group - if they have same categories and order. - In the example below, even though ``a``, ``c``, and ``d`` - all have the same data types of ``category``, - only ``c`` and ``d`` will be counted as one group - since ``a`` doesn't have the same categories. - - >>> df = pd.DataFrame({"a": [1], "b": ["2"], "c": [3], "d": [3]}) - >>> df = df.astype({"a": "category", "c": "category", "d": "category"}) - >>> df - a b c d - 0 1 2 3 3 - - >>> df.dtypes - a category - b object - c category - d category - dtype: object - - >>> df.dtypes.value_counts() - category 2 - category 1 - object 1 - Name: count, dtype: int64 """ return algorithms.value_counts_internal( self, @@ -1115,7 +1041,6 @@ def nunique(self, dropna: bool = True) -> int: Returns ------- int - A integer indicating the number of unique elements in the object. See Also -------- @@ -1144,18 +1069,12 @@ def nunique(self, dropna: bool = True) -> int: @property def is_unique(self) -> bool: """ - Return True if values in the object are unique. + Return boolean if values in the object are unique. Returns ------- bool - See Also - -------- - Series.unique : Return unique values of Series object. - Series.drop_duplicates : Return Series with duplicate values removed. - Series.duplicated : Indicate duplicate Series values. - Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -1171,17 +1090,12 @@ def is_unique(self) -> bool: @property def is_monotonic_increasing(self) -> bool: """ - Return True if values in the object are monotonically increasing. + Return boolean if values in the object are monotonically increasing. Returns ------- bool - See Also - -------- - Series.is_monotonic_decreasing : Return boolean if values in the object are - monotonically decreasing. - Examples -------- >>> s = pd.Series([1, 2, 2]) @@ -1199,17 +1113,12 @@ def is_monotonic_increasing(self) -> bool: @property def is_monotonic_decreasing(self) -> bool: """ - Return True if values in the object are monotonically decreasing. + Return boolean if values in the object are monotonically decreasing. Returns ------- bool - See Also - -------- - Series.is_monotonic_increasing : Return boolean if values in the object are - monotonically increasing. - Examples -------- >>> s = pd.Series([3, 2, 2, 1]) @@ -1238,7 +1147,6 @@ def _memory_usage(self, deep: bool = False) -> int: Returns ------- bytes used - Returns memory usage of the values in the Index in bytes. See Also -------- @@ -1257,7 +1165,7 @@ def _memory_usage(self, deep: bool = False) -> int: 24 """ if hasattr(self.array, "memory_usage"): - return self.array.memory_usage( # pyright: ignore[reportAttributeAccessIssue] + return self.array.memory_usage( # pyright: ignore[reportGeneralTypeIssues] deep=deep, ) @@ -1291,21 +1199,18 @@ def factorize( if uniques.dtype == np.float16: uniques = uniques.astype(np.float32) - if isinstance(self, ABCMultiIndex): - # preserve MultiIndex + if isinstance(self, ABCIndex): + # preserve e.g. MultiIndex uniques = self._constructor(uniques) else: from pandas import Index - try: - uniques = Index(uniques, dtype=self.dtype) - except NotImplementedError: - # not all dtypes are supported in Index that are allowed for Series - # e.g. 
float16 or bytes
-                uniques = Index(uniques)
+            uniques = Index(uniques)
         return codes, uniques
 
-    _shared_docs["searchsorted"] = """
+    _shared_docs[
+        "searchsorted"
+    ] = """
     Find indices where elements should be inserted to maintain order.
 
     Find the indices into a sorted {klass} `self` such that, if the
@@ -1371,7 +1276,7 @@ def factorize(
     0   2000-03-11
     1   2000-03-12
     2   2000-03-13
-    dtype: datetime64[s]
+    dtype: datetime64[ns]
 
     >>> ser.searchsorted('3/14/2000')
     3
@@ -1414,7 +1319,8 @@ def searchsorted(  # type: ignore[overload-overlap]
         value: ScalarLike_co,
         side: Literal["left", "right"] = ...,
         sorter: NumpySorter = ...,
-    ) -> np.intp: ...
+    ) -> np.intp:
+        ...
 
     @overload
     def searchsorted(
@@ -1422,7 +1328,8 @@ def searchsorted(
         value: npt.ArrayLike | ExtensionArray,
         side: Literal["left", "right"] = ...,
         sorter: NumpySorter = ...,
-    ) -> npt.NDArray[np.intp]: ...
+    ) -> npt.NDArray[np.intp]:
+        ...
 
     @doc(_shared_docs["searchsorted"], klass="Index")
     def searchsorted(
@@ -1450,7 +1357,7 @@ def searchsorted(
             sorter=sorter,
         )
 
-    def drop_duplicates(self, *, keep: DropKeep = "first") -> Self:
+    def drop_duplicates(self, *, keep: DropKeep = "first"):
         duplicated = self._duplicated(keep=keep)
         # error: Value of type "IndexOpsMixin" is not indexable
         return self[~duplicated]  # type: ignore[index]
diff --git a/pandas/tests/base/test_value_counts_normalize.py b/pandas/tests/base/test_value_counts_normalize.py
new file mode 100644
index 0000000000000..4f46c516cb463
--- /dev/null
+++ b/pandas/tests/base/test_value_counts_normalize.py
@@ -0,0 +1,22 @@
+"""
+Tests for the ``value_counts`` ``normalize="keep"`` option.
+"""
+
+import pandas as pd
+
+
+def test_value_counts_normalize():
+    """normalize="keep" keeps the raw counts and appends their proportions."""
+    # Test data
+    data = pd.Series([1, 1, 2, 2, 3, 3, 3, 4])
+
+    # Exercise the new normalize="keep" option
+    result = data.value_counts(normalize="keep")
+
+    # Expected: raw counts with their proportions appended as strings
+    expected = pd.Series(
+        {3: "3(0.375)", 1: "2(0.25)", 2: "2(0.25)", 4: "1(0.125)"},
+        name="proportion",  # value_counts uses "proportion" when normalizing
+    )
+
+    pd.testing.assert_series_equal(result, expected, check_names=True)
diff --git a/pandas/tests/test_value_counts_normalize.py b/pandas/tests/test_value_counts_normalize.py
new file mode 100644
index 0000000000000..391a11b10ff70
--- /dev/null
+++ b/pandas/tests/test_value_counts_normalize.py
@@ -0,0 +1,18 @@
+import pandas as pd
+
+
+def test_value_counts_normalize():
+    # Test data
+    data = pd.Series([1, 1, 2, 2, 3, 3, 3, 4])
+
+    # Exercise the new normalize="keep" option
+    result = data.value_counts(normalize="keep")
+
+    # Expected: raw counts with their proportions appended as strings
+    expected = pd.Series(
+        {3: "3(0.375)", 1: "2(0.25)", 2: "2(0.25)", 4: "1(0.125)"},
+        name="proportion",  # value_counts uses "proportion" when normalizing
+    )
+
+    # Result and expected must also agree on the name attribute
+    pd.testing.assert_series_equal(result, expected, check_names=True)
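For reviewers, a minimal usage sketch of the ``normalize="keep"`` option this diff adds. The input data and the expected strings are taken from the new tests; the ``object`` result dtype and the exact repr alignment are assumptions based on the ``value_counts_internal`` change (proportions rounded to 6 decimal places).

```python
import pandas as pd

ser = pd.Series([1, 1, 2, 2, 3, 3, 3, 4])

# Existing behavior is unchanged: plain counts, or plain proportions.
counts = ser.value_counts()                      # 3 -> 3, 1 -> 2, 2 -> 2, 4 -> 1
proportions = ser.value_counts(normalize=True)   # 3 -> 0.375, 1 -> 0.25, ...

# New in this diff: keep both views in one result, formatted "<count>(<proportion>)".
combined = ser.value_counts(normalize="keep")
print(combined)
# Expected (per the new tests), roughly:
# 3    3(0.375)
# 1     2(0.25)
# 2     2(0.25)
# 4    1(0.125)
# Name: proportion, dtype: object
```

Note that the combined form is a string Series, so any downstream numeric work should use the plain counts or the ``normalize=True`` output instead.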