From 031e3af8856b7b99c41c1432944ccccd69f211a5 Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Sun, 21 Aug 2022 14:05:06 +0000 Subject: [PATCH] Format code with black, yapf, autopep8 and isort This commit fixes the style issues introduced in d2d4fc0 according to the output from black, yapf, autopep8 and isort. Details: https://deepsource.io/gh/shubham11941140/pandas/transform/b3b68a47-7c29-4b1a-9217-4e2cfb274878/ --- pandas/core/arrays/base.py | 147 ++-- pandas/core/arrays/categorical.py | 362 +++++----- pandas/core/arrays/datetimes.py | 296 ++++---- pandas/core/arrays/interval.py | 405 +++++------ pandas/core/arrays/sparse/array.py | 221 +++--- pandas/core/arrays/string_arrow.py | 117 ++-- pandas/core/frame.py | 1052 +++++++++++++++------------- pandas/core/groupby/generic.py | 297 ++++---- 8 files changed, 1546 insertions(+), 1351 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 26c2366289d19..e19ecd742689c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -87,6 +87,7 @@ if TYPE_CHECKING: class ExtensionArraySupportsAnyAll("ExtensionArray"): + def any(self, *, skipna: bool = True) -> bool: pass @@ -98,7 +99,6 @@ def all(self, *, skipna: bool = True) -> bool: NumpyValueArrayLike, ) - _extension_array_shared_docs: dict[str, str] = {} ExtensionArrayT = TypeVar("ExtensionArrayT", bound="ExtensionArray") @@ -242,7 +242,11 @@ class ExtensionArray: # ------------------------------------------------------------------------ @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): + def _from_sequence(cls, + scalars, + *, + dtype: Dtype | None = None, + copy=False): """ Construct a new ExtensionArray from a sequence of scalars. @@ -264,9 +268,11 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): raise AbstractMethodError(cls) @classmethod - def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy=False - ): + def _from_sequence_of_strings(cls, + strings, + *, + dtype: Dtype | None = None, + copy=False): """ Construct a new ExtensionArray from a sequence of strings. @@ -314,12 +320,12 @@ def __getitem__(self, item: ScalarIndexer) -> Any: ... @overload - def __getitem__(self: ExtensionArrayT, item: SequenceIndexer) -> ExtensionArrayT: + def __getitem__(self: ExtensionArrayT, + item: SequenceIndexer) -> ExtensionArrayT: ... - def __getitem__( - self: ExtensionArrayT, item: PositionalIndexer - ) -> ExtensionArrayT | Any: + def __getitem__(self: ExtensionArrayT, + item: PositionalIndexer) -> ExtensionArrayT | Any: """ Select a subset of self. @@ -395,7 +401,8 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: # __init__ method coerces that value, then so should __setitem__ # Note, also, that Series/DataFrame.where internally use __setitem__ # on a copy of the data. 
- raise NotImplementedError(f"{type(self)} does not implement __setitem__.") + raise NotImplementedError( + f"{type(self)} does not implement __setitem__.") def __len__(self) -> int: """ @@ -427,7 +434,8 @@ def __contains__(self, item: object) -> bool | np.bool_: if is_scalar(item) and isna(item): if not self._can_hold_na: return False - elif item is self.dtype.na_value or isinstance(item, self.dtype.type): + elif item is self.dtype.na_value or isinstance( + item, self.dtype.type): return self._hasna else: return False @@ -510,7 +518,7 @@ def shape(self) -> Shape: """ Return a tuple of the array dimensions. """ - return (len(self),) + return (len(self), ) @property def size(self) -> int: @@ -544,7 +552,9 @@ def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... @overload - def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + def astype(self, + dtype: ExtensionDtype, + copy: bool = ...) -> ExtensionArray: ... @overload @@ -785,7 +795,9 @@ def fillna( # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" value = missing.check_value_size( - value, mask, len(self) # type: ignore[arg-type] + value, + mask, + len(self) # type: ignore[arg-type] ) if mask.any(): @@ -813,7 +825,9 @@ def dropna(self: ExtensionArrayT) -> ExtensionArrayT: # error: Unsupported operand type for ~ ("ExtensionArray") return self[~self.isna()] # type: ignore[operator] - def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: + def shift(self, + periods: int = 1, + fill_value: object = None) -> ExtensionArray: """ Shift values by desired number. @@ -852,14 +866,14 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: if isna(fill_value): fill_value = self.dtype.na_value - empty = self._from_sequence( - [fill_value] * min(abs(periods), len(self)), dtype=self.dtype - ) + empty = self._from_sequence([fill_value] * + min(abs(periods), len(self)), + dtype=self.dtype) if periods > 0: a = empty b = self[:-periods] else: - a = self[abs(periods) :] + a = self[abs(periods):] b = empty return self._concat_same_type([a, b]) @@ -1002,7 +1016,8 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize(self, + na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: """ Encode the extension array as an enumerated type. @@ -1043,16 +1058,14 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: # Complete control over factorization. arr, na_value = self._values_for_factorize() - codes, uniques = factorize_array( - arr, na_sentinel=na_sentinel, na_value=na_value - ) + codes, uniques = factorize_array(arr, + na_sentinel=na_sentinel, + na_value=na_value) uniques_ea = self._from_factorized(uniques, self) return codes, uniques_ea - _extension_array_shared_docs[ - "repeat" - ] = """ + _extension_array_shared_docs["repeat"] = """ Repeat elements of a %(klass)s. Returns a new %(klass)s where each element of the current %(klass)s @@ -1245,9 +1258,9 @@ def __repr__(self) -> str: # the short repr has no trailing newline, while the truncated # repr does. 
So we include a newline in our template, and strip # any trailing newlines from format_object_summary - data = format_object_summary( - self, self._formatter(), indent_for_name=False - ).rstrip(", \n") + data = format_object_summary(self, + self._formatter(), + indent_for_name=False).rstrip(", \n") class_name = f"<{type(self).__name__}>\n" return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" @@ -1258,9 +1271,8 @@ def _repr_2d(self) -> str: # repr does. So we include a newline in our template, and strip # any trailing newlines from format_object_summary lines = [ - format_object_summary(x, self._formatter(), indent_for_name=False).rstrip( - ", \n" - ) + format_object_summary(x, self._formatter(), + indent_for_name=False).rstrip(", \n") for x in self ] data = ",\n".join(lines) @@ -1312,7 +1324,9 @@ def transpose(self, *axes: int) -> ExtensionArray: def T(self) -> ExtensionArray: return self.transpose() - def ravel(self, order: Literal["C", "F", "A", "K"] | None = "C") -> ExtensionArray: + def ravel( + self, + order: Literal["C", "F", "A", "K"] | None = "C") -> ExtensionArray: """ Return a flattened view on this array. @@ -1333,8 +1347,8 @@ def ravel(self, order: Literal["C", "F", "A", "K"] | None = "C") -> ExtensionArr @classmethod def _concat_same_type( - cls: type[ExtensionArrayT], to_concat: Sequence[ExtensionArrayT] - ) -> ExtensionArrayT: + cls: type[ExtensionArrayT], + to_concat: Sequence[ExtensionArrayT]) -> ExtensionArrayT: """ Concatenate multiple array of this dtype. @@ -1388,10 +1402,8 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ meth = getattr(self, name, None) if meth is None: - raise TypeError( - f"'{type(self).__name__}' with dtype {self.dtype} " - f"does not support reduction '{name}'" - ) + raise TypeError(f"'{type(self).__name__}' with dtype {self.dtype} " + f"does not support reduction '{name}'") return meth(skipna=skipna, **kwargs) # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 @@ -1419,7 +1431,8 @@ def tolist(self) -> list: return [x.tolist() for x in self] return list(self) - def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT: + def delete(self: ExtensionArrayT, + loc: PositionalIndexer) -> ExtensionArrayT: indexer = np.delete(np.arange(len(self)), loc) return self.take(indexer) @@ -1478,9 +1491,8 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: self[mask] = val - def _where( - self: ExtensionArrayT, mask: npt.NDArray[np.bool_], value - ) -> ExtensionArrayT: + def _where(self: ExtensionArrayT, mask: npt.NDArray[np.bool_], + value) -> ExtensionArrayT: """ Analogue to np.where(mask, self, value) @@ -1503,9 +1515,8 @@ def _where( result[~mask] = val return result - def _fill_mask_inplace( - self, method: str, limit, mask: npt.NDArray[np.bool_] - ) -> None: + def _fill_mask_inplace(self, method: str, limit, + mask: npt.NDArray[np.bool_]) -> None: """ Replace values in locations specified by 'mask' using pad or backfill. @@ -1571,9 +1582,8 @@ def _empty(cls, shape: Shape, dtype: ExtensionDtype): ) return result - def _quantile( - self: ExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str - ) -> ExtensionArrayT: + def _quantile(self: ExtensionArrayT, qs: npt.NDArray[np.float64], + interpolation: str) -> ExtensionArrayT: """ Compute the quantiles of self for each quantile in `qs`. 
@@ -1593,7 +1603,8 @@ def _quantile( arr = np.atleast_2d(np.asarray(self)) fill_value = np.nan - res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) + res_values = quantile_with_mask(arr, mask, fill_value, qs, + interpolation) if self.ndim == 2: # i.e. DatetimeArray @@ -1628,29 +1639,27 @@ def _mode(self: ExtensionArrayT, dropna: bool = True) -> ExtensionArrayT: def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): if any( - isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)) for other in inputs - ): + isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)) + for other in inputs): return NotImplemented result = arraylike.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) + self, ufunc, method, *inputs, **kwargs) if result is not NotImplemented: return result if "out" in kwargs: - return arraylike.dispatch_ufunc_with_out( - self, ufunc, method, *inputs, **kwargs - ) + return arraylike.dispatch_ufunc_with_out(self, ufunc, method, + *inputs, **kwargs) if method == "reduce": result = arraylike.dispatch_reduction_ufunc( - self, ufunc, method, *inputs, **kwargs - ) + self, ufunc, method, *inputs, **kwargs) if result is not NotImplemented: return result - return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) + return arraylike.default_array_ufunc(self, ufunc, method, *inputs, + **kwargs) class ExtensionOpsMixin: @@ -1680,14 +1689,17 @@ def _add_arithmetic_ops(cls): setattr(cls, "__rpow__", cls._create_arithmetic_method(roperator.rpow)) setattr(cls, "__mod__", cls._create_arithmetic_method(operator.mod)) setattr(cls, "__rmod__", cls._create_arithmetic_method(roperator.rmod)) - setattr(cls, "__floordiv__", cls._create_arithmetic_method(operator.floordiv)) - setattr( - cls, "__rfloordiv__", cls._create_arithmetic_method(roperator.rfloordiv) - ) - setattr(cls, "__truediv__", cls._create_arithmetic_method(operator.truediv)) - setattr(cls, "__rtruediv__", cls._create_arithmetic_method(roperator.rtruediv)) + setattr(cls, "__floordiv__", + cls._create_arithmetic_method(operator.floordiv)) + setattr(cls, "__rfloordiv__", + cls._create_arithmetic_method(roperator.rfloordiv)) + setattr(cls, "__truediv__", + cls._create_arithmetic_method(operator.truediv)) + setattr(cls, "__rtruediv__", + cls._create_arithmetic_method(roperator.rtruediv)) setattr(cls, "__divmod__", cls._create_arithmetic_method(divmod)) - setattr(cls, "__rdivmod__", cls._create_arithmetic_method(roperator.rdivmod)) + setattr(cls, "__rdivmod__", + cls._create_arithmetic_method(roperator.rdivmod)) @classmethod def _create_comparison_method(cls, op): @@ -1783,6 +1795,7 @@ def _create_method(cls, op, coerce_to_dtype=True, result_dtype=None): """ def _binop(self, other): + def convert_values(param): if isinstance(param, ExtensionArray) or is_list_like(param): ovalues = param diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 569a9e83ec9e8..3feca76965110 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -117,7 +117,6 @@ if TYPE_CHECKING: from pandas import Index - CategoricalT = TypeVar("CategoricalT", bound="Categorical") @@ -135,8 +134,7 @@ def func(self, other): if not self.ordered: if opname in ["__lt__", "__gt__", "__le__", "__ge__"]: raise TypeError( - "Unordered Categoricals can only compare equality or not" - ) + "Unordered Categoricals can only compare equality or not") if isinstance(other, Categorical): # Two Categoricals can only be compared if the categories are 
# the same (maybe up to ordering, depending on ordered) @@ -145,11 +143,13 @@ def func(self, other): if not self._categories_match_up_to_permutation(other): raise TypeError(msg) - if not self.ordered and not self.categories.equals(other.categories): + if not self.ordered and not self.categories.equals( + other.categories): # both unordered and different order - other_codes = recode_for_categories( - other.codes, other.categories, self.categories, copy=False - ) + other_codes = recode_for_categories(other.codes, + other.categories, + self.categories, + copy=False) else: other_codes = other._codes @@ -180,10 +180,10 @@ def func(self, other): raise TypeError( f"Cannot compare a Categorical for op {opname} with " f"type {type(other)}.\nIf you want to compare values, " - "use 'np.asarray(cat) other'." - ) + "use 'np.asarray(cat) other'.") - if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype): + if isinstance(other, ExtensionArray) and needs_i8_conversion( + other.dtype): # We would return NotImplemented here, but that messes up # ExtensionIndex's wrapped methods return op(other, self) @@ -245,7 +245,8 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): +class Categorical(NDArrayBackedExtensionArray, PandasObject, + ObjectStringArrayMixin): """ Represent a categorical variable in classic R / S-plus fashion. @@ -369,9 +370,8 @@ def __init__( copy: bool = True, ): - dtype = CategoricalDtype._from_values_or_dtype( - values, categories, ordered, dtype - ) + dtype = CategoricalDtype._from_values_or_dtype(values, categories, + ordered, dtype) # At this point, dtype is always a CategoricalDtype, but # we may have dtype.categories be None, and we need to # infer categories in a factorization step further below @@ -409,8 +409,7 @@ def __init__( if values.ndim > 1: # preempt sanitize_array from raising ValueError raise NotImplementedError( - "> 1 ndim Categorical are not supported at this time" - ) + "> 1 ndim Categorical are not supported at this time") values = sanitize_array(values, None) else: # i.e. must be a list @@ -441,8 +440,7 @@ def __init__( raise TypeError( "'values' is not ordered, please " "explicitly specify the categories order " - "by passing in a categories argument." - ) from err + "by passing in a categories argument.") from err # we're inferring from values dtype = CategoricalDtype(categories, dtype.ordered) @@ -450,10 +448,12 @@ def __init__( elif is_categorical_dtype(values.dtype): # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no # attribute "_codes" - old_codes = extract_array(values)._codes # type: ignore[union-attr] - codes = recode_for_categories( - old_codes, values.dtype.categories, dtype.categories, copy=copy - ) + old_codes = extract_array( + values)._codes # type: ignore[union-attr] + codes = recode_for_categories(old_codes, + values.dtype.categories, + dtype.categories, + copy=copy) else: codes = _get_codes_for_values(values, dtype.categories) @@ -482,7 +482,11 @@ def _constructor(self) -> type[Categorical]: return Categorical @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): + def _from_sequence(cls, + scalars, + *, + dtype: Dtype | None = None, + copy=False): return Categorical(scalars, dtype=dtype, copy=copy) @overload @@ -490,7 +494,9 @@ def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... 
@overload - def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + def astype(self, + dtype: ExtensionDtype, + copy: bool = ...) -> ExtensionArray: ... @overload @@ -543,18 +549,17 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: fill_value = self.categories._na_value if not is_valid_na_for_dtype(fill_value, dtype): fill_value = lib.item_from_zerodim( - np.array(self.categories._na_value).astype(dtype) - ) + np.array(self.categories._na_value).astype(dtype)) except ( - TypeError, # downstream error msg for CategoricalIndex is misleading - ValueError, + TypeError, # downstream error msg for CategoricalIndex is misleading + ValueError, ): msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" raise ValueError(msg) - result = take_nd( - new_cats, ensure_platform_int(self._codes), fill_value=fill_value - ) + result = take_nd(new_cats, + ensure_platform_int(self._codes), + fill_value=fill_value) return result @@ -565,9 +570,11 @@ def to_list(self): return self.tolist() @classmethod - def _from_inferred_categories( - cls, inferred_categories, inferred_codes, dtype, true_values=None - ): + def _from_inferred_categories(cls, + inferred_categories, + inferred_codes, + dtype, + true_values=None): """ Construct a Categorical from inferred values. @@ -596,9 +603,8 @@ def _from_inferred_categories( ) cats = Index(inferred_categories) - known_categories = ( - isinstance(dtype, CategoricalDtype) and dtype.categories is not None - ) + known_categories = (isinstance(dtype, CategoricalDtype) + and dtype.categories is not None) if known_categories: # Convert to a specialized type with `dtype` if specified. @@ -634,9 +640,11 @@ def _from_inferred_categories( return cls(codes, dtype=dtype, fastpath=True) @classmethod - def from_codes( - cls, codes, categories=None, ordered=None, dtype: Dtype | None = None - ): + def from_codes(cls, + codes, + categories=None, + ordered=None, + dtype: Dtype | None = None): """ Make a Categorical type from codes and categories or dtype. @@ -675,14 +683,12 @@ def from_codes( ['a', 'b', 'a', 'b'] Categories (2, object): ['a' < 'b'] """ - dtype = CategoricalDtype._from_values_or_dtype( - categories=categories, ordered=ordered, dtype=dtype - ) + dtype = CategoricalDtype._from_values_or_dtype(categories=categories, + ordered=ordered, + dtype=dtype) if dtype.categories is None: - msg = ( - "The categories must be provided in 'categories' or " - "'dtype'. Both were None." - ) + msg = ("The categories must be provided in 'categories' or " + "'dtype'. Both were None.") raise ValueError(msg) if is_extension_array_dtype(codes) and is_integer_dtype(codes): @@ -695,8 +701,10 @@ def from_codes( if len(codes) and not is_integer_dtype(codes): raise ValueError("codes need to be array-like integers") - if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): - raise ValueError("codes need to be between -1 and len(categories)-1") + if len(codes) and (codes.max() >= len(dtype.categories) + or codes.min() < -1): + raise ValueError( + "codes need to be between -1 and len(categories)-1") return cls(codes, dtype=dtype, fastpath=True) @@ -737,13 +745,10 @@ def categories(self): @categories.setter def categories(self, categories): new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if self.dtype.categories is not None and len(self.dtype.categories) != len( - new_dtype.categories - ): - raise ValueError( - "new categories need to have the same number of " - "items as the old categories!" 
- ) + if self.dtype.categories is not None and len( + self.dtype.categories) != len(new_dtype.categories): + raise ValueError("new categories need to have the same number of " + "items as the old categories!") super().__init__(self._ndarray, new_dtype) @property @@ -795,18 +800,14 @@ def _set_categories(self, categories, fastpath=False): Categories (2, object): ['a', 'c'] """ if fastpath: - new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) + new_dtype = CategoricalDtype._from_fastpath( + categories, self.ordered) else: new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if ( - not fastpath - and self.dtype.categories is not None - and len(new_dtype.categories) != len(self.dtype.categories) - ): - raise ValueError( - "new categories need to have the same number of " - "items than the old categories!" - ) + if (not fastpath and self.dtype.categories is not None + and len(new_dtype.categories) != len(self.dtype.categories)): + raise ValueError("new categories need to have the same number of " + "items than the old categories!") super().__init__(self._ndarray, new_dtype) @@ -823,7 +824,8 @@ def _set_dtype(self, dtype: CategoricalDtype) -> Categorical: We don't do any validation here. It's assumed that the dtype is a (valid) instance of `CategoricalDtype`. """ - codes = recode_for_categories(self.codes, self.categories, dtype.categories) + codes = recode_for_categories(self.codes, self.categories, + dtype.categories) return type(self)(codes, dtype=dtype, fastpath=True) def set_ordered(self, value, inplace=False): @@ -881,9 +883,11 @@ def as_unordered(self, inplace=False): inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(False, inplace=inplace) - def set_categories( - self, new_categories, ordered=None, rename=False, inplace=no_default - ): + def set_categories(self, + new_categories, + ordered=None, + rename=False, + inplace=no_default): """ Set the categories to the specified new_categories. @@ -955,16 +959,14 @@ def set_categories( cat = self if inplace else self.copy() if rename: - if cat.dtype.categories is not None and len(new_dtype.categories) < len( - cat.dtype.categories - ): + if cat.dtype.categories is not None and len( + new_dtype.categories) < len(cat.dtype.categories): # remove all _codes which are larger and set to -1/NaN cat._codes[cat._codes >= len(new_dtype.categories)] = -1 codes = cat._codes else: - codes = recode_for_categories( - cat.codes, cat.categories, new_dtype.categories - ) + codes = recode_for_categories(cat.codes, cat.categories, + new_dtype.categories) NDArrayBacked.__init__(cat, codes, new_dtype) if not inplace: @@ -1052,7 +1054,9 @@ def rename_categories(self, new_categories, inplace=no_default): cat = self if inplace else self.copy() if is_dict_like(new_categories): - cat.categories = [new_categories.get(item, item) for item in cat.categories] + cat.categories = [ + new_categories.get(item, item) for item in cat.categories + ] elif callable(new_categories): cat.categories = [new_categories(item) for item in cat.categories] else: @@ -1060,7 +1064,10 @@ def rename_categories(self, new_categories, inplace=no_default): if not inplace: return cat - def reorder_categories(self, new_categories, ordered=None, inplace=no_default): + def reorder_categories(self, + new_categories, + ordered=None, + inplace=no_default): """ Reorder categories as specified in new_categories. 
@@ -1119,7 +1126,9 @@ def reorder_categories(self, new_categories, ordered=None, inplace=no_default): with catch_warnings(): simplefilter("ignore") - return self.set_categories(new_categories, ordered=ordered, inplace=inplace) + return self.set_categories(new_categories, + ordered=ordered, + inplace=inplace) def add_categories(self, new_categories, inplace=no_default): """ @@ -1261,7 +1270,9 @@ def remove_categories(self, removals, inplace=no_default): removal_set = set(removals) not_included = removal_set - set(self.dtype.categories) - new_categories = [c for c in self.dtype.categories if c not in removal_set] + new_categories = [ + c for c in self.dtype.categories if c not in removal_set + ] # GH 10156 if any(isna(removals)): @@ -1269,13 +1280,15 @@ def remove_categories(self, removals, inplace=no_default): new_categories = [x for x in new_categories if notna(x)] if len(not_included) != 0: - raise ValueError(f"removals must all be in old categories: {not_included}") + raise ValueError( + f"removals must all be in old categories: {not_included}") with catch_warnings(): simplefilter("ignore") - return self.set_categories( - new_categories, ordered=self.ordered, rename=False, inplace=inplace - ) + return self.set_categories(new_categories, + ordered=self.ordered, + rename=False, + inplace=inplace) def remove_unused_categories(self, inplace=no_default): """ @@ -1338,9 +1351,8 @@ def remove_unused_categories(self, inplace=no_default): idx, inv = idx[1:], inv - 1 new_categories = cat.dtype.categories.take(idx) - new_dtype = CategoricalDtype._from_fastpath( - new_categories, ordered=self.ordered - ) + new_dtype = CategoricalDtype._from_fastpath(new_categories, + ordered=self.ordered) new_codes = coerce_indexer_dtype(inv, new_dtype.categories) NDArrayBacked.__init__(cat, new_codes, new_dtype) if not inplace: @@ -1419,14 +1431,15 @@ def map(self, mapper): """ new_categories = self.categories.map(mapper) try: - return self.from_codes( - self._codes.copy(), categories=new_categories, ordered=self.ordered - ) + return self.from_codes(self._codes.copy(), + categories=new_categories, + ordered=self.ordered) except ValueError: # NA values are represented in self._codes with -1 # np.take causes NA values to take final element in new_categories if np.any(self._codes == -1): - new_categories = new_categories.insert(len(new_categories), np.nan) + new_categories = new_categories.insert(len(new_categories), + np.nan) return np.take(new_categories, self._codes) __eq__ = _cat_compare_op(operator.eq) @@ -1473,8 +1486,7 @@ def _validate_scalar(self, fill_value): else: raise TypeError( "Cannot setitem on a Categorical with a new " - f"category ({fill_value}), set the categories first" - ) + f"category ({fill_value}), set the categories first") return fill_value # ------------------------------------------------------------- @@ -1501,32 +1513,27 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods - result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) + result = ops.maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, + *inputs, **kwargs) if result is not NotImplemented: return result if "out" in kwargs: # e.g. test_numpy_ufuncs_out - return arraylike.dispatch_ufunc_with_out( - self, ufunc, method, *inputs, **kwargs - ) + return arraylike.dispatch_ufunc_with_out(self, ufunc, method, + *inputs, **kwargs) if method == "reduce": # e.g. 
TestCategoricalAnalytics::test_min_max_ordered result = arraylike.dispatch_reduction_ufunc( - self, ufunc, method, *inputs, **kwargs - ) + self, ufunc, method, *inputs, **kwargs) if result is not NotImplemented: return result # for all other cases, raise for now (similarly as what happens in # Series.__array_prepare__) - raise TypeError( - f"Object with dtype {self.dtype} cannot perform " - f"the numpy op {ufunc.__name__}" - ) + raise TypeError(f"Object with dtype {self.dtype} cannot perform " + f"the numpy op {ufunc.__name__}") def __setstate__(self, state): """Necessary for making this object picklable""" @@ -1534,7 +1541,8 @@ def __setstate__(self, state): return super().__setstate__(state) if "_dtype" not in state: - state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"]) + state["_dtype"] = CategoricalDtype(state["_categories"], + state["_ordered"]) if "_codes" in state and "_ndarray" not in state: # backward compat, changed what is property vs attribute @@ -1569,7 +1577,8 @@ def memory_usage(self, deep: bool = False) -> int: -------- numpy.ndarray.nbytes """ - return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) + return self._codes.nbytes + self.dtype.categories.memory_usage( + deep=deep) def isna(self) -> np.ndarray: """ @@ -1659,8 +1668,8 @@ def value_counts(self, dropna: bool = True): # "ExtensionDtype" @classmethod def _empty( # type: ignore[override] - cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype - ) -> Categorical: + cls: type_t[Categorical], shape: Shape, + dtype: CategoricalDtype) -> Categorical: """ Analogous to np.empty(shape, dtype=dtype) @@ -1694,17 +1703,16 @@ def _internal_get_values(self): if needs_i8_conversion(self.categories.dtype): return self.categories.take(self._codes, fill_value=NaT) elif is_integer_dtype(self.categories) and -1 in self._codes: - return self.categories.astype("object").take(self._codes, fill_value=np.nan) + return self.categories.astype("object").take(self._codes, + fill_value=np.nan) return np.array(self) def check_for_ordered(self, op): """assert that we are ordered""" if not self.ordered: - raise TypeError( - f"Categorical is not ordered for operation {op}\n" - "you can use .as_ordered() to change the " - "Categorical to an ordered one\n" - ) + raise TypeError(f"Categorical is not ordered for operation {op}\n" + "you can use .as_ordered() to change the " + "Categorical to an ordered one\n") @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def argsort(self, ascending=True, kind="quicksort", **kwargs): @@ -1760,9 +1768,10 @@ def argsort(self, ascending=True, kind="quicksort", **kwargs): """ return super().argsort(ascending=ascending, kind=kind, **kwargs) - def sort_values( - self, inplace: bool = False, ascending: bool = True, na_position: str = "last" - ): + def sort_values(self, + inplace: bool = False, + ascending: bool = True, + na_position: str = "last"): """ Sort the Categorical by category value returning a new Categorical by default. 
@@ -1840,7 +1849,9 @@ def sort_values( if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {repr(na_position)}") - sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) + sorted_idx = nargsort(self, + ascending=ascending, + na_position=na_position) if inplace: self._codes[:] = self._codes[sorted_idx] @@ -1898,8 +1909,7 @@ def _values_for_rank(self): # reorder the categories (so rank can use the float codes) # instead of passing an object array to rank values = np.array( - self.rename_categories(Series(self.categories).rank().values) - ) + self.rename_categories(Series(self.categories).rank().values)) return values def to_dense(self) -> np.ndarray: @@ -1993,7 +2003,7 @@ def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str: """ num = max_vals // 2 head = self[:num]._get_repr(length=False, footer=False) - tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) + tail = self[-(max_vals - num):]._get_repr(length=False, footer=False) result = f"{head[:-1]}, ..., {tail[1:]}" if footer: @@ -2005,16 +2015,13 @@ def _repr_categories(self) -> list[str]: """ return the base repr for the categories """ - max_categories = ( - 10 - if get_option("display.max_categories") == 0 - else get_option("display.max_categories") - ) + max_categories = (10 if get_option("display.max_categories") == 0 else + get_option("display.max_categories")) from pandas.io.formats import format as fmt - format_array = partial( - fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC - ) + format_array = partial(fmt.format_array, + formatter=None, + quoting=QUOTE_NONNUMERIC) if len(self.categories) > max_categories: num = max_categories // 2 head = format_array(self.categories[:num]) @@ -2060,12 +2067,16 @@ def _repr_footer(self) -> str: info = self._repr_categories_info() return f"Length: {len(self)}\n{info}" - def _get_repr(self, length: bool = True, na_rep="NaN", footer: bool = True) -> str: + def _get_repr(self, + length: bool = True, + na_rep="NaN", + footer: bool = True) -> str: from pandas.io.formats import format as fmt - formatter = fmt.CategoricalFormatter( - self, length=length, na_rep=na_rep, footer=footer - ) + formatter = fmt.CategoricalFormatter(self, + length=length, + na_rep=na_rep, + footer=footer) result = formatter.to_string() return str(result) @@ -2093,10 +2104,8 @@ def _validate_listlike(self, value): # require identical categories set if isinstance(value, Categorical): if not is_dtype_equal(self.dtype, value.dtype): - raise TypeError( - "Cannot set a Categorical with another, " - "without identical categories" - ) + raise TypeError("Cannot set a Categorical with another, " + "without identical categories") # is_dtype_equal implies categories_match_up_to_permutation value = self._encode_with_my_categories(value) return value._codes @@ -2105,16 +2114,13 @@ def _validate_listlike(self, value): # tupleize_cols=False for e.g. 
test_fillna_iterable_category GH#41914 to_add = Index._with_infer(value, tupleize_cols=False).difference( - self.categories - ) + self.categories) # no assignments of values not in categories, but it's always ok to set # something to np.nan if len(to_add) and not isna(to_add).all(): - raise TypeError( - "Cannot setitem on a Categorical with a new " - "category, set the categories first" - ) + raise TypeError("Cannot setitem on a Categorical with a new " + "category, set the categories first") codes = self.categories.get_indexer(value) return codes.astype(self._ndarray.dtype, copy=False) @@ -2146,9 +2152,8 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: """ categories = self.categories - r, counts = libalgos.groupsort_indexer( - ensure_platform_int(self.codes), categories.size - ) + r, counts = libalgos.groupsort_indexer(ensure_platform_int(self.codes), + categories.size) counts = ensure_int64(counts).cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) return dict(zip(categories, _result)) @@ -2326,9 +2331,9 @@ def equals(self, other: object) -> bool: return False @classmethod - def _concat_same_type( - cls: type[CategoricalT], to_concat: Sequence[CategoricalT], axis: int = 0 - ) -> CategoricalT: + def _concat_same_type(cls: type[CategoricalT], + to_concat: Sequence[CategoricalT], + axis: int = 0) -> CategoricalT: from pandas.core.dtypes.concat import union_categoricals first = to_concat[0] @@ -2369,9 +2374,10 @@ def _encode_with_my_categories(self, other: Categorical) -> Categorical: # Indexing on codes is more efficient if categories are the same, # so we can apply some optimizations based on the degree of # dtype-matching. - codes = recode_for_categories( - other.codes, other.categories, self.categories, copy=False - ) + codes = recode_for_categories(other.codes, + other.categories, + self.categories, + copy=False) return self._from_backing_data(codes) def _categories_match_up_to_permutation(self, other: Categorical) -> bool: @@ -2465,10 +2471,8 @@ def isin(self, values) -> npt.NDArray[np.bool_]: """ if not is_list_like(values): values_type = type(values).__name__ - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a [{values_type}]" - ) + raise TypeError("only list-like objects are allowed to be passed " + f"to isin(), you passed a [{values_type}]") values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) code_values = self.categories.get_indexer(values) @@ -2509,7 +2513,9 @@ def replace(self, to_replace, value, inplace: bool = False): FutureWarning, stacklevel=find_stack_level(), ) - return self._replace(to_replace=to_replace, value=value, inplace=inplace) + return self._replace(to_replace=to_replace, + value=value, + inplace=inplace) def _replace(self, *, to_replace, value, inplace: bool = False): inplace = validate_bool_kwarg(inplace, "inplace") @@ -2518,7 +2524,10 @@ def _replace(self, *, to_replace, value, inplace: bool = False): # build a dict of (to replace -> value) pairs if is_list_like(to_replace): # if to_replace is list-like and value is scalar - replace_dict = {replace_value: value for replace_value in to_replace} + replace_dict = { + replace_value: value + for replace_value in to_replace + } else: # if both to_replace and value are scalar replace_dict = {to_replace: value} @@ -2554,9 +2563,11 @@ def _replace(self, *, to_replace, value, inplace: bool = False): # ------------------------------------------------------------------------ # String methods 
interface - def _str_map( - self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True - ): + def _str_map(self, + f, + na_value=np.nan, + dtype=np.dtype("object"), + convert: bool = True): # Optimization to apply the callable `f` to the categories once # and rebuild the result by `take`ing from the result with the codes. # Returns the same type as the object-dtype implementation though. @@ -2564,7 +2575,8 @@ def _str_map( categories = self.categories codes = self.codes - result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype) + result = PandasArray(categories.to_numpy())._str_map( + f, na_value, dtype) return take_nd(result, codes, fill_value=na_value) def _str_get_dummies(self, sep="|"): @@ -2577,9 +2589,9 @@ def _str_get_dummies(self, sep="|"): # The Series.cat accessor -@delegate_names( - delegate=Categorical, accessors=["categories", "ordered"], typ="property" -) +@delegate_names(delegate=Categorical, + accessors=["categories", "ordered"], + typ="property") @delegate_names( delegate=Categorical, accessors=[ @@ -2714,7 +2726,8 @@ def __init__(self, data): @staticmethod def _validate(data): if not is_categorical_dtype(data.dtype): - raise AttributeError("Can only use .cat accessor with a 'category' dtype") + raise AttributeError( + "Can only use .cat accessor with a 'category' dtype") def _delegate_property_get(self, name): return getattr(self._parent, name) @@ -2758,9 +2771,10 @@ def _get_codes_for_values(values, categories: Index) -> np.ndarray: return coerce_indexer_dtype(codes, categories) -def recode_for_categories( - codes: np.ndarray, old_categories, new_categories, copy: bool = True -) -> np.ndarray: +def recode_for_categories(codes: np.ndarray, + old_categories, + new_categories, + copy: bool = True) -> np.ndarray: """ Convert a set of codes for to a new set of categories @@ -2794,9 +2808,8 @@ def recode_for_categories( return codes.copy() return codes - indexer = coerce_indexer_dtype( - new_categories.get_indexer(old_categories), new_categories - ) + indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), + new_categories) new_codes = take_nd(indexer, codes, fill_value=-1) return new_codes @@ -2841,7 +2854,8 @@ def factorize_from_iterable(values) -> tuple[np.ndarray, Index]: return codes, categories -def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]: +def factorize_from_iterables( + iterables) -> tuple[list[np.ndarray], list[Index]]: """ A higher-level wrapper over `factorize_from_iterable`. 
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 277320930d5f9..998a87b7a28fb 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -115,6 +115,7 @@ def tz_to_dtype(tz): def _field_accessor(name: str, field: str, docstring=None): + def f(self): values = self._local_timestamps() @@ -128,9 +129,8 @@ def f(self): kwds = freq.kwds month_kw = kwds.get("startingMonth", kwds.get("month", 12)) - result = fields.get_start_end_field( - values, field, self.freqstr, month_kw - ) + result = fields.get_start_end_field(values, field, + self.freqstr, month_kw) else: result = fields.get_date_field(values, field) @@ -143,9 +143,9 @@ def f(self): else: result = fields.get_date_field(values, field) - result = self._maybe_mask_results( - result, fill_value=None, convert="float64" - ) + result = self._maybe_mask_results(result, + fill_value=None, + convert="float64") return result @@ -228,7 +228,8 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): "nanosecond", ] _other_ops: list[str] = ["date", "time", "timetz"] - _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + _other_ops + _datetimelike_ops: list[ + str] = _field_ops + _object_ops + _bool_ops + _other_ops _datetimelike_methods: list[str] = [ "to_period", "tz_localize", @@ -254,7 +255,11 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): _dtype: np.dtype | DatetimeTZDtype _freq = None - def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy: bool = False): + def __init__(self, + values, + dtype=DT64NS_DTYPE, + freq=None, + copy: bool = False): values = extract_array(values, extract_numpy=True) if isinstance(values, IntegerArray): values = values.to_numpy("int64", na_value=iNaT) @@ -268,10 +273,8 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy: bool = False): dtype = DatetimeTZDtype(tz=dtype.tz) elif dtz and values.tz: if not timezones.tz_compare(dtz, values.tz): - msg = ( - "Timezone of the array and 'dtype' do not match. " - f"'{dtz}' != '{values.tz}'" - ) + msg = ("Timezone of the array and 'dtype' do not match. " + f"'{dtz}' != '{values.tz}'") raise TypeError(msg) elif values.tz: dtype = values.dtype @@ -297,16 +300,14 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy: bool = False): if values.dtype != DT64NS_DTYPE: raise ValueError( "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'. " - f"Got {values.dtype} instead." - ) + f"Got {values.dtype} instead.") dtype = _validate_dt64_dtype(dtype) if freq == "infer": raise ValueError( "Frequency inference not allowed in DatetimeArray.__init__. " - "Use 'pd.array()' instead." 
- ) + "Use 'pd.array()' instead.") if copy: values = values.copy() @@ -329,8 +330,10 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy: bool = False): # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked" @classmethod def _simple_new( # type: ignore[override] - cls, values: np.ndarray, freq: BaseOffset | None = None, dtype=DT64NS_DTYPE - ) -> DatetimeArray: + cls, + values: np.ndarray, + freq: BaseOffset | None = None, + dtype=DT64NS_DTYPE) -> DatetimeArray: assert isinstance(values, np.ndarray) assert values.dtype == DT64NS_DTYPE @@ -369,7 +372,8 @@ def _from_sequence_not_strict( ambiguous=ambiguous, ) - freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) + freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, + freq_infer) if explicit_none: freq = None @@ -403,13 +407,12 @@ def _generate_range( periods = dtl.validate_periods(periods) if freq is None and any(x is None for x in [periods, start, end]): - raise ValueError("Must provide freq argument if no data is supplied") + raise ValueError( + "Must provide freq argument if no data is supplied") if com.count_not_none(start, end, periods, freq) != 3: - raise ValueError( - "Of the four parameters: start, end, periods, " - "and freq, exactly three must be specified" - ) + raise ValueError("Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified") freq = to_offset(freq) if start is not None: @@ -422,19 +425,18 @@ def _generate_range( raise ValueError("Neither `start` nor `end` can be NaT") left_inclusive, right_inclusive = validate_inclusive(inclusive) - start, end, _normalized = _maybe_normalize_endpoints(start, end, normalize) + start, end, _normalized = _maybe_normalize_endpoints( + start, end, normalize) tz = _infer_tz_from_endpoints(start, end, tz) if tz is not None: # Localize the start and end arguments start_tz = None if start is None else start.tz end_tz = None if end is None else end.tz - start = _maybe_localize_point( - start, start_tz, start, freq, tz, ambiguous, nonexistent - ) - end = _maybe_localize_point( - end, end_tz, end, freq, tz, ambiguous, nonexistent - ) + start = _maybe_localize_point(start, start_tz, start, freq, tz, + ambiguous, nonexistent) + end = _maybe_localize_point(end, end_tz, end, freq, tz, ambiguous, + nonexistent) if freq is not None: # We break Day arithmetic (fixed 24 hour) here and opt for # Day to mean calendar day (23/24/25 hour). 
Therefore, strip @@ -448,15 +450,17 @@ def _generate_range( if isinstance(freq, Tick): i8values = generate_regular_range(start, end, periods, freq) else: - xdr = generate_range(start=start, end=end, periods=periods, offset=freq) + xdr = generate_range(start=start, + end=end, + periods=periods, + offset=freq) i8values = np.array([x.value for x in xdr], dtype=np.int64) endpoint_tz = start.tz if start is not None else end.tz if tz is not None and endpoint_tz is None: i8values = tzconversion.tz_localize_to_utc( - i8values, tz, ambiguous=ambiguous, nonexistent=nonexistent - ) + i8values, tz, ambiguous=ambiguous, nonexistent=nonexistent) # i8values is localized datetime64 array -> have to convert # start/end as well to compare @@ -469,10 +473,9 @@ def _generate_range( # Nanosecond-granularity timestamps aren't always correctly # representable with doubles, so we limit the range that we # pass to np.linspace as much as possible - i8values = ( - np.linspace(0, end.value - start.value, periods, dtype="int64") - + start.value - ) + i8values = (np.linspace( + 0, end.value - start.value, periods, dtype="int64") + + start.value) if i8values.dtype != "i8": # 2022-01-09 I (brock) am not sure if it is possible for this # to overflow and cast to e.g. f8, but if it does we need to cast @@ -485,9 +488,11 @@ def _generate_range( start_i8 = Timestamp(start).value end_i8 = Timestamp(end).value if not left_inclusive or not right_inclusive: - if not left_inclusive and len(i8values) and i8values[0] == start_i8: + if not left_inclusive and len( + i8values) and i8values[0] == start_i8: i8values = i8values[1:] - if not right_inclusive and len(i8values) and i8values[-1] == end_i8: + if not right_inclusive and len( + i8values) and i8values[-1] == end_i8: i8values = i8values[:-1] dt64_values = i8values.view("datetime64[ns]") @@ -512,7 +517,8 @@ def _check_compatible_with(self, other, setitem: bool = False): self._assert_tzawareness_compat(other) if setitem: # Stricter check for setitem vs comparison methods - if self.tz is not None and not timezones.tz_compare(self.tz, other.tz): + if self.tz is not None and not timezones.tz_compare( + self.tz, other.tz): # TODO(2.0): remove this check. GH#37605 warnings.warn( "Setitem-like behavior with mismatched timezones is deprecated " @@ -526,7 +532,8 @@ def _check_compatible_with(self, other, setitem: bool = False): FutureWarning, stacklevel=find_stack_level(), ) - raise ValueError(f"Timezones don't match. '{self.tz}' != '{other.tz}'") + raise ValueError( + f"Timezones don't match. '{self.tz}' != '{other.tz}'") # ----------------------------------------------------------------- # Descriptive Properties @@ -587,10 +594,8 @@ def tz(self) -> tzinfo | None: @tz.setter def tz(self, value): # GH 3746: Prevent localizing or converting the index by setting tz - raise AttributeError( - "Cannot directly set timezone. Use tz_localize() " - "or tz_convert() as appropriate" - ) + raise AttributeError("Cannot directly set timezone. 
Use tz_localize() " + "or tz_convert() as appropriate") @property def tzinfo(self) -> tzinfo | None: @@ -641,9 +646,10 @@ def __iter__(self): for i in range(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, length) - converted = ints_to_pydatetime( - data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" - ) + converted = ints_to_pydatetime(data[start_i:end_i], + tz=self.tz, + freq=self.freq, + box="timestamp") yield from converted def astype(self, dtype, copy: bool = True): @@ -661,7 +667,8 @@ def astype(self, dtype, copy: bool = True): elif is_datetime64_ns_dtype(dtype): return astype_dt64_to_dt64tz(self, dtype, copy, via_utc=False) - elif self.tz is None and is_datetime64_dtype(dtype) and dtype != self.dtype: + elif self.tz is None and is_datetime64_dtype( + dtype) and dtype != self.dtype: # unit conversion e.g. datetime64[s] return self._ndarray.astype(dtype) @@ -673,16 +680,19 @@ def astype(self, dtype, copy: bool = True): # Rendering Methods @dtl.ravel_compat - def _format_native_types( - self, *, na_rep="NaT", date_format=None, **kwargs - ) -> npt.NDArray[np.object_]: + def _format_native_types(self, + *, + na_rep="NaT", + date_format=None, + **kwargs) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_datetime64_from_values fmt = get_format_datetime64_from_values(self, date_format) - return tslib.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep - ) + return tslib.format_array_from_datetime(self.asi8, + tz=self.tz, + format=fmt, + na_rep=na_rep) # ----------------------------------------------------------------- # Comparison Methods @@ -717,8 +727,7 @@ def _assert_tzawareness_compat(self, other) -> None: ) elif other_tz is None: raise TypeError( - "Cannot compare tz-naive and tz-aware datetime-like objects" - ) + "Cannot compare tz-naive and tz-aware datetime-like objects") # ----------------------------------------------------------------- # Arithmetic Methods @@ -741,7 +750,9 @@ def _sub_datetime_arraylike(self, other): self_i8 = self.asi8 other_i8 = other.asi8 arr_mask = self._isnan | other._isnan - new_values = checked_add_with_arr(self_i8, -other_i8, arr_mask=arr_mask) + new_values = checked_add_with_arr(self_i8, + -other_i8, + arr_mask=arr_mask) if self._hasna or other._hasna: np.putmask(new_values, arr_mask, iNaT) return new_values.view("timedelta64[ns]") @@ -884,7 +895,10 @@ def tz_convert(self, tz) -> DatetimeArray: return self._simple_new(self._ndarray, dtype=dtype, freq=self.freq) @dtl.ravel_compat - def tz_localize(self, tz, ambiguous="raise", nonexistent="raise") -> DatetimeArray: + def tz_localize(self, + tz, + ambiguous="raise", + nonexistent="raise") -> DatetimeArray: """ Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index. 
@@ -1025,19 +1039,19 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise") -> DatetimeArr 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, Europe/Warsaw] """ - nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") + nonexistent_options = ("raise", "NaT", "shift_forward", + "shift_backward") if nonexistent not in nonexistent_options and not isinstance( - nonexistent, timedelta - ): + nonexistent, timedelta): raise ValueError( "The nonexistent argument must be one of 'raise', " "'NaT', 'shift_forward', 'shift_backward' or " - "a timedelta object" - ) + "a timedelta object") if self.tz is not None: if tz is None: - new_dates = tzconversion.tz_convert_from_utc(self.asi8, self.tz) + new_dates = tzconversion.tz_convert_from_utc( + self.asi8, self.tz) else: raise TypeError("Already tz-aware, use tz_convert to convert.") else: @@ -1045,8 +1059,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise") -> DatetimeArr # Convert to UTC new_dates = tzconversion.tz_localize_to_utc( - self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent - ) + self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent) new_dates = new_dates.view(DT64NS_DTYPE) dtype = tz_to_dtype(tz) @@ -1174,8 +1187,7 @@ def to_period(self, freq=None) -> PeriodArray: if freq is None: raise ValueError( - "You must pass a freq argument as current index has none." - ) + "You must pass a freq argument as current index has none.") res = get_period_alias(freq) @@ -1245,7 +1257,9 @@ def month_name(self, locale=None): """ values = self._local_timestamps() - result = fields.get_date_name_field(values, "month_name", locale=locale) + result = fields.get_date_name_field(values, + "month_name", + locale=locale) result = self._maybe_mask_results(result, fill_value=None) return result @@ -1357,9 +1371,9 @@ def isocalendar(self) -> DataFrame: values = self._local_timestamps() sarray = fields.build_isocalendar_sarray(values) - iso_calendar_df = DataFrame( - sarray, columns=["year", "week", "day"], dtype="UInt32" - ) + iso_calendar_df = DataFrame(sarray, + columns=["year", "week", "day"], + dtype="UInt32") if self._hasna: iso_calendar_df.iloc[self._isnan] = None return iso_calendar_df @@ -1680,12 +1694,11 @@ def weekofyear(self): array([False, True, False]) """ is_month_start = _field_accessor( - "is_month_start", "is_month_start", _is_month_doc.format(first_or_last="first") - ) + "is_month_start", "is_month_start", + _is_month_doc.format(first_or_last="first")) - is_month_end = _field_accessor( - "is_month_end", "is_month_end", _is_month_doc.format(first_or_last="last") - ) + is_month_end = _field_accessor("is_month_end", "is_month_end", + _is_month_doc.format(first_or_last="last")) is_quarter_start = _field_accessor( "is_quarter_start", @@ -1915,23 +1928,12 @@ def to_julian_date(self) -> np.ndarray: testarr = month < 3 year[testarr] -= 1 month[testarr] += 12 - return ( - day - + np.fix((153 * month - 457) / 5) - + 365 * year - + np.floor(year / 4) - - np.floor(year / 100) - + np.floor(year / 400) - + 1_721_118.5 - + ( - self.hour - + self.minute / 60 - + self.second / 3600 - + self.microsecond / 3600 / 10**6 - + self.nanosecond / 3600 / 10**9 - ) - / 24 - ) + return (day + np.fix( + (153 * month - 457) / 5) + 365 * year + np.floor(year / 4) - + np.floor(year / 100) + np.floor(year / 400) + 1_721_118.5 + + (self.hour + self.minute / 60 + self.second / 3600 + + self.microsecond / 3600 / 10**6 + + self.nanosecond / 3600 / 10**9) / 24) # ----------------------------------------------------------------- 
# Reductions @@ -1971,16 +1973,20 @@ def std( from pandas.core.arrays import TimedeltaArray tda = TimedeltaArray(self._ndarray.view("i8")) - return tda.std( - axis=axis, dtype=dtype, out=out, ddof=ddof, keepdims=keepdims, skipna=skipna - ) + return tda.std(axis=axis, + dtype=dtype, + out=out, + ddof=ddof, + keepdims=keepdims, + skipna=skipna) # ------------------------------------------------------------------- # Constructor Helpers -def sequence_to_datetimes(data, require_iso8601: bool = False) -> DatetimeArray: +def sequence_to_datetimes(data, + require_iso8601: bool = False) -> DatetimeArray: """ Parse/convert the passed data to either DatetimeArray or np.ndarray[object]. """ @@ -2070,11 +2076,8 @@ def _sequence_to_dt64ns( data, copy = maybe_convert_dtype(data, copy, tz=tz) data_dtype = getattr(data, "dtype", None) - if ( - is_object_dtype(data_dtype) - or is_string_dtype(data_dtype) - or is_sparse(data_dtype) - ): + if (is_object_dtype(data_dtype) or is_string_dtype(data_dtype) + or is_sparse(data_dtype)): # TODO: We do not have tests specific to string-dtypes, # also complex or categorical or other extension copy = False @@ -2098,7 +2101,8 @@ def _sequence_to_dt64ns( # by convention, these are _already_ UTC, e.g return data.view(DT64NS_DTYPE), tz, None - utc_vals = tzconversion.tz_convert_from_utc(data.view("i8"), tz) + utc_vals = tzconversion.tz_convert_from_utc( + data.view("i8"), tz) data = utc_vals.view(DT64NS_DTYPE) elif inferred_tz: tz = inferred_tz @@ -2122,9 +2126,9 @@ def _sequence_to_dt64ns( if tz is not None: # Convert tz-naive to UTC tz = timezones.maybe_get_tz(tz) - data = tzconversion.tz_localize_to_utc( - data.view("i8"), tz, ambiguous=ambiguous - ) + data = tzconversion.tz_localize_to_utc(data.view("i8"), + tz, + ambiguous=ambiguous) data = data.view(DT64NS_DTYPE) assert data.dtype == DT64NS_DTYPE, data.dtype @@ -2212,7 +2216,8 @@ def objects_to_datetime64ns( result = result.reshape(data.shape, order=order) except ValueError as err: try: - values, tz_parsed = conversion.datetime_to_datetime64(data.ravel("K")) + values, tz_parsed = conversion.datetime_to_datetime64( + data.ravel("K")) # If tzaware, these values represent unix timestamps, so we # return them as i8 to distinguish from wall times values = values.reshape(data.shape, order=order) @@ -2221,7 +2226,8 @@ def objects_to_datetime64ns( raise err except OverflowError as err: # Exception is raised when a part of date is greater than 32 bit signed int - raise OutOfBoundsDatetime("Out of bounds nanosecond timestamp") from err + raise OutOfBoundsDatetime( + "Out of bounds nanosecond timestamp") from err if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array @@ -2276,11 +2282,8 @@ def maybe_convert_dtype(data, copy: bool, tz: tzinfo | None = None): # as wall-times instead of UTC timestamps. 
data = data.astype(DT64NS_DTYPE) copy = False - if ( - tz is not None - and len(data) > 0 - and not timezones.is_utc(timezones.maybe_get_tz(tz)) - ): + if (tz is not None and len(data) > 0 + and not timezones.is_utc(timezones.maybe_get_tz(tz))): # GH#23675, GH#45573 deprecate to treat symmetrically with integer dtypes warnings.warn( "The behavior of DatetimeArray._from_sequence with a timezone-aware " @@ -2296,7 +2299,8 @@ def maybe_convert_dtype(data, copy: bool, tz: tzinfo | None = None): elif is_timedelta64_dtype(data.dtype) or is_bool_dtype(data.dtype): # GH#29794 enforcing deprecation introduced in GH#23539 - raise TypeError(f"dtype {data.dtype} cannot be converted to datetime64[ns]") + raise TypeError( + f"dtype {data.dtype} cannot be converted to datetime64[ns]") elif is_period_dtype(data.dtype): # Note: without explicitly raising here, PeriodIndex # test_setops.test_join_does_not_recur fails @@ -2311,7 +2315,8 @@ def maybe_convert_dtype(data, copy: bool, tz: tzinfo | None = None): data = data.categories.take(data.codes, fill_value=NaT)._values copy = False - elif is_extension_array_dtype(data.dtype) and not is_datetime64tz_dtype(data.dtype): + elif is_extension_array_dtype( + data.dtype) and not is_datetime64tz_dtype(data.dtype): # TODO: We have no tests for these data = np.array(data, dtype=np.object_) copy = False @@ -2323,7 +2328,8 @@ def maybe_convert_dtype(data, copy: bool, tz: tzinfo | None = None): # Validation and Inference -def _maybe_infer_tz(tz: tzinfo | None, inferred_tz: tzinfo | None) -> tzinfo | None: +def _maybe_infer_tz(tz: tzinfo | None, + inferred_tz: tzinfo | None) -> tzinfo | None: """ If a timezone is inferred from data, check that it is compatible with the user-provided timezone, if any. @@ -2346,10 +2352,8 @@ def _maybe_infer_tz(tz: tzinfo | None, inferred_tz: tzinfo | None) -> tzinfo | N elif inferred_tz is None: pass elif not timezones.tz_compare(tz, inferred_tz): - raise TypeError( - f"data is already tz-aware {inferred_tz}, unable to " - f"set specified tz: {tz}" - ) + raise TypeError(f"data is already tz-aware {inferred_tz}, unable to " + f"set specified tz: {tz}") return tz @@ -2381,17 +2385,14 @@ def _validate_dt64_dtype(dtype): # no precision, disallowed GH#24806 msg = ( "Passing in 'datetime64' dtype with no precision is not allowed. " - "Please pass in 'datetime64[ns]' instead." - ) + "Please pass in 'datetime64[ns]' instead.") raise ValueError(msg) - if (isinstance(dtype, np.dtype) and dtype != DT64NS_DTYPE) or not isinstance( - dtype, (np.dtype, DatetimeTZDtype) - ): - raise ValueError( - f"Unexpected value for 'dtype': '{dtype}'. " - "Must be 'datetime64[ns]' or DatetimeTZDtype'." - ) + if (isinstance(dtype, np.dtype) + and dtype != DT64NS_DTYPE) or not isinstance( + dtype, (np.dtype, DatetimeTZDtype)): + raise ValueError(f"Unexpected value for 'dtype': '{dtype}'. " + "Must be 'datetime64[ns]' or DatetimeTZDtype'.") return dtype @@ -2427,24 +2428,22 @@ def validate_tz_from_dtype(dtype, tz: tzinfo | None) -> tzinfo | None: dtz = getattr(dtype, "tz", None) if dtz is not None: if tz is not None and not timezones.tz_compare(tz, dtz): - raise ValueError("cannot supply both a tz and a dtype with a tz") + raise ValueError( + "cannot supply both a tz and a dtype with a tz") tz = dtz if tz is not None and is_datetime64_dtype(dtype): # We also need to check for the case where the user passed a # tz-naive dtype (i.e. 
datetime64[ns]) if tz is not None and not timezones.tz_compare(tz, dtz): - raise ValueError( - "cannot supply both a tz and a " - "timezone-naive dtype (i.e. datetime64[ns])" - ) + raise ValueError("cannot supply both a tz and a " + "timezone-naive dtype (i.e. datetime64[ns])") return tz -def _infer_tz_from_endpoints( - start: Timestamp, end: Timestamp, tz: tzinfo | None -) -> tzinfo | None: +def _infer_tz_from_endpoints(start: Timestamp, end: Timestamp, + tz: tzinfo | None) -> tzinfo | None: """ If a timezone is not explicitly given via `tz`, see if one can be inferred from the `start` and `end` endpoints. If more than one @@ -2477,7 +2476,8 @@ def _infer_tz_from_endpoints( if tz is not None and inferred_tz is not None: if not timezones.tz_compare(inferred_tz, tz): - raise AssertionError("Inferred time zone not equal to passed time zone") + raise AssertionError( + "Inferred time zone not equal to passed time zone") elif inferred_tz is not None: tz = inferred_tz @@ -2485,9 +2485,8 @@ def _infer_tz_from_endpoints( return tz -def _maybe_normalize_endpoints( - start: Timestamp | None, end: Timestamp | None, normalize: bool -): +def _maybe_normalize_endpoints(start: Timestamp | None, end: Timestamp | None, + normalize: bool): _normalized = True if start is not None: @@ -2507,7 +2506,8 @@ def _maybe_normalize_endpoints( return start, end, _normalized -def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent): +def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, + nonexistent): """ Localize a start or end Timestamp to the timezone of the corresponding start or end Timestamp @@ -2533,7 +2533,11 @@ def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexis # Note: We can't ambiguous='infer' a singular ambiguous time; however, # we have historically defaulted ambiguous=False ambiguous = ambiguous if ambiguous != "infer" else False - localize_args = {"ambiguous": ambiguous, "nonexistent": nonexistent, "tz": None} + localize_args = { + "ambiguous": ambiguous, + "nonexistent": nonexistent, + "tz": None + } if isinstance(freq, Tick) or freq is None: localize_args["tz"] = tz ts = ts.tz_localize(**localize_args) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index b781693530a59..27302532cdd57 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -104,10 +104,7 @@ "name": "", } - -_interval_shared_docs[ - "class" -] = """ +_interval_shared_docs["class"] = """ %(summary)s .. versionadded:: %(versionadded)s @@ -169,16 +166,21 @@ @Appender( - _interval_shared_docs["class"] - % { - "klass": "IntervalArray", - "summary": "Pandas array for interval data that are closed on the same side.", - "versionadded": "0.24.0", - "name": "", - "extra_attributes": "", - "extra_methods": "", - "examples": textwrap.dedent( - """\ + _interval_shared_docs["class"] % { + "klass": + "IntervalArray", + "summary": + "Pandas array for interval data that are closed on the same side.", + "versionadded": + "0.24.0", + "name": + "", + "extra_attributes": + "", + "extra_methods": + "", + "examples": + textwrap.dedent("""\ Examples -------- A new ``IntervalArray`` can be constructed directly from an array-like of @@ -192,10 +194,8 @@ It may also be constructed using one of the constructor methods: :meth:`IntervalArray.from_arrays`, :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`. 
- """ - ), - } -) + """), + }) class IntervalArray(IntervalMixin, ExtensionArray): ndim = 1 can_hold_na = True @@ -228,17 +228,14 @@ def __new__( # don't allow scalars if is_scalar(data): - msg = ( - f"{cls.__name__}(...) must be called with a collection " - f"of some kind, {data} was passed" - ) + msg = (f"{cls.__name__}(...) must be called with a collection " + f"of some kind, {data} was passed") raise TypeError(msg) # might need to convert empty or purely na data data = _maybe_convert_platform_interval(data) left, right, infer_closed = intervals_to_interval_bounds( - data, validate_closed=closed is None - ) + data, validate_closed=closed is None) if left.dtype == object: left = lib.maybe_convert_objects(left) right = lib.maybe_convert_objects(right) @@ -297,26 +294,21 @@ def _simple_new( left = left.astype(right.dtype) if type(left) != type(right): - msg = ( - f"must not have differing left [{type(left).__name__}] and " - f"right [{type(right).__name__}] types" - ) + msg = (f"must not have differing left [{type(left).__name__}] and " + f"right [{type(right).__name__}] types") raise ValueError(msg) elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 - msg = ( - "category, object, and string subtypes are not supported " - "for IntervalArray" - ) + msg = ("category, object, and string subtypes are not supported " + "for IntervalArray") raise TypeError(msg) elif isinstance(left, ABCPeriodIndex): msg = "Period dtypes are not supported, use a PeriodIndex instead" raise ValueError(msg) - elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): - msg = ( - "left and right must have the same time zone, got " - f"'{left.tz}' and '{right.tz}'" - ) + elif isinstance(left, + ABCDatetimeIndex) and str(left.tz) != str(right.tz): + msg = ("left and right must have the same time zone, got " + f"'{left.tz}' and '{right.tz}'") raise ValueError(msg) # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray @@ -351,9 +343,8 @@ def _from_sequence( return cls(scalars, dtype=dtype, copy=copy) @classmethod - def _from_factorized( - cls: type[IntervalArrayT], values: np.ndarray, original: IntervalArrayT - ) -> IntervalArrayT: + def _from_factorized(cls: type[IntervalArrayT], values: np.ndarray, + original: IntervalArrayT) -> IntervalArrayT: if len(values) == 0: # An empty array returns object-dtype here. We can't create # a new IA from an (empty) object-dtype array, so turn it into the @@ -361,8 +352,7 @@ def _from_factorized( values = values.astype(original.dtype.subtype) return cls(values, closed=original.closed) - _interval_shared_docs["from_breaks"] = textwrap.dedent( - """ + _interval_shared_docs["from_breaks"] = textwrap.dedent(""" Construct an %(klass)s from an array of splits. Parameters @@ -388,26 +378,23 @@ def _from_factorized( %(klass)s.from_tuples : Construct from a sequence of tuples. 
%(examples)s\ - """ - ) + """) @classmethod @Appender( - _interval_shared_docs["from_breaks"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + _interval_shared_docs["from_breaks"] % { + "klass": + "IntervalArray", + "examples": + textwrap.dedent("""\ Examples -------- >>> pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3]) [(0, 1], (1, 2], (2, 3]] Length: 3, dtype: interval[int64, right] - """ - ), - } - ) + """), + }) def from_breaks( cls: type[IntervalArrayT], breaks, @@ -417,10 +404,13 @@ def from_breaks( ) -> IntervalArrayT: breaks = _maybe_convert_platform_interval(breaks) - return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype) + return cls.from_arrays(breaks[:-1], + breaks[1:], + closed, + copy=copy, + dtype=dtype) - _interval_shared_docs["from_arrays"] = textwrap.dedent( - """ + _interval_shared_docs["from_arrays"] = textwrap.dedent(""" Construct from two arrays defining the left and right bounds. Parameters @@ -465,24 +455,21 @@ def from_breaks( 'category', 'object', and 'string' subtypes are not supported. %(examples)s\ - """ - ) + """) @classmethod @Appender( - _interval_shared_docs["from_arrays"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + _interval_shared_docs["from_arrays"] % { + "klass": + "IntervalArray", + "examples": + textwrap.dedent("""\ >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3]) [(0, 1], (1, 2], (2, 3]] Length: 3, dtype: interval[int64, right] - """ - ), - } - ) + """), + }) def from_arrays( cls: type[IntervalArrayT], left, @@ -494,12 +481,14 @@ def from_arrays( left = _maybe_convert_platform_interval(left) right = _maybe_convert_platform_interval(right) - return cls._simple_new( - left, right, closed, copy=copy, dtype=dtype, verify_integrity=True - ) + return cls._simple_new(left, + right, + closed, + copy=copy, + dtype=dtype, + verify_integrity=True) - _interval_shared_docs["from_tuples"] = textwrap.dedent( - """ + _interval_shared_docs["from_tuples"] = textwrap.dedent(""" Construct an %(klass)s from an array-like of tuples. Parameters @@ -527,26 +516,23 @@ def from_arrays( splits. %(examples)s\ - """ - ) + """) @classmethod @Appender( - _interval_shared_docs["from_tuples"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + _interval_shared_docs["from_tuples"] % { + "klass": + "IntervalArray", + "examples": + textwrap.dedent("""\ Examples -------- >>> pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) [(0, 1], (1, 2]] Length: 2, dtype: interval[int64, right] - """ - ), - } - ) + """), + }) def from_tuples( cls: type[IntervalArrayT], data, @@ -599,10 +585,8 @@ def _validate(self): left_mask = notna(self._left) right_mask = notna(self._right) if not (left_mask == right_mask).all(): - msg = ( - "missing values must be missing in the same " - "location both left and right sides" - ) + msg = ("missing values must be missing in the same " + "location both left and right sides") raise ValueError(msg) if not (self._left[left_mask] <= self._right[left_mask]).all(): msg = "left side of interval must be <= right side" @@ -619,7 +603,10 @@ def _shallow_copy(self: IntervalArrayT, left, right) -> IntervalArrayT: right : Index Values to be used for the right-side of the intervals. 
""" - return self._simple_new(left, right, closed=self.closed, verify_integrity=False) + return self._simple_new(left, + right, + closed=self.closed, + verify_integrity=False) # --------------------------------------------------------------------- # Descriptive @@ -651,12 +638,12 @@ def __getitem__(self, key: ScalarIndexer) -> IntervalOrNA: ... @overload - def __getitem__(self: IntervalArrayT, key: SequenceIndexer) -> IntervalArrayT: + def __getitem__(self: IntervalArrayT, + key: SequenceIndexer) -> IntervalArrayT: ... - def __getitem__( - self: IntervalArrayT, key: PositionalIndexer - ) -> IntervalArrayT | IntervalOrNA: + def __getitem__(self: IntervalArrayT, + key: PositionalIndexer) -> IntervalArrayT | IntervalOrNA: key = check_array_indexer(self, key) left = self._left[key] right = self._right[key] @@ -710,8 +697,9 @@ def _cmp_method(self, other, op): return invalid_comparison(self, other, op) other = other.categories.take( - other.codes, allow_fill=True, fill_value=other.categories._na_value - ) + other.codes, + allow_fill=True, + fill_value=other.categories._na_value) # interval-like -> need same closed and matching endpoints if is_interval_dtype(other_dtype): @@ -721,19 +709,19 @@ def _cmp_method(self, other, op): other = type(self)(other) if op is operator.eq: - return (self._left == other.left) & (self._right == other.right) + return (self._left == other.left) & (self._right + == other.right) elif op is operator.ne: - return (self._left != other.left) | (self._right != other.right) + return (self._left != other.left) | (self._right != + other.right) elif op is operator.gt: return (self._left > other.left) | ( - (self._left == other.left) & (self._right > other.right) - ) + (self._left == other.left) & (self._right > other.right)) elif op is operator.ge: return (self == other) | (self > other) elif op is operator.lt: return (self._left < other.left) | ( - (self._left == other.left) & (self._right < other.right) - ) + (self._left == other.left) & (self._right < other.right)) else: # operator.lt return (self == other) | (self < other) @@ -796,9 +784,10 @@ def argsort( return np.lexsort((self.right, self.left)) # TODO: other cases we can use lexsort for? much more performant. - return super().argsort( - ascending=ascending, kind=kind, na_position=na_position, **kwargs - ) + return super().argsort(ascending=ascending, + kind=kind, + na_position=na_position, + **kwargs) def min(self, *, axis: int | None = None, skipna: bool = True): nv.validate_minmax_axis(axis, self.ndim) @@ -834,9 +823,10 @@ def max(self, *, axis: int | None = None, skipna: bool = True): indexer = obj.argsort()[-1] return obj[indexer] - def fillna( - self: IntervalArrayT, value=None, method=None, limit=None - ) -> IntervalArrayT: + def fillna(self: IntervalArrayT, + value=None, + method=None, + limit=None) -> IntervalArrayT: """ Fill NA/NaN values using the specified method. 
@@ -864,7 +854,8 @@ def fillna( filled : IntervalArray with NA/NaN filled """ if method is not None: - raise TypeError("Filling by method is not supported for IntervalArray.") + raise TypeError( + "Filling by method is not supported for IntervalArray.") if limit is not None: raise TypeError("limit is not supported for IntervalArray.") @@ -907,7 +898,8 @@ def astype(self, dtype, copy: bool = True): # We need to use Index rules for astype to prevent casting # np.nan entries to int subtypes new_left = Index(self._left, copy=False).astype(dtype.subtype) - new_right = Index(self._right, copy=False).astype(dtype.subtype) + new_right = Index(self._right, + copy=False).astype(dtype.subtype) except IntCastingNaNError: # e.g test_subtype_integer raise @@ -930,16 +922,14 @@ def equals(self, other) -> bool: if type(self) != type(other): return False - return bool( - self.closed == other.closed - and self.left.equals(other.left) - and self.right.equals(other.right) - ) + return bool(self.closed == other.closed + and self.left.equals(other.left) + and self.right.equals(other.right)) @classmethod def _concat_same_type( - cls: type[IntervalArrayT], to_concat: Sequence[IntervalArrayT] - ) -> IntervalArrayT: + cls: type[IntervalArrayT], + to_concat: Sequence[IntervalArrayT]) -> IntervalArrayT: """ Concatenate multiple IntervalArray @@ -977,7 +967,9 @@ def copy(self: IntervalArrayT) -> IntervalArrayT: def isna(self) -> np.ndarray: return isna(self._left) - def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: + def shift(self, + periods: int = 1, + fill_value: object = None) -> IntervalArray: if not self or periods == 0: return self.copy() @@ -1002,7 +994,7 @@ def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: a = empty b = self[:-periods] else: - a = self[abs(periods) :] + a = self[abs(periods):] b = empty return self._concat_same_type([a, b]) @@ -1066,12 +1058,14 @@ def take( if allow_fill: fill_left, fill_right = self._validate_scalar(fill_value) - left_take = take( - self._left, indices, allow_fill=allow_fill, fill_value=fill_left - ) - right_take = take( - self._right, indices, allow_fill=allow_fill, fill_value=fill_right - ) + left_take = take(self._left, + indices, + allow_fill=allow_fill, + fill_value=fill_left) + right_take = take(self._right, + indices, + allow_fill=allow_fill, + fill_value=fill_right) return self._shallow_copy(left_take, right_take) @@ -1089,10 +1083,8 @@ def _validate_listlike(self, value): try: self.left._validate_fill_value(value_left) except (LossySetitemError, TypeError) as err: - msg = ( - "'value' should be a compatible interval type, " - f"got {type(value)} instead." - ) + msg = ("'value' should be a compatible interval type, " + f"got {type(value)} instead.") raise TypeError(msg) from err return value_left, value_right @@ -1120,7 +1112,8 @@ def _validate_setitem_value(self, value): # can't set NaN on a numpy integer array # GH#45484 TypeError, not ValueError, matches what we get with # non-NA un-holdable value. 
- raise TypeError("Cannot set float NaN to integer-backed IntervalArray") + raise TypeError( + "Cannot set float NaN to integer-backed IntervalArray") value_left, value_right = value, value elif isinstance(value, Interval): @@ -1163,7 +1156,8 @@ def _format_data(self) -> str: # TODO: integrate with categorical and make generic # name argument is unused here; just for compat with base / categorical n = len(self) - max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) + max_seq_items = min((get_option("display.max_seq_items") or n) // 10, + 10) formatter = str @@ -1248,8 +1242,7 @@ def mid(self): # datetime safe version return self.left + 0.5 * self.length - _interval_shared_docs["overlaps"] = textwrap.dedent( - """ + _interval_shared_docs["overlaps"] = textwrap.dedent(""" Check elementwise if an Interval overlaps the values in the %(klass)s. Two intervals overlap if they share a common point, including closed @@ -1285,25 +1278,22 @@ def mid(self): >>> intervals.overlaps(pd.Interval(1, 2, closed='right')) array([False, True, False]) - """ - ) + """) @Appender( - _interval_shared_docs["overlaps"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + _interval_shared_docs["overlaps"] % { + "klass": + "IntervalArray", + "examples": + textwrap.dedent("""\ >>> data = [(0, 1), (1, 3), (2, 4)] >>> intervals = pd.arrays.IntervalArray.from_tuples(data) >>> intervals [(0, 1], (1, 3], (2, 4]] Length: 3, dtype: interval[int64, right] - """ - ), - } - ) + """), + }) def overlaps(self, other): if isinstance(other, (IntervalArray, ABCIntervalIndex)): raise NotImplementedError @@ -1330,8 +1320,7 @@ def closed(self): """ return self.dtype.closed - _interval_shared_docs["set_closed"] = textwrap.dedent( - """ + _interval_shared_docs["set_closed"] = textwrap.dedent(""" Return an %(klass)s identical to the current one, but closed on the specified side. @@ -1346,15 +1335,14 @@ def closed(self): new_index : %(klass)s %(examples)s\ - """ - ) + """) @Appender( - _interval_shared_docs["set_closed"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + _interval_shared_docs["set_closed"] % { + "klass": + "IntervalArray", + "examples": + textwrap.dedent("""\ Examples -------- >>> index = pd.arrays.IntervalArray.from_breaks(range(4)) @@ -1366,22 +1354,20 @@ def closed(self): [[0, 1], [1, 2], [2, 3]] Length: 3, dtype: interval[int64, both] - """ - ), - } - ) - def set_closed(self: IntervalArrayT, closed: IntervalClosedType) -> IntervalArrayT: + """), + }) + def set_closed(self: IntervalArrayT, + closed: IntervalClosedType) -> IntervalArrayT: if closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {closed}" raise ValueError(msg) - return type(self)._simple_new( - left=self._left, right=self._right, closed=closed, verify_integrity=False - ) + return type(self)._simple_new(left=self._left, + right=self._right, + closed=closed, + verify_integrity=False) - _interval_shared_docs[ - "is_non_overlapping_monotonic" - ] = """ + _interval_shared_docs["is_non_overlapping_monotonic"] = """ Return True if the %(klass)s is non-overlapping (no Intervals share points) and is either monotonic increasing or monotonic decreasing, else False. 
@@ -1390,9 +1376,8 @@ def set_closed(self: IntervalArrayT, closed: IntervalClosedType) -> IntervalArra # https://github.com/python/mypy/issues/1362 # Mypy does not support decorated properties @property # type: ignore[misc] - @Appender( - _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs - ) + @Appender(_interval_shared_docs["is_non_overlapping_monotonic"] % + _shared_docs_kwargs) def is_non_overlapping_monotonic(self) -> bool: # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) @@ -1401,17 +1386,13 @@ def is_non_overlapping_monotonic(self) -> bool: # strict inequality for closed == 'both'; equality implies overlapping # at a point when both sides of intervals are included if self.closed == "both": - return bool( - (self._right[:-1] < self._left[1:]).all() - or (self._left[:-1] > self._right[1:]).all() - ) + return bool((self._right[:-1] < self._left[1:]).all() + or (self._left[:-1] > self._right[1:]).all()) # non-strict inequality when closed != 'both'; at least one side is # not included in the intervals, so equality does not imply overlapping - return bool( - (self._right[:-1] <= self._left[1:]).all() - or (self._left[:-1] >= self._right[1:]).all() - ) + return bool((self._right[:-1] <= self._left[1:]).all() + or (self._left[:-1] >= self._right[1:]).all()) # --------------------------------------------------------------------- # Conversion @@ -1447,8 +1428,7 @@ def __arrow_array__(self, type=None): except TypeError as err: raise TypeError( f"Conversion to arrow with subtype '{self.dtype.subtype}' " - "is not supported" - ) from err + "is not supported") from err interval_type = ArrowIntervalType(subtype, self.closed) storage_array = pyarrow.StructArray.from_arrays( [ @@ -1465,7 +1445,8 @@ def __arrow_array__(self, type=None): storage_array.type, len(storage_array), [null_bitmap], - children=[storage_array.field(0), storage_array.field(1)], + children=[storage_array.field(0), + storage_array.field(1)], ) if type is not None: @@ -1481,14 +1462,12 @@ def __arrow_array__(self, type=None): ) else: raise TypeError( - f"Not supported to convert IntervalArray to '{type}' type" - ) + f"Not supported to convert IntervalArray to '{type}' type") - return pyarrow.ExtensionArray.from_storage(interval_type, storage_array) + return pyarrow.ExtensionArray.from_storage(interval_type, + storage_array) - _interval_shared_docs[ - "to_tuples" - ] = """ + _interval_shared_docs["to_tuples"] = """ Return an %(return_type)s of tuples of the form (left, right). Parameters @@ -1503,9 +1482,10 @@ def __arrow_array__(self, type=None): %(examples)s\ """ - @Appender( - _interval_shared_docs["to_tuples"] % {"return_type": "ndarray", "examples": ""} - ) + @Appender(_interval_shared_docs["to_tuples"] % { + "return_type": "ndarray", + "examples": "" + }) def to_tuples(self, na_tuple=True) -> np.ndarray: tuples = com.asarray_tuplesafe(zip(self._left, self._right)) if not na_tuple: @@ -1525,7 +1505,8 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: self._left._putmask(mask, value_left) self._right._putmask(mask, value_right) - def insert(self: IntervalArrayT, loc: int, item: Interval) -> IntervalArrayT: + def insert(self: IntervalArrayT, loc: int, + item: Interval) -> IntervalArrayT: """ Return a new IntervalArray inserting new item at location. Follows Python numpy.insert semantics for negative values. 
Only Interval @@ -1567,8 +1548,7 @@ def repeat( right_repeat = self.right.repeat(repeats) return self._shallow_copy(left=left_repeat, right=right_repeat) - _interval_shared_docs["contains"] = textwrap.dedent( - """ + _interval_shared_docs["contains"] = textwrap.dedent(""" Check elementwise if the Intervals contain the value. Return a boolean mask whether the value is contained in the Intervals @@ -1596,31 +1576,29 @@ def repeat( %(examples)s >>> intervals.contains(0.5) array([ True, False, False]) - """ - ) + """) @Appender( - _interval_shared_docs["contains"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + _interval_shared_docs["contains"] % { + "klass": + "IntervalArray", + "examples": + textwrap.dedent("""\ >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)]) >>> intervals [(0, 1], (1, 3], (2, 4]] Length: 3, dtype: interval[int64, right] - """ - ), - } - ) + """), + }) def contains(self, other): if isinstance(other, Interval): - raise NotImplementedError("contains not implemented for two intervals") + raise NotImplementedError( + "contains not implemented for two intervals") - return (self._left < other if self.open_left else self._left <= other) & ( - other < self._right if self.open_right else other <= self._right - ) + return (self._left < other if self.open_left else + self._left <= other) & (other < self._right if self.open_right + else other <= self._right) def isin(self, values) -> np.ndarray: if not hasattr(values, "dtype"): @@ -1646,8 +1624,7 @@ def isin(self, values) -> np.ndarray: return np.in1d(left, right) # type: ignore[arg-type] elif needs_i8_conversion(self.left.dtype) ^ needs_i8_conversion( - values.left.dtype - ): + values.left.dtype): # not comparable -> no overlap return np.zeros(self.shape, dtype=bool) @@ -1672,13 +1649,13 @@ def _from_combined(self, combined: np.ndarray) -> IntervalArray: dtype = self._left.dtype if needs_i8_conversion(dtype): # error: "Type[ndarray[Any, Any]]" has no attribute "_from_sequence" - new_left = type(self._left)._from_sequence( # type: ignore[attr-defined] - nc[:, 0], dtype=dtype - ) + new_left = type( + self._left)._from_sequence( # type: ignore[attr-defined] + nc[:, 0], dtype=dtype) # error: "Type[ndarray[Any, Any]]" has no attribute "_from_sequence" - new_right = type(self._right)._from_sequence( # type: ignore[attr-defined] - nc[:, 1], dtype=dtype - ) + new_right = type( + self._right)._from_sequence( # type: ignore[attr-defined] + nc[:, 1], dtype=dtype) else: new_left = nc[:, 0].view(dtype) new_right = nc[:, 1].view(dtype) @@ -1688,7 +1665,8 @@ def unique(self) -> IntervalArray: # No overload variant of "__getitem__" of "ExtensionArray" matches argument # type "Tuple[slice, int]" nc = unique( - self._combined.view("complex128")[:, 0] # type: ignore[call-overload] + self._combined.view("complex128")[:, + 0] # type: ignore[call-overload] ) nc = nc[:, None] return self._from_combined(nc) @@ -1720,7 +1698,8 @@ def _maybe_convert_platform_interval(values) -> ArrayLike: return values elif is_categorical_dtype(values): values = np.asarray(values) - elif not hasattr(values, "dtype") and not isinstance(values, (list, tuple, range)): + elif not hasattr(values, "dtype") and not isinstance( + values, (list, tuple, range)): # TODO: should we just cast these to list? 
return values else: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c54c1e686c6d1..76e182463e708 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -117,7 +117,6 @@ class ellipsis(Enum): else: ellipsis = type(Ellipsis) - # ---------------------------------------------------------------------------- # Array @@ -150,9 +149,8 @@ def _get_fill(arr: SparseArray) -> np.ndarray: return np.asarray(arr.fill_value) -def _sparse_array_op( - left: SparseArray, right: SparseArray, op: Callable, name: str -) -> SparseArray: +def _sparse_array_op(left: SparseArray, right: SparseArray, op: Callable, + name: str) -> SparseArray: """ Perform a binary operation between two arrays. @@ -221,11 +219,8 @@ def _sparse_array_op( left_sp_values = left.sp_values right_sp_values = right.sp_values - if ( - name in ["floordiv", "mod"] - and (right == 0).any() - and left.dtype.kind in ["i", "u"] - ): + if (name in ["floordiv", "mod"] and (right == 0).any() + and left.dtype.kind in ["i", "u"]): # Match the non-Sparse Series behavior opname = f"sparse_{name}_float64" left_sp_values = left_sp_values.astype("float64") @@ -258,9 +253,11 @@ def _sparse_array_op( return _wrap_result(name, result, index, fill, dtype=result_dtype) -def _wrap_result( - name: str, data, sparse_index, fill_value, dtype: Dtype | None = None -) -> SparseArray: +def _wrap_result(name: str, + data, + sparse_index, + fill_value, + dtype: Dtype | None = None) -> SparseArray: """ wrap op result to have correct dtype """ @@ -276,9 +273,10 @@ def _wrap_result( if is_bool_dtype(dtype): # fill_value may be np.bool_ fill_value = bool(fill_value) - return SparseArray( - data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype - ) + return SparseArray(data, + sparse_index=sparse_index, + fill_value=fill_value, + dtype=dtype) class SparseArray(OpsMixin, PandasObject, ExtensionArray): @@ -427,7 +425,9 @@ def __init__( else: npoints = sparse_index.length - data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None) + data = construct_1d_arraylike_from_scalar(data, + npoints, + dtype=None) dtype = data.dtype if dtype is not None: @@ -477,7 +477,8 @@ def __init__( # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, # Any]]]" sparse_values = np.asarray( - data.sp_values, dtype=dtype # type: ignore[arg-type] + data.sp_values, + dtype=dtype # type: ignore[arg-type] ) elif sparse_index is None: data = extract_array(data, extract_numpy=True) @@ -510,12 +511,12 @@ def __init__( # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int], # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, # Any]]]" - sparse_values = np.asarray(data, dtype=dtype) # type: ignore[arg-type] + sparse_values = np.asarray(data, + dtype=dtype) # type: ignore[arg-type] if len(sparse_values) != sparse_index.npoints: raise AssertionError( f"Non array-like type {type(sparse_values)} must " - "have the same length as the index" - ) + "have the same length as the index") self._sparse_index = sparse_index self._sparse_values = sparse_values self._dtype = SparseDtype(sparse_values.dtype, fill_value) @@ -612,7 +613,11 @@ def __setitem__(self, key, value): raise TypeError(msg) @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence(cls, + scalars, + *, + dtype: Dtype | None = None, + copy: bool = False): return cls(scalars, dtype=dtype) @classmethod @@ -723,7 +728,8 @@ def 
isna(self): # to preserve the same memory usage. dtype = SparseDtype(bool, self._null_fill_value) if self._null_fill_value: - return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype) + return type(self)._simple_new(isna(self.sp_values), self.sp_index, + dtype) mask = np.full(len(self), False, dtype=np.bool8) mask[self.sp_index.indices] = isna(self.sp_values) return type(self)(mask, fill_value=False, dtype=dtype) @@ -766,9 +772,8 @@ def fillna( When ``self.fill_value`` is not NA, the result dtype will be ``self.dtype``. Again, this preserves the amount of memory used. """ - if (method is None and value is None) or ( - method is not None and value is not None - ): + if (method is None and value is None) or (method is not None + and value is not None): raise ValueError("Must specify one of 'method' or 'value'.") elif method is not None: @@ -791,7 +796,9 @@ def fillna( return self._simple_new(new_values, self._sparse_index, new_dtype) - def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT: + def shift(self: SparseArrayT, + periods: int = 1, + fill_value=None) -> SparseArrayT: if not self or periods == 0: return self.copy() @@ -807,15 +814,15 @@ def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT else: arr = self - empty = self._from_sequence( - [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype - ) + empty = self._from_sequence([fill_value] * + min(abs(periods), len(self)), + dtype=arr.dtype) if periods > 0: a = empty b = arr[:-periods] else: - a = arr[abs(periods) :] + a = arr[abs(periods):] b = empty return arr._concat_same_type([a, b]) @@ -848,13 +855,15 @@ def _values_for_factorize(self): # Still override this for hash_pandas_object return np.asarray(self), self.fill_value - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]: + def factorize(self, + na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]: # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? 
- codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) + codes, uniques = algos.factorize(np.asarray(self), + na_sentinel=na_sentinel) uniques_sp = SparseArray(uniques, dtype=self.dtype) return codes, uniques_sp @@ -876,10 +885,12 @@ def value_counts(self, dropna: bool = True) -> Series: Series, ) - keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) + keys, counts = algos.value_counts_arraylike(self.sp_values, + dropna=dropna) fcounts = self.sp_index.ngaps if fcounts > 0 and (not self._null_fill_value or not dropna): - mask = isna(keys) if self._null_fill_value else keys == self.fill_value + mask = isna( + keys) if self._null_fill_value else keys == self.fill_value if mask.any(): counts[mask] += fcounts else: @@ -949,7 +960,8 @@ def __getitem__( end += len(self) indices = self.sp_index.indices - keep_inds = np.flatnonzero((indices >= start) & (indices < end)) + keep_inds = np.flatnonzero((indices >= start) + & (indices < end)) sp_vals = self.sp_values[keep_inds] sp_index = indices[keep_inds].copy() @@ -965,7 +977,8 @@ def __getitem__( # of the length of our original array new_len = len(range(len(self))[key]) new_sp_index = make_sparse_index(new_len, sp_index, self.kind) - return type(self)._simple_new(sp_vals, new_sp_index, self.dtype) + return type(self)._simple_new(sp_vals, new_sp_index, + self.dtype) else: indices = np.arange(len(self), dtype=np.int32)[key] return self.take(indices) @@ -975,8 +988,7 @@ def __getitem__( # exception message copied from numpy raise IndexError( r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " - r"(`None`) and integer or boolean arrays are valid indices" - ) + r"(`None`) and integer or boolean arrays are valid indices") else: if isinstance(key, SparseArray): @@ -1020,11 +1032,14 @@ def _get_val_at(self, loc): val = maybe_box_datetimelike(val, self.sp_values.dtype) return val - def take( - self: SparseArrayT, indices, *, allow_fill: bool = False, fill_value=None - ) -> SparseArrayT: + def take(self: SparseArrayT, + indices, + *, + allow_fill: bool = False, + fill_value=None) -> SparseArrayT: if is_scalar(indices): - raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.") + raise ValueError( + f"'indices' must be an array, not a scalar '{indices}'.") indices = np.asarray(indices, dtype=np.int32) dtype = None @@ -1036,19 +1051,18 @@ def take( else: return self._take_without_fill(indices) - return type(self)( - result, fill_value=self.fill_value, kind=self.kind, dtype=dtype - ) + return type(self)(result, + fill_value=self.fill_value, + kind=self.kind, + dtype=dtype) def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: if fill_value is None: fill_value = self.dtype.na_value if indices.min() < -1: - raise ValueError( - "Invalid value in 'indices'. Must be between -1 " - "and the length of the array." - ) + raise ValueError("Invalid value in 'indices'. Must be between -1 " + "and the length of the array.") if indices.max() >= len(self): raise IndexError("out of bounds value in 'indices'.") @@ -1061,7 +1075,8 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: taken.fill(fill_value) return taken else: - raise IndexError("cannot do a non-empty take from an empty axes.") + raise IndexError( + "cannot do a non-empty take from an empty axes.") # sp_indexer may be -1 for two reasons # 1.) 
we took for an index of -1 (new) @@ -1072,14 +1087,16 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: if self.sp_index.npoints == 0 and old_fill_indices.all(): # We've looked up all valid points on an all-sparse array. - taken = np.full( - sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype - ) + taken = np.full(sp_indexer.shape, + fill_value=self.fill_value, + dtype=self.dtype.subtype) elif self.sp_index.npoints == 0: # Avoid taking from the empty self.sp_values _dtype = np.result_type(self.dtype.subtype, type(fill_value)) - taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype) + taken = np.full(sp_indexer.shape, + fill_value=fill_value, + dtype=_dtype) else: taken = self.sp_values.take(sp_indexer) @@ -1094,7 +1111,8 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: result_type = taken.dtype if m0.any(): - result_type = np.result_type(result_type, type(self.fill_value)) + result_type = np.result_type(result_type, + type(self.fill_value)) taken = taken.astype(result_type) taken[old_fill_indices] = self.fill_value @@ -1112,7 +1130,8 @@ def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT: if (indices.max() >= n) or (indices.min() < -n): if n == 0: - raise IndexError("cannot do a non-empty take from an empty axes.") + raise IndexError( + "cannot do a non-empty take from an empty axes.") else: raise IndexError("out of bounds value in 'indices'.") @@ -1126,8 +1145,12 @@ def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT: value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False) - new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind) - return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype) + new_sp_index = make_sparse_index(len(indices), + value_indices, + kind=self.kind) + return type(self)._simple_new(new_sp_values, + new_sp_index, + dtype=self.dtype) def searchsorted( self, @@ -1141,16 +1164,16 @@ def searchsorted( if not is_scalar(v): v = np.asarray(v) v = np.asarray(v) - return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter) + return np.asarray(self, dtype=self.dtype.subtype).searchsorted( + v, side, sorter) def copy(self: SparseArrayT) -> SparseArrayT: values = self.sp_values.copy() return self._simple_new(values, self.sp_index, self.dtype) @classmethod - def _concat_same_type( - cls: type[SparseArrayT], to_concat: Sequence[SparseArrayT] - ) -> SparseArrayT: + def _concat_same_type(cls: type[SparseArrayT], + to_concat: Sequence[SparseArrayT]) -> SparseArrayT: fill_value = to_concat[0].fill_value values = [] @@ -1286,7 +1309,9 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True): # error: Argument 1 to "_simple_new" of "SparseArray" has incompatible type # "ExtensionArray"; expected "ndarray" return self._simple_new( - sp_values, self.sp_index, dtype # type: ignore[arg-type] + sp_values, + self.sp_index, + dtype # type: ignore[arg-type] ) def map(self: SparseArrayT, mapper) -> SparseArrayT: @@ -1339,7 +1364,9 @@ def map(self: SparseArrayT, mapper) -> SparseArrayT: fill_value = mapper(self.fill_value) sp_values = [mapper(x) for x in self.sp_values] - return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value) + return type(self)(sp_values, + sparse_index=self.sp_index, + fill_value=fill_value) def to_dense(self) -> np.ndarray: """ @@ -1357,7 +1384,8 @@ def _where(self, mask, value): # NB: may not preserve dtype, e.g. 
result may be Sparse[float64] # while self is Sparse[int64] naive_implementation = np.where(mask, self, value) - dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value) + dtype = SparseDtype(naive_implementation.dtype, + fill_value=self.fill_value) result = type(self)._from_sequence(naive_implementation, dtype=dtype) return result @@ -1381,9 +1409,9 @@ def __setstate__(self, state): def nonzero(self): if self.fill_value == 0: - return (self.sp_index.indices,) + return (self.sp_index.indices, ) else: - return (self.sp_index.indices[self.sp_values != 0],) + return (self.sp_index.indices[self.sp_values != 0], ) # ------------------------------------------------------------------------ # Reductions @@ -1444,9 +1472,12 @@ def any(self, axis=0, *args, **kwargs): return values.any().item() - def sum( - self, axis: int = 0, min_count: int = 0, skipna: bool = True, *args, **kwargs - ) -> Scalar: + def sum(self, + axis: int = 0, + min_count: int = 0, + skipna: bool = True, + *args, + **kwargs) -> Scalar: """ Sum of non-NA/null values @@ -1479,7 +1510,8 @@ def sum( return sp_sum else: nsparse = self.sp_index.ngaps - if check_below_min_count(valid_vals.shape, None, min_count - nsparse): + if check_below_min_count(valid_vals.shape, None, + min_count - nsparse): return na_value_for_dtype(self.dtype.subtype, compat=False) return sp_sum + self.fill_value * nsparse @@ -1615,27 +1647,24 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): out = kwargs.get("out", ()) for x in inputs + out: - if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)): + if not isinstance(x, self._HANDLED_TYPES + (SparseArray, )): return NotImplemented # for binary ops, use our custom dunder methods - result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) + result = ops.maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, + *inputs, **kwargs) if result is not NotImplemented: return result if "out" in kwargs: # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace - res = arraylike.dispatch_ufunc_with_out( - self, ufunc, method, *inputs, **kwargs - ) + res = arraylike.dispatch_ufunc_with_out(self, ufunc, method, + *inputs, **kwargs) return res if method == "reduce": result = arraylike.dispatch_reduction_ufunc( - self, ufunc, method, *inputs, **kwargs - ) + self, ufunc, method, *inputs, **kwargs) if result is not NotImplemented: # e.g. tests.series.test_ufunc.TestNumpyReductions return result @@ -1648,19 +1677,16 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): if ufunc.nout > 1: # multiple outputs. e.g. modf arrays = tuple( - self._simple_new( - sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv) - ) - for sp_value, fv in zip(sp_values, fill_value) - ) + self._simple_new(sp_value, self.sp_index, + SparseDtype(sp_value.dtype, fv)) + for sp_value, fv in zip(sp_values, fill_value)) return arrays elif method == "reduce": # e.g. reductions return sp_values - return self._simple_new( - sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value) - ) + return self._simple_new(sp_values, self.sp_index, + SparseDtype(sp_values.dtype, fill_value)) new_inputs = tuple(np.asarray(x) for x in inputs) result = getattr(ufunc, method)(*new_inputs, **kwargs) @@ -1707,11 +1733,12 @@ def _arith_method(self, other, op): with np.errstate(all="ignore"): if len(self) != len(other): raise AssertionError( - f"length mismatch: {len(self)} vs. {len(other)}" - ) + f"length mismatch: {len(self)} vs. 
{len(other)}") if not isinstance(other, SparseArray): dtype = getattr(other, "dtype", None) - other = SparseArray(other, fill_value=self.fill_value, dtype=dtype) + other = SparseArray(other, + fill_value=self.fill_value, + dtype=dtype) return _sparse_array_op(self, other, op, op_name) def _cmp_method(self, other, op) -> SparseArray: @@ -1841,23 +1868,27 @@ def make_sparse( # error: Argument "dtype" to "astype_nansafe" has incompatible type "Union[str, # dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]" sparsified_values = astype_nansafe( - sparsified_values, dtype=dtype # type: ignore[arg-type] + sparsified_values, + dtype=dtype # type: ignore[arg-type] ) # TODO: copy return sparsified_values, index, fill_value @overload -def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex: +def make_sparse_index(length: int, indices, + kind: Literal["block"]) -> BlockIndex: ... @overload -def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex: +def make_sparse_index(length: int, indices, + kind: Literal["integer"]) -> IntIndex: ... -def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex: +def make_sparse_index(length: int, indices, + kind: SparseIndexKind) -> SparseIndex: index: SparseIndex if kind == "block": locs, lens = splib.get_blocks(indices) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 46d83e2a3aa76..1dede596eafc9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -77,7 +77,6 @@ "ge": pc.greater_equal, } - if TYPE_CHECKING: from pandas import Series @@ -95,9 +94,8 @@ def _chk_pyarrow_available() -> None: # fallback for the ones that pyarrow doesn't yet support -class ArrowStringArray( - OpsMixin, ArrowExtensionArray, BaseStringArray, ObjectStringArrayMixin -): +class ArrowStringArray(OpsMixin, ArrowExtensionArray, BaseStringArray, + ObjectStringArrayMixin): """ Extension array for string data in a ``pyarrow.ChunkedArray``. 
@@ -148,7 +146,8 @@ def __init__(self, values): elif isinstance(values, pa.ChunkedArray): self._data = values else: - raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + raise ValueError( + f"Unsupported type '{type(values)}' for ArrowStringArray") if not pa.types.is_string(self._data.type): raise ValueError( @@ -156,21 +155,27 @@ def __init__(self, values): ) @classmethod - def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence(cls, + scalars, + dtype: Dtype | None = None, + copy: bool = False): from pandas.core.arrays.masked import BaseMaskedArray _chk_pyarrow_available() if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + assert isinstance(dtype, + StringDtype) and dtype.storage == "pyarrow" if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and # numerical issues with Float32Dtype na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result = lib.ensure_string_array(result, + copy=copy, + convert_na_value=False) return cls(pa.array(result, mask=na_values, type=pa.string())) # convert non-na-likes to str @@ -178,9 +183,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod - def _from_sequence_of_strings( - cls, strings, dtype: Dtype | None = None, copy: bool = False - ): + def _from_sequence_of_strings(cls, + strings, + dtype: Dtype | None = None, + copy: bool = False): return cls._from_sequence(strings, dtype=dtype, copy=copy) @property @@ -216,11 +222,11 @@ def to_numpy( return result @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize(self, + na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: encoded = self._data.dictionary_encode() - indices = pa.chunked_array( - [c.indices for c in encoded.chunks], type=encoded.type.index_type - ).to_pandas() + indices = pa.chunked_array([c.indices for c in encoded.chunks], + type=encoded.type.index_type).to_pandas() if indices.dtype.kind == "f": indices[np.isnan(indices)] = na_sentinel indices = indices.astype(np.int64, copy=False) @@ -237,11 +243,12 @@ def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT: ... @overload - def __getitem__(self: ArrowStringArray, item: SequenceIndexer) -> ArrowStringArray: + def __getitem__(self: ArrowStringArray, + item: SequenceIndexer) -> ArrowStringArray: ... def __getitem__( - self: ArrowStringArray, item: PositionalIndexer + self: ArrowStringArray, item: PositionalIndexer ) -> ArrowStringArray | ArrowStringScalarOrNAT: """Select a subset of self. @@ -276,10 +283,8 @@ def __getitem__( elif is_bool_dtype(item.dtype): return type(self)(self._data.filter(item)) else: - raise IndexError( - "Only integers, slices and integer or " - "boolean arrays are valid indices." 
- ) + raise IndexError("Only integers, slices and integer or " + "boolean arrays are valid indices.") elif isinstance(item, tuple): item = unpack_tuple_and_ellipses(item) @@ -295,8 +300,7 @@ def __getitem__( # exception message copied from numpy raise IndexError( r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " - r"(`None`) and integer or boolean arrays are valid indices" - ) + r"(`None`) and integer or boolean arrays are valid indices") # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. value = self._data[item] @@ -377,7 +381,7 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: new_data = [ *self._data[0:key].chunks, pa.array([value], type=pa.string()), - *self._data[(key + 1) :].chunks, + *self._data[(key + 1):].chunks, ] self._data = pa.chunked_array(new_data) else: @@ -509,8 +513,8 @@ def isin(self, values): return super().isin(values) value_set = [ - pa_scalar.as_py() - for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] + pa_scalar.as_py() for pa_scalar in + [pa.scalar(value, from_pandas=True) for value in values] if pa_scalar.type in (pa.string(), pa.null()) ] @@ -588,9 +592,11 @@ def astype(self, dtype, copy: bool = True): # error: Cannot determine type of 'na_value' _str_na_value = StringDtype.na_value # type: ignore[has-type] - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): + def _str_map(self, + f, + na_value=None, + dtype: Dtype | None = None, + convert: bool = True): # TODO: de-duplicate with StringArray method. This method is moreless copy and # paste. @@ -636,10 +642,15 @@ def _str_map( elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + result = lib.map_infer_mask(arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value) + result = pa.array(result, + mask=mask, + type=pa.string(), + from_pandas=True) return type(self)(result) else: # This is when the result type is object. We reach this when @@ -648,7 +659,12 @@ def _str_map( # -> We don't know the result type. E.g. `.get` can return anything. 
return lib.map_infer_mask(arr, f, mask.view("uint8")) - def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): + def _str_contains(self, + pat, + case=True, + flags=0, + na=np.nan, + regex: bool = True): if flags: return super()._str_contains(pat, case, flags, na, regex) @@ -661,7 +677,8 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): if case: result = pc.match_substring(self._data, pat) else: - result = pc.match_substring(pc.utf8_upper(self._data), pat.upper()) + result = pc.match_substring(pc.utf8_upper(self._data), + pat.upper()) result = BooleanDtype().__from_arrow__(result) if not isna(na): result[isna(result)] = bool(na) @@ -690,22 +707,22 @@ def _str_replace( flags: int = 0, regex: bool = True, ): - if ( - pa_version_under4p0 - or isinstance(pat, re.Pattern) - or callable(repl) - or not case - or flags - ): + if (pa_version_under4p0 or isinstance(pat, re.Pattern) + or callable(repl) or not case or flags): return super()._str_replace(pat, repl, n, case, flags, regex) func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._data, pattern=pat, replacement=repl, max_replacements=n) + result = func(self._data, + pattern=pat, + replacement=repl, + max_replacements=n) return type(self)(result) - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): + def _str_match(self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | None = None): if pa_version_under4p0: return super()._str_match(pat, case, flags, na) @@ -713,9 +730,11 @@ def _str_match( pat = "^" + pat return self._str_contains(pat, case, flags, na, regex=True) - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): + def _str_fullmatch(self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | None = None): if pa_version_under4p0: return super()._str_fullmatch(pat, case, flags, na) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 73c3f63b39d06..7a7949c520048 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -224,16 +224,22 @@ # Docstring templates _shared_doc_kwargs = { - "axes": "index, columns", - "klass": "DataFrame", - "axes_single_arg": "{0 or 'index', 1 or 'columns'}", - "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 + "axes": + "index, columns", + "klass": + "DataFrame", + "axes_single_arg": + "{0 or 'index', 1 or 'columns'}", + "axis": + """axis : {0 or 'index', 1 or 'columns'}, default 0 If 0 or 'index': apply function to each column. If 1 or 'columns': apply function to each row.""", - "inplace": """ + "inplace": + """ inplace : bool, default False If True, performs operation inplace and returns None.""", - "optional_by": """ + "optional_by": + """ by : str or list of str Name or list of names to sort by. @@ -241,12 +247,15 @@ levels and/or column labels. - if `axis` is 1 or `'columns'` then `by` may contain column levels and/or index labels.""", - "optional_labels": """labels : array-like, optional + "optional_labels": + """labels : array-like, optional New labels / index to conform the axis specified by 'axis' to.""", - "optional_axis": """axis : int or str, optional + "optional_axis": + """axis : int or str, optional Axis to target. 
Can be either the axis name ('index', 'columns') or number (0, 1).""", - "replace_iloc": """ + "replace_iloc": + """ This differs from updating with ``.loc`` or ``.iloc``, which require you to specify a location to update with some value.""", } @@ -453,7 +462,6 @@ 3 bar 8 """ - # ----------------------------------------------------------------------- # DataFrame class @@ -619,11 +627,9 @@ def __init__( if isinstance(data, dict): # retain pre-GH#38939 default behavior copy = True - elif ( - manager == "array" - and isinstance(data, (np.ndarray, ExtensionArray)) - and data.ndim == 2 - ): + elif (manager == "array" + and isinstance(data, (np.ndarray, ExtensionArray)) + and data.ndim == 2): # INFO(ArrayManager) by default copy the 2D input array to get # contiguous 1D arrays copy = True @@ -631,13 +637,22 @@ def __init__( copy = False if isinstance(data, (BlockManager, ArrayManager)): - mgr = self._init_mgr( - data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy - ) + mgr = self._init_mgr(data, + axes={ + "index": index, + "columns": columns + }, + dtype=dtype, + copy=copy) elif isinstance(data, dict): # GH#38939 de facto copy defaults to False only in non-dict cases - mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) + mgr = dict_to_mgr(data, + index, + columns, + dtype=dtype, + copy=copy, + typ=manager) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords @@ -721,7 +736,8 @@ def __init__( # error: Argument 1 to "ensure_index" has incompatible type # "Collection[Any]"; expected "Union[Union[Union[ExtensionArray, # ndarray], Index, Series], Sequence[Any]]" - columns = ensure_index(columns) # type: ignore[arg-type] + columns = ensure_index( + columns) # type: ignore[arg-type] arrays, columns, index = nested_data_to_arrays( # error: Argument 3 to "nested_data_to_arrays" has incompatible # type "Optional[Collection[Any]]"; expected "Optional[Index]" @@ -779,7 +795,11 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) + mgr = arrays_to_mgr(values, + columns, + index, + dtype=None, + typ=manager) else: arr2d = construct_2d_arraylike_from_scalar( data, @@ -901,8 +921,7 @@ def _can_fast_transpose(self) -> bool: # "_values" incompatible with return type "ndarray" in supertype "NDFrame" @property def _values( # type: ignore[override] - self, - ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: + self, ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: """ Analogue to ._values that may return a 2D ExtensionArray. 
""" @@ -911,7 +930,8 @@ def _values( # type: ignore[override] mgr = self._mgr if isinstance(mgr, ArrayManager): - if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype): + if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype( + mgr.arrays[0].dtype): # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]" # has no attribute "reshape" return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr] @@ -927,7 +947,8 @@ def _values( # type: ignore[override] return self.values # more generally, whatever we allow in NDArrayBackedExtensionBlock - arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr) + arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", + arr) return arr.T # ---------------------------------------------------------------------- @@ -957,8 +978,7 @@ def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: # exceed max columns if (max_columns and nb_columns > max_columns) or ( - (not ignore_width) and width and nb_columns > (width // 2) - ): + (not ignore_width) and width and nb_columns > (width // 2)): return False # used by repr_html under IPython notebook or scripts ignore terminal @@ -966,7 +986,8 @@ def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: if ignore_width or not console.in_interactive_session(): return True - if get_option("display.width") is not None or console.in_ipython_frontend(): + if get_option( + "display.width") is not None or console.in_ipython_frontend(): # check at least the column row for excessive width max_rows = 1 else: @@ -983,7 +1004,7 @@ def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: if max_rows is not None: # unlimited rows # min of two, where one may be None - d = d.iloc[: min(max_rows, len(d))] + d = d.iloc[:min(max_rows, len(d))] else: return True @@ -998,9 +1019,8 @@ def _info_repr(self) -> bool: True if the repr should show the info view. """ info_repr_option = get_option("display.large_repr") == "info" - return info_repr_option and not ( - self._repr_fits_horizontal_() and self._repr_fits_vertical_() - ) + return info_repr_option and not (self._repr_fits_horizontal_() + and self._repr_fits_vertical_()) def __repr__(self) -> str: """ @@ -1118,7 +1138,8 @@ def to_string( "every integers corresponds with one column. If a dict is given, the key " "references the column, while the value defines the space to use.", ) - @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + @Substitution(shared_params=fmt.common_docstring, + returns=fmt.return_docstring) def to_string( self, buf: FilePath | WriteBuffer[str] | None = None, @@ -1217,9 +1238,7 @@ def style(self) -> Styler: return Styler(self) - _shared_docs[ - "items" - ] = r""" + _shared_docs["items"] = r""" Iterate over (column name, Series) pairs. Iterates over the DataFrame columns, returning a tuple with @@ -1334,9 +1353,9 @@ def iterrows(self) -> Iterable[tuple[Hashable, Series]]: s = klass(v, index=columns, name=k) yield k, s - def itertuples( - self, index: bool = True, name: str | None = "Pandas" - ) -> Iterable[tuple[Any, ...]]: + def itertuples(self, + index: bool = True, + name: str | None = "Pandas") -> Iterable[tuple[Any, ...]]: """ Iterate over DataFrame rows as namedtuples. 
@@ -1411,8 +1430,7 @@ def itertuples( # https://github.com/python/mypy/issues/9046 # error: namedtuple() expects a string literal as the first argument itertuple = collections.namedtuple( # type: ignore[misc] - name, fields, rename=True - ) + name, fields, rename=True) return map(itertuple._make, zip(*arrays)) # fallback to regular tuples @@ -1511,7 +1529,8 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: """ if isinstance(other, (Series, DataFrame)): common = self.columns.union(other.index) - if len(common) > len(self.columns) or len(common) > len(other.index): + if len(common) > len(self.columns) or len(common) > len( + other.index): raise ValueError("matrices are not aligned") left = self.reindex(columns=common, copy=False) @@ -1528,11 +1547,12 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: ) if isinstance(other, DataFrame): - return self._constructor( - np.dot(lvals, rvals), index=left.index, columns=other.columns - ) + return self._constructor(np.dot(lvals, rvals), + index=left.index, + columns=other.columns) elif isinstance(other, Series): - return self._constructor_sliced(np.dot(lvals, rvals), index=left.index) + return self._constructor_sliced(np.dot(lvals, rvals), + index=left.index) elif isinstance(rvals, (np.ndarray, Index)): result = np.dot(lvals, rvals) if result.ndim == 2: @@ -1548,13 +1568,13 @@ def __matmul__(self, other: Series) -> Series: @overload def __matmul__( - self, other: AnyArrayLike | DataFrame | Series - ) -> DataFrame | Series: + self, + other: AnyArrayLike | DataFrame | Series) -> DataFrame | Series: ... def __matmul__( - self, other: AnyArrayLike | DataFrame | Series - ) -> DataFrame | Series: + self, + other: AnyArrayLike | DataFrame | Series) -> DataFrame | Series: """ Matrix multiplication using binary `@` operator in Python>=3.5. """ @@ -1677,7 +1697,8 @@ def from_dict( data, index = list(data.values()), list(data.keys()) elif orient in ("columns", "tight"): if columns is not None: - raise ValueError(f"cannot use columns parameter with orient='{orient}'") + raise ValueError( + f"cannot use columns parameter with orient='{orient}'") else: # pragma: no cover raise ValueError("only recognize index or columns for orient") @@ -1876,12 +1897,12 @@ def to_dict(self, orient: str = "dict", into=dict): orient = orient.lower() # GH32515 if orient.startswith(("d", "l", "s", "r", "i")) and orient not in { - "dict", - "list", - "series", - "split", - "records", - "index", + "dict", + "list", + "series", + "split", + "records", + "index", }: warnings.warn( "Using short name for 'orient' is deprecated. 
Only the " @@ -1912,57 +1933,51 @@ def to_dict(self, orient: str = "dict", into=dict): return into_c((k, v.tolist()) for k, v in self.items()) elif orient == "split": - return into_c( + return into_c(( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), ( - ("index", self.index.tolist()), - ("columns", self.columns.tolist()), - ( - "data", - [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ], - ), - ) - ) + "data", + [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ], + ), + )) elif orient == "tight": - return into_c( + return into_c(( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), ( - ("index", self.index.tolist()), - ("columns", self.columns.tolist()), - ( - "data", - [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ], - ), - ("index_names", list(self.index.names)), - ("column_names", list(self.columns.names)), - ) - ) + "data", + [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ], + ), + ("index_names", list(self.index.names)), + ("column_names", list(self.columns.names)), + )) elif orient == "series": return into_c((k, v) for k, v in self.items()) elif orient == "records": columns = self.columns.tolist() - rows = ( - dict(zip(columns, row)) - for row in self.itertuples(index=False, name=None) - ) + rows = (dict(zip(columns, row)) + for row in self.itertuples(index=False, name=None)) return [ - into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows + into_c((k, maybe_box_native(v)) for k, v in row.items()) + for row in rows ] elif orient == "index": if not self.index.is_unique: - raise ValueError("DataFrame index must be unique for orient='index'.") - return into_c( - (t[0], dict(zip(self.columns, t[1:]))) - for t in self.itertuples(name=None) - ) + raise ValueError( + "DataFrame index must be unique for orient='index'.") + return into_c((t[0], dict(zip(self.columns, t[1:]))) + for t in self.itertuples(name=None)) else: raise ValueError(f"orient '{orient}' not understood") @@ -2162,8 +2177,8 @@ def from_records( columns = ensure_index(columns) def maybe_reorder( - arrays: list[ArrayLike], arr_columns: Index, columns: Index, index - ) -> tuple[list[ArrayLike], Index, Index | None]: + arrays: list[ArrayLike], arr_columns: Index, columns: Index, + index) -> tuple[list[ArrayLike], Index, Index | None]: """ If our desired 'columns' do not match the data's pre-existing 'arr_columns', we re-order our arrays. This is like a pre-emptive (cheap) reindex. 
@@ -2178,7 +2193,8 @@ def maybe_reorder( # for backward compat use an object Index instead of RangeIndex result_index = Index([]) - arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length) + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, + length) return arrays, arr_columns, result_index if is_iterator(data): @@ -2220,8 +2236,7 @@ def maybe_reorder( arr_columns = Index(arr_columns_list) arrays, arr_columns, result_index = maybe_reorder( - arrays, arr_columns, columns, index - ) + arrays, arr_columns, columns, index) elif isinstance(data, (np.ndarray, DataFrame)): arrays, columns = to_arrays(data, columns) @@ -2244,8 +2259,7 @@ def maybe_reorder( columns = arr_columns else: arrays, arr_columns, result_index = maybe_reorder( - arrays, arr_columns, columns, index - ) + arrays, arr_columns, columns, index) if exclude is None: exclude = set() @@ -2262,12 +2276,15 @@ def maybe_reorder( result_index = Index([], name=index) else: try: - index_data = [arrays[arr_columns.get_loc(field)] for field in index] + index_data = [ + arrays[arr_columns.get_loc(field)] for field in index + ] except (KeyError, TypeError): # raised by get_loc, see GH#29258 result_index = index else: - result_index = ensure_index_from_sequences(index_data, names=index) + result_index = ensure_index_from_sequences(index_data, + names=index) exclude.update(index) if any(exclude): @@ -2282,9 +2299,10 @@ def maybe_reorder( return cls(mgr) - def to_records( - self, index=True, column_dtypes=None, index_dtypes=None - ) -> np.recarray: + def to_records(self, + index=True, + column_dtypes=None, + index_dtypes=None) -> np.recarray: """ Convert DataFrame to a NumPy record array. @@ -2385,9 +2403,14 @@ def to_records( elif index_names[0] is None: index_names = ["index"] - names = [str(name) for name in itertools.chain(index_names, self.columns)] + names = [ + str(name) + for name in itertools.chain(index_names, self.columns) + ] else: - arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))] + arrays = [ + np.asarray(self.iloc[:, i]) for i in range(len(self.columns)) + ] names = [str(c) for c in self.columns] index_names = [] @@ -2443,7 +2466,11 @@ def to_records( msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}" raise ValueError(msg) - return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) + return np.rec.fromarrays(arrays, + dtype={ + "names": names, + "formats": formats + }) @classmethod def _from_arrays( @@ -2619,7 +2646,8 @@ def to_stata( >>> df.to_stata('animals.dta') # doctest: +SKIP """ if version not in (114, 117, 118, 119, None): - raise ValueError("Only formats 114, 117, 118 and 119 are supported.") + raise ValueError( + "Only formats 114, 117, 118 and 119 are supported.") if version == 114: if convert_strl is not None: raise ValueError("strl is not supported in format 114") @@ -2660,7 +2688,8 @@ def to_stata( writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: + def to_feather(self, path: FilePath | WriteBuffer[bytes], + **kwargs) -> None: """ Write a DataFrame to the binary Feather format. @@ -2860,7 +2889,8 @@ def to_parquet( " .. 
versionadded:: 0.25.0\n" " Ability to use str", ) - @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + @Substitution(shared_params=fmt.common_docstring, + returns=fmt.return_docstring) def to_html( self, buf: FilePath | WriteBuffer[str] | None = None, @@ -2949,11 +2979,13 @@ def to_html( @doc( storage_options=_shared_docs["storage_options"], - compression_options=_shared_docs["compression_options"] % "path_or_buffer", + compression_options=_shared_docs["compression_options"] % + "path_or_buffer", ) def to_xml( self, - path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] + | None = None, index: bool = True, root_name: str | None = "data", row_name: str | None = "row", @@ -2966,7 +2998,8 @@ def to_xml( xml_declaration: bool | None = True, pretty_print: bool | None = True, parser: str | None = "lxml", - stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None, + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] + | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> str | None: @@ -3122,8 +3155,7 @@ def to_xml( TreeBuilder = LxmlXMLFormatter else: raise ImportError( - "lxml not found, please install or use the etree parser." - ) + "lxml not found, please install or use the etree parser.") elif parser == "etree": TreeBuilder = EtreeXMLFormatter @@ -3165,7 +3197,8 @@ def info( ) -> None: if null_counts is not None: if show_counts is not None: - raise ValueError("null_counts used with show_counts. Use show_counts.") + raise ValueError( + "null_counts used with show_counts. Use show_counts.") warnings.warn( "null_counts is deprecated. Use show_counts instead", FutureWarning, @@ -3273,13 +3306,15 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: 5244 """ result = self._constructor_sliced( - [c.memory_usage(index=False, deep=deep) for col, c in self.items()], + [ + c.memory_usage(index=False, deep=deep) + for col, c in self.items() + ], index=self.columns, ) if index: index_memory_usage = self._constructor_sliced( - self.index.memory_usage(deep=deep), index=["Index"] - ) + self.index.memory_usage(deep=deep), index=["Index"]) result = index_memory_usage._append(result) return result @@ -3391,26 +3426,31 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: if copy: new_vals = new_vals.copy() - result = self._constructor(new_vals, index=self.columns, columns=self.index) + result = self._constructor(new_vals, + index=self.columns, + columns=self.index) - elif ( - self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]) - ): + elif (self._is_homogeneous_type and dtypes + and is_extension_array_dtype(dtypes[0])): # We have EAs with the same dtype. We can preserve that dtype in transpose. 
dtype = dtypes[0] arr_type = dtype.construct_array_type() values = self.values - new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] - result = type(self)._from_arrays( - new_values, index=self.columns, columns=self.index - ) + new_values = [ + arr_type._from_sequence(row, dtype=dtype) for row in values + ] + result = type(self)._from_arrays(new_values, + index=self.columns, + columns=self.index) else: new_arr = self.values.T if copy: new_arr = new_arr.copy() - result = self._constructor(new_arr, index=self.columns, columns=self.index) + result = self._constructor(new_arr, + index=self.columns, + columns=self.index) return result.__finalize__(self, method="transpose") @@ -3437,7 +3477,8 @@ def _ixs(self, i: int, axis: int = 0): new_values = self._mgr.fast_xs(i) # if we are a copy, mark as such - copy = isinstance(new_values, np.ndarray) and new_values.base is None + copy = isinstance(new_values, + np.ndarray) and new_values.base is None result = self._constructor_sliced( new_values, index=self.columns, @@ -3491,8 +3532,7 @@ def __getitem__(self, key): if indexer is not None: if isinstance(indexer, np.ndarray): indexer = lib.maybe_indices_to_slice( - indexer.astype(np.intp, copy=False), len(self) - ) + indexer.astype(np.intp, copy=False), len(self)) if isinstance(indexer, np.ndarray): # GH#43223 If we can not convert, use take return self.take(indexer, axis=0) @@ -3554,8 +3594,7 @@ def _getitem_bool_array(self, key): ) elif len(key) != len(self.index): raise ValueError( - f"Item wrong length {len(key)} instead of {len(self.index)}." - ) + f"Item wrong length {len(key)} instead of {len(self.index)}.") # check_bool_indexer will throw exception if Series key cannot # be reindexed to match DataFrame rows @@ -3574,9 +3613,9 @@ def _getitem_multilevel(self, key): result.columns = result_columns else: new_values = self.values[:, loc] - result = self._constructor( - new_values, index=self.index, columns=result_columns - ) + result = self._constructor(new_values, + index=self.index, + columns=result_columns) result = result.__finalize__(self) # If there is only one column being returned, and its name is @@ -3592,9 +3631,9 @@ def _getitem_multilevel(self, key): if top == "": result = result[""] if isinstance(result, Series): - result = self._constructor_sliced( - result, index=self.index, name=key - ) + result = self._constructor_sliced(result, + index=self.index, + name=key) result._set_is_copy(self) return result @@ -3656,11 +3695,8 @@ def __setitem__(self, key, value): self._setitem_array(key, value) elif isinstance(value, DataFrame): self._set_item_frame_value(key, value) - elif ( - is_list_like(value) - and not self.columns.is_unique - and 1 < len(self.columns.get_indexer_for([key])) == len(value) - ): + elif (is_list_like(value) and not self.columns.is_unique + and 1 < len(self.columns.get_indexer_for([key])) == len(value)): # Column to set is duplicated self._setitem_array([key], value) else: @@ -3764,13 +3800,13 @@ def _setitem_frame(self, key, value): # df[df > df2] = 0 if isinstance(key, np.ndarray): if key.shape != self.shape: - raise ValueError("Array conditional must be same shape as self") + raise ValueError( + "Array conditional must be same shape as self") key = self._constructor(key, **self._construct_axes_dict()) if key.size and not is_bool_dtype(key.values): raise TypeError( - "Must pass DataFrame or 2-d ndarray with boolean values only" - ) + "Must pass DataFrame or 2-d ndarray with boolean values only") self._check_inplace_setting(value) 
self._check_setitem_copy() @@ -3790,8 +3826,7 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: # align right-hand-side columns if self.columns # is multi-index and self[key] is a sub-frame if isinstance(self.columns, MultiIndex) and isinstance( - loc, (slice, Series, np.ndarray, Index) - ): + loc, (slice, Series, np.ndarray, Index)): cols = maybe_droplevels(cols, key) if len(cols) and not cols.equals(value.columns): value = value.reindex(cols, axis=1) @@ -3800,9 +3835,10 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: arraylike = _reindex_for_setitem(value, self.index) self._set_item_mgr(key, arraylike) - def _iset_item_mgr( - self, loc: int | slice | np.ndarray, value, inplace: bool = False - ) -> None: + def _iset_item_mgr(self, + loc: int | slice | np.ndarray, + value, + inplace: bool = False) -> None: # when called from _set_item_mgr loc can be anything returned from get_loc self._mgr.iset(loc, value, inplace=inplace) self._clear_item_cache() @@ -3844,22 +3880,22 @@ def _set_item(self, key, value) -> None: """ value = self._sanitize_column(value) - if ( - key in self.columns - and value.ndim == 1 - and not is_extension_array_dtype(value) - ): + if (key in self.columns and value.ndim == 1 + and not is_extension_array_dtype(value)): # broadcast across multiple columns if necessary - if not self.columns.is_unique or isinstance(self.columns, MultiIndex): + if not self.columns.is_unique or isinstance( + self.columns, MultiIndex): existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)).T self._set_item_mgr(key, value) - def _set_value( - self, index: IndexLabel, col, value: Scalar, takeable: bool = False - ) -> None: + def _set_value(self, + index: IndexLabel, + col, + value: Scalar, + takeable: bool = False) -> None: """ Put single value at passed column and index. @@ -3914,7 +3950,9 @@ def _ensure_valid_index(self, value) -> None: if self.index.name is not None: index_copy.name = self.index.name - self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) + self._mgr = self._mgr.reindex_axis(index_copy, + axis=1, + fill_value=np.nan) def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: """ @@ -4336,14 +4374,15 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: 5 False 2.0 """ if not is_list_like(include): - include = (include,) if include is not None else () + include = (include, ) if include is not None else () if not is_list_like(exclude): - exclude = (exclude,) if exclude is not None else () + exclude = (exclude, ) if exclude is not None else () selection = (frozenset(include), frozenset(exclude)) if not any(selection): - raise ValueError("at least one of include or exclude must be nonempty") + raise ValueError( + "at least one of include or exclude must be nonempty") # convert the myriad valid dtypes object to a single representation def check_int_infer_dtype(dtypes): @@ -4351,7 +4390,8 @@ def check_int_infer_dtype(dtypes): for dtype in dtypes: # Numpy maps int to different types (int32, in64) on Windows and Linux # see https://github.com/numpy/numpy/issues/9464 - if (isinstance(dtype, str) and dtype == "int") or (dtype is int): + if (isinstance(dtype, str) + and dtype == "int") or (dtype is int): converted_dtypes.append(np.int32) converted_dtypes.append(np.int64) elif dtype == "float" or dtype is float: @@ -4369,12 +4409,13 @@ def check_int_infer_dtype(dtypes): # can't both include AND exclude! 
if not include.isdisjoint(exclude): - raise ValueError(f"include and exclude overlap on {(include & exclude)}") + raise ValueError( + f"include and exclude overlap on {(include & exclude)}") def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: return issubclass(dtype.type, tuple(dtypes_set)) or ( - np.number in dtypes_set and getattr(dtype, "_is_numeric", False) - ) + np.number in dtypes_set + and getattr(dtype, "_is_numeric", False)) def predicate(arr: ArrayLike) -> bool: dtype = arr.dtype @@ -4446,10 +4487,8 @@ def insert( if allow_duplicates is lib.no_default: allow_duplicates = False if allow_duplicates and not self.flags.allows_duplicate_labels: - raise ValueError( - "Cannot specify 'allow_duplicates=True' when " - "'self.flags.allows_duplicate_labels' is False." - ) + raise ValueError("Cannot specify 'allow_duplicates=True' when " + "'self.flags.allows_duplicate_labels' is False.") if not allow_duplicates and column in self.columns: # Should this be a different kind of error?? raise ValueError(f"cannot insert {column}, already exists") @@ -4553,15 +4592,15 @@ def _sanitize_column(self, value) -> ArrayLike: @property def _series(self): return { - item: Series( - self._mgr.iget(idx), index=self.index, name=item, fastpath=True - ) + item: Series(self._mgr.iget(idx), + index=self.index, + name=item, + fastpath=True) for idx, item in enumerate(self.columns) } - def lookup( - self, row_labels: Sequence[IndexLabel], col_labels: Sequence[IndexLabel] - ) -> np.ndarray: + def lookup(self, row_labels: Sequence[IndexLabel], + col_labels: Sequence[IndexLabel]) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. Given equal-length arrays of row and column labels, return an @@ -4585,12 +4624,10 @@ def lookup( numpy.ndarray The found values. """ - msg = ( - "The 'lookup' method is deprecated and will be " - "removed in a future version. " - "You can use DataFrame.melt and DataFrame.loc " - "as a substitute." - ) + msg = ("The 'lookup' method is deprecated and will be " + "removed in a future version. 
" + "You can use DataFrame.melt and DataFrame.loc " + "as a substitute.") warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) n = len(row_labels) @@ -4598,7 +4635,8 @@ def lookup( raise ValueError("Row labels must have same size as column labels") if not (self.index.is_unique and self.columns.is_unique): # GH#33041 - raise ValueError("DataFrame.lookup requires unique index and columns") + raise ValueError( + "DataFrame.lookup requires unique index and columns") thresh = 1000 if not self._is_mixed_type or n > thresh: @@ -4624,20 +4662,19 @@ def lookup( # ---------------------------------------------------------------------- # Reindexing and alignment - def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): + def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, + copy): frame = self columns = axes["columns"] if columns is not None: - frame = frame._reindex_columns( - columns, method, copy, level, fill_value, limit, tolerance - ) + frame = frame._reindex_columns(columns, method, copy, level, + fill_value, limit, tolerance) index = axes["index"] if index is not None: - frame = frame._reindex_index( - index, method, copy, level, fill_value, limit, tolerance - ) + frame = frame._reindex_index(index, method, copy, level, + fill_value, limit, tolerance) return frame @@ -4651,9 +4688,11 @@ def _reindex_index( limit=None, tolerance=None, ): - new_index, indexer = self.index.reindex( - new_index, method=method, level=level, limit=limit, tolerance=tolerance - ) + new_index, indexer = self.index.reindex(new_index, + method=method, + level=level, + limit=limit, + tolerance=tolerance) return self._reindex_with_indexers( {0: [new_index, indexer]}, copy=copy, @@ -4671,9 +4710,11 @@ def _reindex_columns( limit=None, tolerance=None, ): - new_columns, indexer = self.columns.reindex( - new_columns, method=method, level=level, limit=limit, tolerance=tolerance - ) + new_columns, indexer = self.columns.reindex(new_columns, + method=method, + level=level, + limit=limit, + tolerance=tolerance) return self._reindex_with_indexers( {1: [new_columns, indexer]}, copy=copy, @@ -4681,9 +4722,8 @@ def _reindex_columns( allow_dups=False, ) - def _reindex_multi( - self, axes: dict[str, Index], copy: bool, fill_value - ) -> DataFrame: + def _reindex_multi(self, axes: dict[str, Index], copy: bool, + fill_value) -> DataFrame: """ We are guaranteed non-Nones in the axes. """ @@ -4698,11 +4738,18 @@ def _reindex_multi( # ensures that self.values is cheap. It may be worth making this # condition more specific. indexer = row_indexer, col_indexer - new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) - return self._constructor(new_values, index=new_index, columns=new_columns) + new_values = take_2d_multi(self.values, + indexer, + fill_value=fill_value) + return self._constructor(new_values, + index=new_index, + columns=new_columns) else: return self._reindex_with_indexers( - {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, + { + 0: [new_index, row_indexer], + 1: [new_columns, col_indexer] + }, copy=copy, fill_value=fill_value, ) @@ -4735,9 +4782,10 @@ def align( ) @overload - def set_axis( - self, labels, axis: Axis = ..., inplace: Literal[False] = ... - ) -> DataFrame: + def set_axis(self, + labels, + axis: Axis = ..., + inplace: Literal[False] = ...) -> DataFrame: ... @overload @@ -4749,14 +4797,15 @@ def set_axis(self, labels, *, inplace: Literal[True]) -> None: ... 
@overload - def set_axis( - self, labels, axis: Axis = ..., inplace: bool = ... - ) -> DataFrame | None: + def set_axis(self, + labels, + axis: Axis = ..., + inplace: bool = ...) -> DataFrame | None: ... - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) - @Appender( - """ + @deprecate_nonkeyword_arguments(version=None, + allowed_args=["self", "labels"]) + @Appender(""" Examples -------- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) @@ -4785,8 +4834,7 @@ def set_axis( 0 1 4 1 2 5 2 3 6 - """ - ) + """) @Substitution( **_shared_doc_kwargs, extended_summary_sub=" column or", @@ -4811,14 +4859,16 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): ], ) def reindex(self, *args, **kwargs) -> DataFrame: - axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") + axes = validate_axis_style_args(self, args, kwargs, "labels", + "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names kwargs.pop("axis", None) kwargs.pop("labels", None) return super().reindex(**kwargs) - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) + @deprecate_nonkeyword_arguments(version=None, + allowed_args=["self", "labels"]) def drop( self, labels=None, @@ -5221,7 +5271,8 @@ def fillna( ) -> DataFrame | None: ... - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"]) + @deprecate_nonkeyword_arguments(version=None, + allowed_args=["self", "value"]) @doc(NDFrame.fillna, **_shared_doc_kwargs) def fillna( self, @@ -5303,9 +5354,8 @@ def replace( method=method, ) - def _replace_columnwise( - self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex - ): + def _replace_columnwise(self, mapping: dict[Hashable, tuple[Any, Any]], + inplace: bool, regex): """ Dispatch to Series.replace column-wise. @@ -5366,45 +5416,39 @@ def shift( for col in range(min(ncols, abs(periods))): # Define filler inside loop so we get a copy filler = self.iloc[:, -1].shift(len(self)) - result.insert( - len(result.columns), label, filler, allow_duplicates=True - ) + result.insert(len(result.columns), + label, + filler, + allow_duplicates=True) result.columns = self.columns.copy() return result - elif ( - axis == 1 - and periods != 0 - and fill_value is not lib.no_default - and ncols > 0 - ): + elif (axis == 1 and periods != 0 and fill_value is not lib.no_default + and ncols > 0): arrays = self._mgr.arrays if len(arrays) > 1 or ( - # If we only have one block and we know that we can't - # keep the same dtype (i.e. the _can_hold_element check) - # then we can go through the reindex_indexer path - # (and avoid casting logic in the Block method). - # The exception to this (until 2.0) is datetimelike - # dtypes with integers, which cast. - not can_hold_element(arrays[0], fill_value) - # TODO(2.0): remove special case for integer-with-datetimelike - # once deprecation is enforced - and not ( - lib.is_integer(fill_value) and needs_i8_conversion(arrays[0].dtype) - ) - ): + # If we only have one block and we know that we can't + # keep the same dtype (i.e. the _can_hold_element check) + # then we can go through the reindex_indexer path + # (and avoid casting logic in the Block method). + # The exception to this (until 2.0) is datetimelike + # dtypes with integers, which cast. 
+ not can_hold_element(arrays[0], fill_value) + # TODO(2.0): remove special case for integer-with-datetimelike + # once deprecation is enforced + and not (lib.is_integer(fill_value) + and needs_i8_conversion(arrays[0].dtype))): # GH#35488 we need to watch out for multi-block cases # We only get here with fill_value not-lib.no_default nper = abs(periods) nper = min(nper, ncols) if periods > 0: - indexer = np.array( - [-1] * nper + list(range(ncols - periods)), dtype=np.intp - ) + indexer = np.array([-1] * nper + + list(range(ncols - periods)), + dtype=np.intp) else: - indexer = np.array( - list(range(nper, ncols)) + [-1] * nper, dtype=np.intp - ) + indexer = np.array(list(range(nper, ncols)) + [-1] * nper, + dtype=np.intp) mgr = self._mgr.reindex_indexer( self.columns, indexer, @@ -5415,11 +5459,13 @@ def shift( res_df = self._constructor(mgr) return res_df.__finalize__(self, method="shift") - return super().shift( - periods=periods, freq=freq, axis=axis, fill_value=fill_value - ) + return super().shift(periods=periods, + freq=freq, + axis=axis, + fill_value=fill_value) - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "keys"]) + @deprecate_nonkeyword_arguments(version=None, + allowed_args=["self", "keys"]) def set_index( self, keys, @@ -5522,15 +5568,14 @@ def set_index( if not isinstance(keys, list): keys = [keys] - err_msg = ( - 'The parameter "keys" may be a column key, one-dimensional ' - "array, or a list containing only valid column keys and " - "one-dimensional arrays." - ) + err_msg = ('The parameter "keys" may be a column key, one-dimensional ' + "array, or a list containing only valid column keys and " + "one-dimensional arrays.") missing: list[Hashable] = [] for col in keys: - if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)): + if isinstance(col, + (Index, Series, np.ndarray, list, abc.Iterator)): # arrays are fine as long as they are one-dimensional # iterators get converted to list below if getattr(col, "ndim", 1) != 1: @@ -5600,8 +5645,7 @@ def set_index( # ensure_index_from_sequences would not raise for append=False. raise ValueError( f"Length mismatch: Expected {len(self)} rows, " - f"received array of length {len(arrays[-1])}" - ) + f"received array of length {len(arrays[-1])}") index = ensure_index_from_sequences(arrays, names) @@ -5692,7 +5736,8 @@ def reset_index( ) -> DataFrame | None: ... 
- @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) + @deprecate_nonkeyword_arguments(version=None, + allowed_args=["self", "level"]) def reset_index( self, level: Hashable | Sequence[Hashable] | None = None, @@ -5854,7 +5899,8 @@ class max type else: new_obj = self.copy() if allow_duplicates is not lib.no_default: - allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") + allow_duplicates = validate_bool_kwarg(allow_duplicates, + "allow_duplicates") new_index = default_index(len(new_obj)) if level is not None: @@ -5871,8 +5917,9 @@ class max type to_insert = zip(self.index.levels, self.index.codes) else: default = "index" if "index" not in self else "level_0" - names = [default] if self.index.name is None else [self.index.name] - to_insert = ((self.index, None),) + names = [default + ] if self.index.name is None else [self.index.name] + to_insert = ((self.index, None), ) multi_col = isinstance(self.columns, MultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): @@ -5880,13 +5927,13 @@ class max type continue name = names[i] if multi_col: - col_name = list(name) if isinstance(name, tuple) else [name] + col_name = list(name) if isinstance(name, + tuple) else [name] if col_fill is None: if len(col_name) not in (1, self.columns.nlevels): raise ValueError( "col_fill=None is incompatible " - f"with incomplete column name {name}" - ) + f"with incomplete column name {name}") col_fill = col_name[0] lev_num = self.columns._get_level_number(col_level) @@ -5902,9 +5949,10 @@ class max type if lab is not None: # if we have the codes, extract the values with a mask - level_values = algorithms.take( - level_values, lab, allow_fill=True, fill_value=lev._na_value - ) + level_values = algorithms.take(level_values, + lab, + allow_fill=True, + fill_value=lev._na_value) new_obj.insert( 0, @@ -6060,7 +6108,8 @@ def dropna( inplace = validate_bool_kwarg(inplace, "inplace") if isinstance(axis, (tuple, list)): # GH20987 - raise TypeError("supplying multiple axes to axis is no longer supported.") + raise TypeError( + "supplying multiple axes to axis is no longer supported.") axis = self._get_axis_number(axis) agg_axis = 1 - axis @@ -6102,7 +6151,8 @@ def dropna( else: return result - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"]) + @deprecate_nonkeyword_arguments(version=None, + allowed_args=["self", "subset"]) def drop_duplicates( self, subset: Hashable | Sequence[Hashable] | None = None, @@ -6306,13 +6356,9 @@ def f(vals) -> tuple[np.ndarray, int]: # Incompatible types in assignment (expression has type "Index", variable # has type "Sequence[Any]") subset = self.columns # type: ignore[assignment] - elif ( - not np.iterable(subset) - or isinstance(subset, str) - or isinstance(subset, tuple) - and subset in self.columns - ): - subset = (subset,) + elif (not np.iterable(subset) or isinstance(subset, str) + or isinstance(subset, tuple) and subset in self.columns): + subset = (subset, ) # needed for mypy since can't narrow types using np.iterable subset = cast(Sequence, subset) @@ -6340,7 +6386,8 @@ def f(vals) -> tuple[np.ndarray, int]: sort=False, xnull=False, ) - result = self._constructor_sliced(duplicated(ids, keep), index=self.index) + result = self._constructor_sliced(duplicated(ids, keep), + index=self.index) return result.__finalize__(self, method="duplicated") # ---------------------------------------------------------------------- @@ -6383,9 +6430,10 @@ def sort_values( # type: ignore[override] for (k, name) 
in zip(keys, by) ] - indexer = lexsort_indexer( - keys, orders=ascending, na_position=na_position, key=key - ) + indexer = lexsort_indexer(keys, + orders=ascending, + na_position=na_position, + key=key) elif len(by): # len(by) == 1 @@ -6401,20 +6449,21 @@ def sort_values( # type: ignore[override] if isinstance(ascending, (tuple, list)): ascending = ascending[0] - indexer = nargsort( - k, kind=kind, ascending=ascending, na_position=na_position, key=key - ) + indexer = nargsort(k, + kind=kind, + ascending=ascending, + na_position=na_position, + key=key) else: return self.copy() - new_data = self._mgr.take( - indexer, axis=self._get_block_manager_axis(axis), verify=False - ) + new_data = self._mgr.take(indexer, + axis=self._get_block_manager_axis(axis), + verify=False) if ignore_index: - new_data.set_axis( - self._get_block_manager_axis(axis), default_index(len(indexer)) - ) + new_data.set_axis(self._get_block_manager_axis(axis), + default_index(len(indexer))) result = self._constructor(new_data) if inplace: @@ -6656,13 +6705,15 @@ def value_counts( # Force MultiIndex for single column if len(subset) == 1: - counts.index = MultiIndex.from_arrays( - [counts.index], names=[counts.index.name] - ) + counts.index = MultiIndex.from_arrays([counts.index], + names=[counts.index.name]) return counts - def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: + def nlargest(self, + n: int, + columns: IndexLabel, + keep: str = "first") -> DataFrame: """ Return the first `n` rows ordered by `columns` in descending order. @@ -6767,9 +6818,13 @@ def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFram Italy 59000000 1937894 IT Brunei 434000 12128 BN """ - return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() + return algorithms.SelectNFrame(self, n=n, keep=keep, + columns=columns).nlargest() - def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: + def nsmallest(self, + n: int, + columns: IndexLabel, + keep: str = "first") -> DataFrame: """ Return the first `n` rows ordered by `columns` in ascending order. @@ -6865,20 +6920,16 @@ def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFra Anguilla 11300 311 AI Nauru 337000 182 NR """ - return algorithms.SelectNFrame( - self, n=n, keep=keep, columns=columns - ).nsmallest() + return algorithms.SelectNFrame(self, n=n, keep=keep, + columns=columns).nsmallest() @doc( Series.swaplevel, klass=_shared_doc_kwargs["klass"], - extra_params=dedent( - """axis : {0 or 'index', 1 or 'columns'}, default 0 + extra_params=dedent("""axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to swap levels on. 
0 or 'index' for row-wise, 1 or - 'columns' for column-wise.""" - ), - examples=dedent( - """\ + 'columns' for column-wise."""), + examples=dedent("""\ Examples -------- >>> df = pd.DataFrame( @@ -6928,15 +6979,18 @@ def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFra History Final exam January A Geography Final exam February B History Coursework March A - Geography Coursework April C""" - ), + Geography Coursework April C"""), ) - def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: + def swaplevel(self, + i: Axis = -2, + j: Axis = -1, + axis: Axis = 0) -> DataFrame: result = self.copy() axis = self._get_axis_number(axis) - if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover + if not isinstance(result._get_axis(axis), + MultiIndex): # pragma: no cover raise TypeError("Can only swap levels on a hierarchical axis.") if axis == 0: @@ -6947,7 +7001,9 @@ def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: result.columns = result.columns.swaplevel(i, j) return result - def reorder_levels(self, order: Sequence[Axis], axis: Axis = 0) -> DataFrame: + def reorder_levels(self, + order: Sequence[Axis], + axis: Axis = 0) -> DataFrame: """ Rearrange index levels using input order. May not drop or duplicate levels. @@ -6989,7 +7045,8 @@ class diet Reptiles Snakes """ axis = self._get_axis_number(axis) - if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover + if not isinstance(self._get_axis(axis), + MultiIndex): # pragma: no cover raise TypeError("Can only reorder levels on a hierarchical axis.") result = self.copy() @@ -7008,7 +7065,11 @@ class diet def _cmp_method(self, other, op): axis = 1 # only relevant for Series other case - self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None) + self, other = ops.align_method_FRAME(self, + other, + axis, + flex=False, + level=None) # See GH#4537 for discussion of scalar op behavior new_data = self._dispatch_frame_op(other, op, axis=axis) @@ -7019,16 +7080,23 @@ def _arith_method(self, other, op): return ops.frame_arith_method_with_reindex(self, other, op) axis = 1 # only relevant for Series other case - other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],)) + other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis], )) - self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None) + self, other = ops.align_method_FRAME(self, + other, + axis, + flex=True, + level=None) new_data = self._dispatch_frame_op(other, op, axis=axis) return self._construct_result(new_data) _logical_method = _arith_method - def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): + def _dispatch_frame_op(self, + right, + func: Callable, + axis: int | None = None): """ Evaluate the frame operation func(left, right) by evaluating column-by-column, dispatching to the Series implementation. 
@@ -7093,15 +7161,19 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): right = right._values with np.errstate(all="ignore"): - arrays = [array_op(left, right) for left in self._iter_column_arrays()] + arrays = [ + array_op(left, right) + for left in self._iter_column_arrays() + ] else: # Remaining cases have less-obvious dispatch rules raise NotImplementedError(right) - return type(self)._from_arrays( - arrays, self.columns, self.index, verify_integrity=False - ) + return type(self)._from_arrays(arrays, + self.columns, + self.index, + verify_integrity=False) def _combine_frame(self, other: DataFrame, func, fill_value=None): # at this point we have `self._indexed_same(other)` @@ -7277,9 +7349,11 @@ def compare( keep_equal=keep_equal, ) - def combine( - self, other: DataFrame, func, fill_value=None, overwrite: bool = True - ) -> DataFrame: + def combine(self, + other: DataFrame, + func, + fill_value=None, + overwrite: bool = True) -> DataFrame: """ Perform column-wise combine with another DataFrame. @@ -7643,7 +7717,8 @@ def update( if join != "left": # pragma: no cover raise NotImplementedError("Only left join is supported") if errors not in ["ignore", "raise"]: - raise ValueError("The parameter errors must be either 'ignore' or 'raise'") + raise ValueError( + "The parameter errors must be either 'ignore' or 'raise'") if not isinstance(other, DataFrame): other = DataFrame(other) @@ -7676,8 +7751,7 @@ def update( # ---------------------------------------------------------------------- # Data reshaping - @Appender( - """ + @Appender(""" Examples -------- >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', @@ -7757,8 +7831,7 @@ def update( a 13.0 13.0 b 12.3 123.0 NaN 12.3 33.0 -""" - ) +""") @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) def groupby( self, @@ -7776,10 +7849,8 @@ def groupby( if squeeze is not no_default: warnings.warn( - ( - "The `squeeze` parameter is deprecated and " - "will be removed in a future version." - ), + ("The `squeeze` parameter is deprecated and " + "will be removed in a future version."), FutureWarning, stacklevel=find_stack_level(), ) @@ -7806,9 +7877,7 @@ def groupby( dropna=dropna, ) - _shared_docs[ - "pivot" - ] = """ + _shared_docs["pivot"] = """ Return reshaped DataFrame organized by given index / column values. Reshape data (produce a "pivot" table) based on column values. Uses @@ -7959,9 +8028,7 @@ def pivot(self, index=None, columns=None, values=None) -> DataFrame: return pivot(self, index=index, columns=columns, values=values) - _shared_docs[ - "pivot_table" - ] = """ + _shared_docs["pivot_table"] = """ Create a spreadsheet-style pivot table as a DataFrame. 
The levels in the pivot table will be stored in MultiIndex objects @@ -8408,8 +8475,7 @@ def explode( if is_scalar(column) or isinstance(column, tuple): columns = [column] elif isinstance(column, list) and all( - map(lambda c: is_scalar(c) or isinstance(c, tuple), column) - ): + map(lambda c: is_scalar(c) or isinstance(c, tuple), column)): if not column: raise ValueError("column must be nonempty") if len(column) > len(set(column)): @@ -8426,7 +8492,8 @@ def explode( counts0 = self[columns[0]].apply(mylen) for c in columns[1:]: if not all(counts0 == self[c].apply(mylen)): - raise ValueError("columns must have matching element counts") + raise ValueError( + "columns must have matching element counts") result = DataFrame({c: df[c].explode() for c in columns}) result = df.drop(columns, axis=1).join(result) if ignore_index: @@ -8534,8 +8601,7 @@ def melt( extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n " "Take difference over rows (0) or columns (1).\n", other_klass="Series", - examples=dedent( - """ + examples=dedent(""" Difference with previous row >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], @@ -8598,16 +8664,14 @@ def melt( >>> df.diff() a 0 NaN - 1 255.0""" - ), + 1 255.0"""), ) def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: if not lib.is_integer(periods): - if not ( - is_float(periods) - # error: "int" has no attribute "is_integer" - and periods.is_integer() # type: ignore[attr-defined] - ): + if not (is_float(periods) + # error: "int" has no attribute "is_integer" + and periods.is_integer() # type: ignore[attr-defined] + ): raise ValueError("periods must be an integer") periods = int(periods) @@ -8646,8 +8710,7 @@ def _gotitem( # TODO: _shallow_copy(subset)? return subset[key] - _agg_summary_and_see_also_doc = dedent( - """ + _agg_summary_and_see_also_doc = dedent(""" The aggregation operations are always performed over an axis, either the index (default) or the column axis. This behavior is different from `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, @@ -8667,11 +8730,9 @@ def _gotitem( core.window.Expanding : Perform operations over expanding window. core.window.ExponentialMovingWindow : Perform operation over exponential weighted window. - """ - ) + """) - _agg_examples_doc = dedent( - """ + _agg_examples_doc = dedent(""" Examples -------- >>> df = pd.DataFrame([[1, 2, 3], @@ -8712,8 +8773,7 @@ def _gotitem( 2 8.0 3 NaN dtype: float64 - """ - ) + """) @doc( _shared_docs["aggregate"], @@ -8753,9 +8813,11 @@ def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs): klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], ) - def transform( - self, func: AggFuncType, axis: Axis = 0, *args, **kwargs - ) -> DataFrame: + def transform(self, + func: AggFuncType, + axis: Axis = 0, + *args, + **kwargs) -> DataFrame: from pandas.core.apply import frame_apply op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) @@ -8764,13 +8826,13 @@ def transform( return result def apply( - self, - func: AggFuncType, - axis: Axis = 0, - raw: bool = False, - result_type=None, - args=(), - **kwargs, + self, + func: AggFuncType, + axis: Axis = 0, + raw: bool = False, + result_type=None, + args=(), + **kwargs, ): """ Apply a function along an axis of the DataFrame. 
@@ -8922,9 +8984,10 @@ def apply( ) return op.apply().__finalize__(self, method="apply") - def applymap( - self, func: PythonFuncType, na_action: str | None = None, **kwargs - ) -> DataFrame: + def applymap(self, + func: PythonFuncType, + na_action: str | None = None, + **kwargs) -> DataFrame: """ Apply a function to a Dataframe elementwise. @@ -8994,8 +9057,7 @@ def applymap( """ if na_action not in {"ignore", None}: raise ValueError( - f"na_action must be 'ignore' or None. Got {repr(na_action)}" - ) + f"na_action must be 'ignore' or None. Got {repr(na_action)}") ignore_na = na_action == "ignore" func = functools.partial(func, **kwargs) @@ -9003,7 +9065,9 @@ def applymap( def infer(x): if x.empty: return lib.map_infer(x, func, ignore_na=ignore_na) - return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na) + return lib.map_infer(x.astype(object)._values, + func, + ignore_na=ignore_na) return self.apply(infer).__finalize__(self, "applymap") @@ -9133,13 +9197,13 @@ def _append( if isinstance(other, (Series, dict)): if isinstance(other, dict): if not ignore_index: - raise TypeError("Can only append a dict if ignore_index=True") + raise TypeError( + "Can only append a dict if ignore_index=True") other = Series(other) if other.name is None and not ignore_index: raise TypeError( "Can only append a Series if ignore_index=True " - "or if the Series has a name" - ) + "or if the Series has a name") index = Index([other.name], name=self.index.name) idx_diff = other.index.difference(self.columns) @@ -9169,11 +9233,8 @@ def _append( verify_integrity=verify_integrity, sort=sort, ) - if ( - combined_columns is not None - and not sort - and not combined_columns.equals(result.columns) - ): + if (combined_columns is not None and not sort + and not combined_columns.equals(result.columns)): # TODO: reindexing here is a kludge bc union_indexes does not # pass sort to index.union, xref #43375 # combined_columns.equals check is necessary for preserving dtype @@ -9335,9 +9396,12 @@ def join( 4 K0 A4 B0 5 K1 A5 B1 """ - return self._join_compat( - other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort - ) + return self._join_compat(other, + on=on, + how=how, + lsuffix=lsuffix, + rsuffix=rsuffix, + sort=sort) def _join_compat( self, @@ -9389,21 +9453,27 @@ def _join_compat( # join indexes only using concat if can_concat: if how == "left": - res = concat( - frames, axis=1, join="outer", verify_integrity=True, sort=sort - ) + res = concat(frames, + axis=1, + join="outer", + verify_integrity=True, + sort=sort) return res.reindex(self.index, copy=False) else: - return concat( - frames, axis=1, join=how, verify_integrity=True, sort=sort - ) + return concat(frames, + axis=1, + join=how, + verify_integrity=True, + sort=sort) joined = frames[0] for frame in frames[1:]: - joined = merge( - joined, frame, how=how, left_index=True, right_index=True - ) + joined = merge(joined, + frame, + how=how, + left_index=True, + right_index=True) return joined @@ -9442,9 +9512,10 @@ def merge( validate=validate, ) - def round( - self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs - ) -> DataFrame: + def round(self, + decimals: int | dict[IndexLabel, int] | Series = 0, + *args, + **kwargs) -> DataFrame: """ Round a DataFrame to a variable number of decimal places. 
@@ -9541,20 +9612,21 @@ def _series_round(ser: Series, decimals: int): if isinstance(decimals, Series) and not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") if is_dict_like(decimals) and not all( - is_integer(value) for _, value in decimals.items() - ): + is_integer(value) for _, value in decimals.items()): raise TypeError("Values in decimals must be integers") new_cols = list(_dict_round(self, decimals)) elif is_integer(decimals): # Dispatch to Series.round new_cols = [_series_round(v, decimals) for _, v in self.items()] else: - raise TypeError("decimals must be an integer, a dict-like or a Series") + raise TypeError( + "decimals must be an integer, a dict-like or a Series") if len(new_cols) > 0: - return self._constructor( - concat(new_cols, axis=1), index=self.index, columns=self.columns - ).__finalize__(self, method="round") + return self._constructor(concat(new_cols, axis=1), + index=self.index, + columns=self.columns).__finalize__( + self, method="round") else: return self @@ -9658,15 +9730,15 @@ def corr( correl[i, j] = c correl[j, i] = c else: - raise ValueError( - "method must be either 'pearson', " - "'spearman', 'kendall', or a callable, " - f"'{method}' was supplied" - ) + raise ValueError("method must be either 'pearson', " + "'spearman', 'kendall', or a callable, " + f"'{method}' was supplied") return self._constructor(correl, index=idx, columns=cols) - def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame: + def cov(self, + min_periods: int | None = None, + ddof: int | None = 1) -> DataFrame: """ Compute pairwise covariance of columns, excluding NA/null values. @@ -9782,7 +9854,11 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame return self._constructor(base_cov, index=idx, columns=cols) - def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Series: + def corrwith(self, + other, + axis: Axis = 0, + drop=False, + method="pearson") -> Series: """ Compute pairwise correlation. 
@@ -9860,8 +9936,7 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie for i, r in enumerate(ndf): nonnull_mask = ~np.isnan(r) & ~np.isnan(k) corrs[numeric_cols[i]] = np.corrcoef( - r[nonnull_mask], k[nonnull_mask] - )[0, 1] + r[nonnull_mask], k[nonnull_mask])[0, 1] else: for i, r in enumerate(ndf): nonnull_mask = ~np.isnan(r) & ~np.isnan(k) @@ -9871,7 +9946,8 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie )[0, 1] return Series(corrs) else: - return this.apply(lambda x: other.corr(x, method=method), axis=axis) + return this.apply(lambda x: other.corr(x, method=method), + axis=axis) other = other._get_numeric_data() left, right = this.align(other, join="inner", copy=False) @@ -9899,16 +9975,14 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie def c(x): return nanops.nancorr(x[0], x[1], method=method) - correl = self._constructor_sliced( - map(c, zip(left.values.T, right.values.T)), index=left.columns - ) + correl = self._constructor_sliced(map( + c, zip(left.values.T, right.values.T)), + index=left.columns) else: - raise ValueError( - f"Invalid method {method} was passed, " - "valid methods are: 'pearson', 'kendall', " - "'spearman', or callable" - ) + raise ValueError(f"Invalid method {method} was passed, " + "valid methods are: 'pearson', 'kendall', " + "'spearman', or callable") if not drop: # Find non-matching labels along the given axis @@ -9919,17 +9993,17 @@ def c(x): if len(idx_diff) > 0: correl = correl._append( - Series([np.nan] * len(idx_diff), index=idx_diff) - ) + Series([np.nan] * len(idx_diff), index=idx_diff)) return correl # ---------------------------------------------------------------------- # ndarray-like stats methods - def count( - self, axis: Axis = 0, level: Level | None = None, numeric_only: bool = False - ): + def count(self, + axis: Axis = 0, + level: Level | None = None, + numeric_only: bool = False): """ Count non-NA cells for each column or row. 
@@ -10006,7 +10080,9 @@ def count( FutureWarning, stacklevel=find_stack_level(), ) - res = self._count_level(level, axis=axis, numeric_only=numeric_only) + res = self._count_level(level, + axis=axis, + numeric_only=numeric_only) return res.__finalize__(self, method="count") if numeric_only: @@ -10016,7 +10092,8 @@ def count( # GH #423 if len(frame._get_axis(axis)) == 0: - result = self._constructor_sliced(0, index=frame._get_agg_axis(axis)) + result = self._constructor_sliced(0, + index=frame._get_agg_axis(axis)) else: if frame._is_mixed_type or frame._mgr.any_extension_types: # the or any_extension_types is really only hit for single- @@ -10027,12 +10104,14 @@ def count( series_counts = notna(frame).sum(axis=axis) counts = series_counts.values result = self._constructor_sliced( - counts, index=frame._get_agg_axis(axis) - ) + counts, index=frame._get_agg_axis(axis)) return result.astype("int64").__finalize__(self, method="count") - def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False): + def _count_level(self, + level: Level, + axis: int = 0, + numeric_only: bool = False): if numeric_only: frame = self._get_numeric_data() else: @@ -10068,12 +10147,19 @@ def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False): level_name = count_axis._names[level] level_index = count_axis.levels[level]._rename(name=level_name) level_codes = ensure_platform_int(count_axis.codes[level]) - counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis) + counts = lib.count_level_2d(mask, + level_codes, + len(level_index), + axis=axis) if axis == 1: - result = self._constructor(counts, index=agg_axis, columns=level_index) + result = self._constructor(counts, + index=agg_axis, + columns=level_index) else: - result = self._constructor(counts, index=level_index, columns=agg_axis) + result = self._constructor(counts, + index=level_index, + columns=agg_axis) return result @@ -10127,8 +10213,7 @@ def func(values: np.ndarray): def blk_func(values, axis=1): if isinstance(values, ExtensionArray): if not is_1d_only_ea_dtype(values.dtype) and not isinstance( - self._mgr, ArrayManager - ): + self._mgr, ArrayManager): return values._reduce(name, axis=1, skipna=skipna, **kwds) return values._reduce(name, skipna=skipna, **kwds) else: @@ -10360,9 +10445,11 @@ def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series: """ axis = self._get_axis_number(axis) - res = self._reduce( - nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False - ) + res = self._reduce(nanops.nanargmin, + "argmin", + axis=axis, + skipna=skipna, + numeric_only=False) indices = res._values # indices will always be np.ndarray since axis is not None and @@ -10437,9 +10524,11 @@ def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series: """ axis = self._get_axis_number(axis) - res = self._reduce( - nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False - ) + res = self._reduce(nanops.nanargmax, + "argmax", + axis=axis, + skipna=skipna, + numeric_only=False) indices = res._values # indices will always be np.ndarray since axis is not None and @@ -10462,9 +10551,10 @@ def _get_agg_axis(self, axis_num: int) -> Index: else: raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") - def mode( - self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True - ) -> DataFrame: + def mode(self, + axis: Axis = 0, + numeric_only: bool = False, + dropna: bool = True) -> DataFrame: """ Get the mode(s) of each element along the selected axis. 
@@ -10631,9 +10721,10 @@ def quantile( if not is_list_like(q): # BlockManager.quantile expects listlike, so we wrap and unwrap here - res_df = self.quantile( - [q], axis=axis, numeric_only=numeric_only, interpolation=interpolation - ) + res_df = self.quantile([q], + axis=axis, + numeric_only=numeric_only, + interpolation=interpolation) res = res_df.iloc[0] if axis == 1 and len(self) == 0: # GH#41544 try to get an appropriate dtype @@ -10662,7 +10753,10 @@ def quantile( if is_list_like(q): res = self._constructor([], index=q, columns=cols, dtype=dtype) return res.__finalize__(self, method="quantile") - return self._constructor_sliced([], index=cols, name=q, dtype=dtype) + return self._constructor_sliced([], + index=cols, + name=q, + dtype=dtype) res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation) @@ -10755,9 +10849,10 @@ def to_timestamp( setattr(new_obj, axis_name, new_ax) return new_obj - def to_period( - self, freq: Frequency | None = None, axis: Axis = 0, copy: bool = True - ) -> DataFrame: + def to_period(self, + freq: Frequency | None = None, + axis: Axis = 0, + copy: bool = True) -> DataFrame: """ Convert DataFrame from DatetimeIndex to PeriodIndex. @@ -10885,10 +10980,8 @@ def isin(self, values) -> DataFrame: values = collections.defaultdict(list, values) result = concat( - ( - self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns) - ), + (self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns)), axis=1, ) elif isinstance(values, Series): @@ -10904,10 +10997,10 @@ def isin(self, values) -> DataFrame: raise TypeError( "only list-like or dict-like objects are allowed " "to be passed to DataFrame.isin(), " - f"you passed a '{type(values).__name__}'" - ) + f"you passed a '{type(values).__name__}'") result = self._constructor( - algorithms.isin(self.values.ravel(), values).reshape(self.shape), + algorithms.isin(self.values.ravel(), + values).reshape(self.shape), self.index, self.columns, ) @@ -10926,11 +11019,9 @@ def isin(self, values) -> DataFrame: _info_axis_name = "columns" index: Index = properties.AxisProperty( - axis=1, doc="The index (row labels) of the DataFrame." - ) + axis=1, doc="The index (row labels) of the DataFrame.") columns: Index = properties.AxisProperty( - axis=0, doc="The column labels of the DataFrame." 
- ) + axis=0, doc="The column labels of the DataFrame.") @property def _AXIS_NUMBERS(self) -> dict[str, int]: @@ -11067,9 +11158,8 @@ def bfill( ) -> DataFrame | None: return super().bfill(axis, inplace, limit, downcast) - @deprecate_nonkeyword_arguments( - version=None, allowed_args=["self", "lower", "upper"] - ) + @deprecate_nonkeyword_arguments(version=None, + allowed_args=["self", "lower", "upper"]) def clip( self: DataFrame, lower=None, @@ -11081,7 +11171,8 @@ def clip( ) -> DataFrame | None: return super().clip(lower, upper, axis, inplace, *args, **kwargs) - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"]) + @deprecate_nonkeyword_arguments(version=None, + allowed_args=["self", "method"]) def interpolate( self: DataFrame, method: str = "linear", @@ -11104,9 +11195,8 @@ def interpolate( **kwargs, ) - @deprecate_nonkeyword_arguments( - version=None, allowed_args=["self", "cond", "other"] - ) + @deprecate_nonkeyword_arguments(version=None, + allowed_args=["self", "cond", "other"]) def where( self, cond, @@ -11117,11 +11207,11 @@ def where( errors="raise", try_cast=lib.no_default, ): - return super().where(cond, other, inplace, axis, level, errors, try_cast) + return super().where(cond, other, inplace, axis, level, errors, + try_cast) - @deprecate_nonkeyword_arguments( - version=None, allowed_args=["self", "cond", "other"] - ) + @deprecate_nonkeyword_arguments(version=None, + allowed_args=["self", "cond", "other"]) def mask( self, cond, @@ -11132,7 +11222,8 @@ def mask( errors="raise", try_cast=lib.no_default, ): - return super().mask(cond, other, inplace, axis, level, errors, try_cast) + return super().mask(cond, other, inplace, axis, level, errors, + try_cast) DataFrame._add_numeric_operations() @@ -11164,6 +11255,5 @@ def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: raise err raise TypeError( - "incompatible index of inserted column with frame index" - ) from err + "incompatible index of inserted column with frame index") from err return reindexed_value diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 344dc7e003ea6..a98c28a36879a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -124,9 +124,8 @@ def prop(self): return property(prop) -def pin_allowlisted_properties( - klass: type[DataFrame | Series], allowlist: frozenset[str] -): +def pin_allowlisted_properties(klass: type[DataFrame | Series], + allowlist: frozenset[str]): """ Create GroupBy member defs for DataFrame/Series names in a allowlist. 
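The clip, interpolate, where and mask hunks just above only re-wrap the deprecate_nonkeyword_arguments decorator call; the deprecation it implements is unchanged. As a usage sketch of what that allow-list means for callers (the frame below is illustrative, and the exact warning text is not reproduced here):

import pandas as pd

df = pd.DataFrame({"a": [-3, 0, 5]})

# "self", "lower" and "upper" stay positional, so this is fine
df.clip(-2, 2)

# later parameters are meant to be keyword-only, so pass them by name;
# supplying one positionally (e.g. df.clip(-2, 2, 1)) triggers a deprecation warning
df.clip(-2, 2, axis=0)
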
@@ -185,8 +184,7 @@ def _get_data_to_aggregate(self) -> SingleManager: def _iterate_slices(self) -> Iterable[Series]: yield self._selected_obj - _agg_examples_doc = dedent( - """ + _agg_examples_doc = dedent(""" Examples -------- >>> s = pd.Series([1, 2, 3, 4]) @@ -232,28 +230,33 @@ def _iterate_slices(self) -> Iterable[Series]: 1 1.0 2 3.0 dtype: float64 - """ - ) + """) - @Appender( - _apply_docs["template"].format( - input="series", examples=_apply_docs["series_examples"] - ) - ) + @Appender(_apply_docs["template"].format( + input="series", examples=_apply_docs["series_examples"])) def apply(self, func, *args, **kwargs): return super().apply(func, *args, **kwargs) @doc(_agg_template, examples=_agg_examples_doc, klass="Series") - def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + def aggregate(self, + func=None, + *args, + engine=None, + engine_kwargs=None, + **kwargs): if maybe_use_numba(engine): with self._group_selection_context(): data = self._selected_obj - result = self._aggregate_with_numba( - data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs - ) + result = self._aggregate_with_numba(data.to_frame(), + func, + *args, + engine_kwargs=engine_kwargs, + **kwargs) index = self.grouper.result_index - return self.obj._constructor(result.ravel(), index=index, name=data.name) + return self.obj._constructor(result.ravel(), + index=index, + name=data.name) relabeling = func is None columns = None @@ -292,9 +295,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # result is a dict whose keys are the elements of result_index index = self.grouper.result_index - return create_series_with_explicit_dtype( - result, index=index, dtype_if_empty=object - ) + return create_series_with_explicit_dtype(result, + index=index, + dtype_if_empty=object) agg = aggregate @@ -307,7 +310,8 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame: raise SpecificationError("nested renamer is not supported") elif any(isinstance(x, (tuple, list)) for x in arg): - arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] + arg = [(x, x) if not isinstance(x, (tuple, list)) else x + for x in arg] # indicated column order columns = next(zip(*arg)) @@ -328,9 +332,9 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame: if any(isinstance(x, DataFrame) for x in results.values()): from pandas import concat - res_df = concat( - results.values(), axis=1, keys=[key.label for key in results.keys()] - ) + res_df = concat(results.values(), + axis=1, + keys=[key.label for key in results.keys()]) return res_df indexed_output = {key.position: val for key, val in results.items()} @@ -341,8 +345,7 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame: return output def _indexed_output_to_ndframe( - self, output: Mapping[base.OutputKey, ArrayLike] - ) -> Series: + self, output: Mapping[base.OutputKey, ArrayLike]) -> Series: """ Wrap the dict result of a GroupBy aggregation into a Series. 
""" @@ -395,12 +398,13 @@ def _wrap_applied_output( res_ser.name = self.obj.name return res_ser elif isinstance(values[0], (Series, DataFrame)): - return self._concat_objects(values, not_indexed_same=not_indexed_same) + return self._concat_objects(values, + not_indexed_same=not_indexed_same) else: # GH #6265 #24880 - result = self.obj._constructor( - data=values, index=self.grouper.result_index, name=self.obj.name - ) + result = self.obj._constructor(data=values, + index=self.grouper.result_index, + name=self.obj.name) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): @@ -424,24 +428,33 @@ def _aggregate_named(self, func, *args, **kwargs): @Substitution(klass="Series") @Appender(_transform_template) - def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): - return self._transform( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) - - def _cython_transform( - self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs - ): + def transform(self, + func, + *args, + engine=None, + engine_kwargs=None, + **kwargs): + return self._transform(func, + *args, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs) + + def _cython_transform(self, + how: str, + numeric_only: bool = True, + axis: int = 0, + **kwargs): assert axis == 0 # handled by caller obj = self._selected_obj try: - result = self.grouper._cython_operation( - "transform", obj._values, how, axis, **kwargs - ) + result = self.grouper._cython_operation("transform", obj._values, + how, axis, **kwargs) except NotImplementedError as err: - raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err + raise TypeError( + f"{how} is not supported for {obj.dtype} dtype") from err return obj._constructor(result, index=self.obj.index, name=obj.name) @@ -519,7 +532,8 @@ def true_and_notna(x) -> bool: try: indices = [ - self._get_index(name) for name, group in self if true_and_notna(group) + self._get_index(name) for name, group in self + if true_and_notna(group) ] except (ValueError, TypeError) as err: raise TypeError("the filter must return a boolean result") from err @@ -601,9 +615,8 @@ def value_counts( names = self.grouper.names + [self.obj.name] - if is_categorical_dtype(val.dtype) or ( - bins is not None and not np.iterable(bins) - ): + if is_categorical_dtype(val.dtype) or (bins is not None + and not np.iterable(bins)): # scalar bins cannot be done at top level # in a backward compatible way # GH38672 relates to categorical dtype @@ -686,7 +699,9 @@ def value_counts( if mask.all(): dropna = False else: - out, codes = out[mask], [level_codes[mask] for level_codes in codes] + out, codes = out[mask], [ + level_codes[mask] for level_codes in codes + ] if normalize: out = out.astype("float") @@ -713,7 +728,10 @@ def value_counts( ncat, nbin = diff.sum(), len(levels[-1]) - left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] + left = [ + np.repeat(np.arange(ncat), nbin), + np.tile(np.arange(nbin), ncat) + ] right = [diff.cumsum() - 1, codes[-1]] @@ -731,7 +749,10 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] codes.append(left[-1]) - mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) + mi = MultiIndex(levels=levels, + codes=codes, + names=names, + verify_integrity=False) if is_integer_dtype(out.dtype): out = ensure_int64(out) @@ -761,8 +782,7 @@ class DataFrameGroupBy(GroupBy[DataFrame]): _apply_allowlist = 
base.dataframe_apply_allowlist - _agg_examples_doc = dedent( - """ + _agg_examples_doc = dedent(""" Examples -------- >>> df = pd.DataFrame( @@ -843,20 +863,28 @@ class DataFrameGroupBy(GroupBy[DataFrame]): A 1 1.0 2 3.0 - """ - ) + """) @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame") - def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + def aggregate(self, + func=None, + *args, + engine=None, + engine_kwargs=None, + **kwargs): if maybe_use_numba(engine): with self._group_selection_context(): data = self._selected_obj - result = self._aggregate_with_numba( - data, func, *args, engine_kwargs=engine_kwargs, **kwargs - ) + result = self._aggregate_with_numba(data, + func, + *args, + engine_kwargs=engine_kwargs, + **kwargs) index = self.grouper.result_index - return self.obj._constructor(result, index=index, columns=data.columns) + return self.obj._constructor(result, + index=index, + columns=data.columns) relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) @@ -906,12 +934,13 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) if isinstance(sobj, Series): # GH#35246 test_groupby_as_index_select_column_sum_empty_df - result.columns = self._obj_with_exclusions.columns.copy() + result.columns = self._obj_with_exclusions.columns.copy( + ) else: # Retain our column names result.columns._set_names( - sobj.columns.names, level=list(range(sobj.columns.nlevels)) - ) + sobj.columns.names, + level=list(range(sobj.columns.nlevels))) # select everything except for the last level, which is the one # containing the name of the function(s), see GH#32040 result.columns = result.columns.droplevel(-1) @@ -960,7 +989,9 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: result_index = self.grouper.result_index other_ax = obj.axes[1 - self.axis] - out = self.obj._constructor(result, index=other_ax, columns=result_index) + out = self.obj._constructor(result, + index=other_ax, + columns=result_index) if self.axis == 0: out = out.T @@ -982,14 +1013,14 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: res_df.columns = obj.columns return res_df - def _wrap_applied_output( - self, data: DataFrame, values: list, not_indexed_same: bool = False - ): + def _wrap_applied_output(self, + data: DataFrame, + values: list, + not_indexed_same: bool = False): if len(values) == 0: - result = self.obj._constructor( - index=self.grouper.result_index, columns=data.columns - ) + result = self.obj._constructor(index=self.grouper.result_index, + columns=data.columns) result = result.astype(data.dtypes, copy=False) return result @@ -1000,7 +1031,8 @@ def _wrap_applied_output( # GH9684 - All values are None, return an empty frame. return self.obj._constructor() elif isinstance(first_not_none, DataFrame): - return self._concat_objects(values, not_indexed_same=not_indexed_same) + return self._concat_objects(values, + not_indexed_same=not_indexed_same) key_index = self.grouper.result_index if self.as_index else None @@ -1009,9 +1041,9 @@ def _wrap_applied_output( # fall through to the outer else clause # TODO: sure this is right? 
we used to do this # after raising AttributeError above - return self.obj._constructor_sliced( - values, index=key_index, name=self._selection - ) + return self.obj._constructor_sliced(values, + index=key_index, + name=self._selection) elif not isinstance(first_not_none, Series): # values are not series or array-like but scalars # self._selection not passed through to Series as the @@ -1020,14 +1052,14 @@ def _wrap_applied_output( if self.as_index: return self.obj._constructor_sliced(values, index=key_index) else: - result = self.obj._constructor(values, columns=[self._selection]) + result = self.obj._constructor(values, + columns=[self._selection]) self._insert_inaxis_grouper_inplace(result) return result else: # values are Series - return self._wrap_applied_output_series( - values, not_indexed_same, first_not_none, key_index - ) + return self._wrap_applied_output_series(values, not_indexed_same, + first_not_none, key_index) def _wrap_applied_output_series( self, @@ -1039,7 +1071,8 @@ def _wrap_applied_output_series( # this is to silence a DeprecationWarning # TODO(2.0): Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() - backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs) + backup = create_series_with_explicit_dtype(dtype_if_empty=object, + **kwargs) values = [x if (x is not None) else backup for x in values] all_indexed_same = all_indexes_same(x.index for x in values) @@ -1058,7 +1091,8 @@ def _wrap_applied_output_series( # if any of the sub-series are not indexed the same # OR we don't have a multi-index and we have only a # single values - return self._concat_objects(values, not_indexed_same=not_indexed_same) + return self._concat_objects(values, + not_indexed_same=not_indexed_same) # still a series # path added as of GH 5545 @@ -1091,16 +1125,20 @@ def _wrap_applied_output_series( if stacked_values.dtype == object: # We'll have the DataFrame constructor do inference stacked_values = stacked_values.tolist() - result = self.obj._constructor(stacked_values, index=index, columns=columns) + result = self.obj._constructor(stacked_values, + index=index, + columns=columns) if not self.as_index: self._insert_inaxis_grouper_inplace(result) return self._reindex_output(result) - def _cython_transform( - self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs - ) -> DataFrame: + def _cython_transform(self, + how: str, + numeric_only: bool = True, + axis: int = 0, + **kwargs) -> DataFrame: assert axis == 0 # handled by caller # TODO: no tests with self.ndim == 1 for DataFrameGroupBy @@ -1114,9 +1152,8 @@ def _cython_transform( mgr = mgr.get_numeric_data(copy=False) def arr_func(bvalues: ArrayLike) -> ArrayLike: - return self.grouper._cython_operation( - "transform", bvalues, how, 1, **kwargs - ) + return self.grouper._cython_operation("transform", bvalues, how, 1, + **kwargs) # We could use `mgr.apply` here and not have to set_axis, but # we would have to do shape gymnastics for ArrayManager compat @@ -1171,30 +1208,39 @@ def _transform_general(self, func, *args, **kwargs): concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 concatenated = concat(applied, axis=self.axis, verify_integrity=False) - concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) + concatenated = concatenated.reindex(concat_index, + axis=other_axis, + copy=False) return self._set_result_index_ordered(concatenated) @Substitution(klass="DataFrame") 
@Appender(_transform_template) - def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): - return self._transform( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + def transform(self, + func, + *args, + engine=None, + engine_kwargs=None, + **kwargs): + return self._transform(func, + *args, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs) def _define_paths(self, func, *args, **kwargs): if isinstance(func, str): fast_path = lambda group: getattr(group, func)(*args, **kwargs) - slow_path = lambda group: group.apply( - lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis - ) + slow_path = lambda group: group.apply(lambda x: getattr(x, func) + (*args, **kwargs), + axis=self.axis) else: fast_path = lambda group: func(group, *args, **kwargs) slow_path = lambda group: group.apply( - lambda x: func(x, *args, **kwargs), axis=self.axis - ) + lambda x: func(x, *args, **kwargs), axis=self.axis) return fast_path, slow_path - def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): + def _choose_path(self, fast_path: Callable, slow_path: Callable, + group: DataFrame): path = slow_path res = slow_path(group) @@ -1240,7 +1286,8 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: output[i] = sgb.transform(wrapper) except TypeError: # e.g. trying to call nanmean with string values - warn_dropping_nuisance_columns_deprecated(type(self), "transform") + warn_dropping_nuisance_columns_deprecated( + type(self), "transform") else: inds.append(i) @@ -1316,8 +1363,7 @@ def filter(self, func, dropna=True, *args, **kwargs): # non scalars aren't allowed raise TypeError( f"filter function returned a {type(res).__name__}, " - "but expected a scalar bool" - ) + "but expected a scalar bool") return self._apply_filter(indices, dropna) @@ -1397,9 +1443,9 @@ def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: # zip in reverse so we can always insert at loc 0 columns = result.columns for name, lev, in_axis in zip( - reversed(self.grouper.names), - reversed(self.grouper.get_group_levels()), - reversed([grp.in_axis for grp in self.grouper.groupings]), + reversed(self.grouper.names), + reversed(self.grouper.get_group_levels()), + reversed([grp.in_axis for grp in self.grouper.groupings]), ): # GH #28549 # When using .apply(-), name will be in columns already @@ -1407,14 +1453,14 @@ def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: result.insert(0, name, lev) def _indexed_output_to_ndframe( - self, output: Mapping[base.OutputKey, ArrayLike] - ) -> DataFrame: + self, output: Mapping[base.OutputKey, ArrayLike]) -> DataFrame: """ Wrap the dict result of a GroupBy aggregation into a DataFrame. 
""" indexed_output = {key.position: val for key, val in output.items()} columns = Index([key.label for key in output]) - columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names) + columns._set_names( + self._obj_with_exclusions._get_axis(1 - self.axis).names) result = self.obj._constructor(indexed_output) result.columns = columns @@ -1452,17 +1498,21 @@ def _iterate_column_groupbys(self, obj: DataFrame | Series): observed=self.observed, ) - def _apply_to_column_groupbys(self, func, obj: DataFrame | Series) -> DataFrame: + def _apply_to_column_groupbys(self, func, + obj: DataFrame | Series) -> DataFrame: from pandas.core.reshape.concat import concat columns = obj.columns results = [ - func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj) + func(col_groupby) + for _, col_groupby in self._iterate_column_groupbys(obj) ] if not results: # concat would raise - return DataFrame([], columns=columns, index=self.grouper.result_index) + return DataFrame([], + columns=columns, + index=self.grouper.result_index) else: return concat(results, keys=columns, axis=1) @@ -1517,8 +1567,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: obj = self._obj_with_exclusions results = self._apply_to_column_groupbys( - lambda sgb: sgb.nunique(dropna), obj=obj - ) + lambda sgb: sgb.nunique(dropna), obj=obj) if not self.as_index: results.index = Index(range(len(results))) @@ -1685,14 +1734,14 @@ def value_counts( """ if self.axis == 1: raise NotImplementedError( - "DataFrameGroupBy.value_counts only handles axis=0" - ) + "DataFrameGroupBy.value_counts only handles axis=0") with self._group_selection_context(): df = self.obj in_axis_names = { - grouping.name for grouping in self.grouper.groupings if grouping.in_axis + grouping.name + for grouping in self.grouper.groupings if grouping.in_axis } if isinstance(self._selected_obj, Series): name = self._selected_obj.name @@ -1708,10 +1757,8 @@ def value_counts( if subset is not None: clashing = set(subset) & set(in_axis_names) if clashing: - raise ValueError( - f"Keys {clashing} in subset cannot be in " - "the groupby column keys" - ) + raise ValueError(f"Keys {clashing} in subset cannot be in " + "the groupby column keys") groupings = list(self.grouper.groupings) for key in keys: @@ -1738,8 +1785,8 @@ def value_counts( # We are guaranteed to have the first N levels be the # user-requested grouping. 
levels = list( - range(len(self.grouper.groupings), result_series.index.nlevels) - ) + range(len(self.grouper.groupings), + result_series.index.nlevels)) indexed_group_size = result_series.groupby( result_series.index.droplevel(levels), sort=self.sort, @@ -1753,8 +1800,8 @@ def value_counts( # Sort the values and then resort by the main grouping index_level = range(len(self.grouper.groupings)) result_series = result_series.sort_values( - ascending=ascending - ).sort_index(level=index_level, sort_remaining=False) + ascending=ascending).sort_index(level=index_level, + sort_remaining=False) result: Series | DataFrame if self.as_index: @@ -1766,8 +1813,7 @@ def value_counts( columns = com.fill_missing_names(index.names) if name in columns: raise ValueError( - f"Column label '{name}' is duplicate of result column" - ) + f"Column label '{name}' is duplicate of result column") result_series.name = name result_series.index = index.set_names(range(len(columns))) result_frame = result_series.reset_index() @@ -1776,9 +1822,8 @@ def value_counts( return result.__finalize__(self.obj, method="value_counts") -def _wrap_transform_general_frame( - obj: DataFrame, group: DataFrame, res: DataFrame | Series -) -> DataFrame: +def _wrap_transform_general_frame(obj: DataFrame, group: DataFrame, + res: DataFrame | Series) -> DataFrame: from pandas import concat if isinstance(res, Series):