From 662b7363a26fe20d8cac7767448d7164dda03615 Mon Sep 17 00:00:00 2001 From: space monkey Date: Mon, 31 Oct 2022 21:11:49 +0530 Subject: [PATCH 1/2] trying did some progress --- modin/core/dataframe/algebra/binary.py | 6 +- .../algebra/default2pandas/binary.py | 5 +- .../dataframe/algebra/default2pandas/cat.py | 3 +- .../algebra/default2pandas/datetime.py | 4 +- .../algebra/default2pandas/default.py | 24 ++--- .../algebra/default2pandas/groupby.py | 60 ++++++------ .../algebra/default2pandas/resample.py | 7 +- .../algebra/default2pandas/rolling.py | 7 +- .../algebra/default2pandas/series.py | 4 +- .../dataframe/algebra/default2pandas/str.py | 4 +- modin/core/dataframe/algebra/fold.py | 5 +- modin/core/dataframe/algebra/groupby.py | 97 ++++++++++--------- modin/core/dataframe/algebra/map.py | 5 +- modin/core/dataframe/algebra/operator.py | 6 +- modin/core/dataframe/algebra/reduce.py | 5 +- modin/core/dataframe/algebra/tree_reduce.py | 5 +- 16 files changed, 130 insertions(+), 117 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index 2514d9dc31c..e6d6e4662d3 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -13,6 +13,7 @@ """Module houses builder class for Binary operator.""" +from typing import Any, Callable import numpy as np import pandas @@ -23,7 +24,7 @@ class Binary(Operator): """Builder class for Binary operator.""" @classmethod - def register(cls, func, join_type="outer", labels="replace"): + def register(cls, func: Callable, join_type : str ="outer", labels : str ="replace") -> Callable: """ Build template binary operator. @@ -44,8 +45,7 @@ def register(cls, func, join_type="outer", labels="replace"): """ def caller( - query_compiler, other, broadcast=False, *args, dtypes=None, **kwargs - ): + query_compiler: Any, other: Any, broadcast: bool = False, *args: Any, dtypes: Any | None = None, **kwargs: Any) -> Any : """ Apply binary `func` to passed operands. diff --git a/modin/core/dataframe/algebra/default2pandas/binary.py b/modin/core/dataframe/algebra/default2pandas/binary.py index 9186ec96504..f1fdf9a7876 100644 --- a/modin/core/dataframe/algebra/default2pandas/binary.py +++ b/modin/core/dataframe/algebra/default2pandas/binary.py @@ -13,6 +13,7 @@ """Module houses default binary functions builder class.""" +from typing import Any, Callable from .default import DefaultMethod import pandas @@ -23,7 +24,7 @@ class BinaryDefault(DefaultMethod): """Build default-to-pandas methods which executes binary functions.""" @classmethod - def build_default_to_pandas(cls, fn, fn_name): + def build_default_to_pandas(cls, fn: Callable, fn_name: str) -> Callable: """ Build function that do fallback to pandas for passed binary `fn`. @@ -41,7 +42,7 @@ def build_default_to_pandas(cls, fn, fn_name): to the casted to pandas frame. 
""" - def bin_ops_wrapper(df, other, *args, **kwargs): + def bin_ops_wrapper(df: Any, other: Any, *args: Any, **kwargs: Any) -> None: """Apply specified binary function to the passed operands.""" squeeze_other = kwargs.pop("broadcast", False) or kwargs.pop( "squeeze_other", False diff --git a/modin/core/dataframe/algebra/default2pandas/cat.py b/modin/core/dataframe/algebra/default2pandas/cat.py index 146b48a5cb2..22163586949 100644 --- a/modin/core/dataframe/algebra/default2pandas/cat.py +++ b/modin/core/dataframe/algebra/default2pandas/cat.py @@ -14,13 +14,14 @@ """Module houses default applied-on-category functions builder class.""" from .series import SeriesDefault +import pandas class CatDefault(SeriesDefault): """Builder for default-to-pandas methods which is executed under category accessor.""" @classmethod - def frame_wrapper(cls, df): + def frame_wrapper(cls, df: pandas.DataFrame) -> pandas.core.arrays.categorical.CategoricalAccessor: """ Get category accessor of the passed frame. diff --git a/modin/core/dataframe/algebra/default2pandas/datetime.py b/modin/core/dataframe/algebra/default2pandas/datetime.py index f29750eba8c..55d60a49fd0 100644 --- a/modin/core/dataframe/algebra/default2pandas/datetime.py +++ b/modin/core/dataframe/algebra/default2pandas/datetime.py @@ -14,13 +14,13 @@ """Module houses default applied-on-datetime functions builder class.""" from .series import SeriesDefault - +import pandas class DateTimeDefault(SeriesDefault): """Builder for default-to-pandas methods which is executed under datetime accessor.""" @classmethod - def frame_wrapper(cls, df): + def frame_wrapper(cls, df: pandas.DataFrame) -> pandas.core.indexes.accessors.DatetimeProperties: """ Get datetime accessor of the passed frame. diff --git a/modin/core/dataframe/algebra/default2pandas/default.py b/modin/core/dataframe/algebra/default2pandas/default.py index 0dc3e6493f9..4db86845f0c 100644 --- a/modin/core/dataframe/algebra/default2pandas/default.py +++ b/modin/core/dataframe/algebra/default2pandas/default.py @@ -13,6 +13,8 @@ """Module houses default functions builder class.""" +from typing import Any, Callable, Optional, Union +from xmlrpc.client import boolean from modin.core.dataframe.algebra import Operator from modin.utils import try_cast_to_pandas, MODIN_UNNAMED_SERIES_LABEL @@ -28,7 +30,7 @@ class ObjTypeDeterminer: to an object under which `key` function is applied. """ - def __getattr__(self, key): + def __getattr__(self, key: str) -> Callable: """ Build function that executes `key` function over passed frame. @@ -42,7 +44,7 @@ def __getattr__(self, key): Function that takes DataFrame and executes `key` function on it. """ - def func(df, *args, **kwargs): + def func(df, *args: Any, **kwargs : Any) -> Any: """Access specified attribute of the passed object and call it if it's callable.""" prop = getattr(df, key) if callable(prop): @@ -69,7 +71,7 @@ class DefaultMethod(Operator): DEFAULT_OBJECT_TYPE = ObjTypeDeterminer @classmethod - def register(cls, func, obj_type=None, inplace=None, fn_name=None): + def register(cls, func: Union[Callable, str], obj_type: Optional[object] =None, inplace: Optional[boolean]=None, fn_name: Optional[str]=None) -> Callable: """ Build function that do fallback to default pandas implementation for passed `func`. 
@@ -106,7 +108,7 @@ def register(cls, func, obj_type=None, inplace=None, fn_name=None): if type(fn) == property: fn = cls.build_property_wrapper(fn) - def applyier(df, *args, **kwargs): + def applyier(df: pandas.DataFrame, *args: Any, **kwargs: Any) -> Any: """ Apply target function to the casted to pandas frame. @@ -159,7 +161,7 @@ def applyier(df, *args, **kwargs): @classmethod # FIXME: this method is almost a duplicate of `cls.build_default_to_pandas`. # Those two methods should be merged into a single one. - def build_wrapper(cls, fn, fn_name): + def build_wrapper(cls, fn: Callable, fn_name: str) -> Callable: """ Build function that do fallback to pandas for passed `fn`. @@ -180,7 +182,7 @@ def build_wrapper(cls, fn, fn_name): """ wrapper = cls.build_default_to_pandas(fn, fn_name) - def args_cast(self, *args, **kwargs): + def args_cast(self, *args: Any, **kwargs: Any) -> Any: """ Preprocess `default_to_pandas` function arguments and apply default function. @@ -193,7 +195,7 @@ def args_cast(self, *args, **kwargs): return args_cast @classmethod - def build_property_wrapper(cls, prop): + def build_property_wrapper(cls, prop: property) -> Callable: """ Build function that accesses specified property of the frame. @@ -208,14 +210,14 @@ def build_property_wrapper(cls, prop): Function that takes DataFrame and returns its value of `prop` property. """ - def property_wrapper(df): + def property_wrapper(df: pandas.DataFrame) -> Any: """Get specified property of the passed object.""" return prop.fget(df) return property_wrapper @classmethod - def build_default_to_pandas(cls, fn, fn_name): + def build_default_to_pandas(cls, fn: Callable, fn_name: str) -> Callable: """ Build function that do fallback to pandas for passed `fn`. @@ -233,14 +235,14 @@ def build_default_to_pandas(cls, fn, fn_name): """ fn.__name__ = f"<function {cls.OBJECT_TYPE}.{fn_name}>" - def wrapper(self, *args, **kwargs): + def wrapper(self, *args: Any, **kwargs: Any): """Do fallback to pandas for the specified function.""" return self.default_to_pandas(fn, *args, **kwargs) return wrapper @classmethod - def frame_wrapper(cls, df): + def frame_wrapper(cls, df: pandas.DataFrame) -> pandas.DataFrame: """ Extract frame property to apply function on. diff --git a/modin/core/dataframe/algebra/default2pandas/groupby.py b/modin/core/dataframe/algebra/default2pandas/groupby.py index ed3a2bb81ac..93fe3ba3046 100644 --- a/modin/core/dataframe/algebra/default2pandas/groupby.py +++ b/modin/core/dataframe/algebra/default2pandas/groupby.py @@ -13,6 +13,7 @@ """Module houses default GroupBy functions builder class.""" +from typing import Any, Callable, List, Optional, Tuple, Union from .default import DefaultMethod import pandas @@ -34,7 +36,7 @@ class GroupBy: ] @classmethod - def validate_by(cls, by): + def validate_by(cls, by: Union[pandas.DataFrame, pandas.Series, list]) -> (list[pandas.Scalar | pandas.Series] | list[Any | pandas.Series] | Any): """ Build valid `by` parameter for `pandas.DataFrame.groupby`. @@ -52,7 +54,7 @@ By parameter with all DataFrames casted to Series. 
""" - def try_cast_series(df): + def try_cast_series(df: pandas.DataFrame): """Cast one-column frame to Series.""" if isinstance(df, pandas.DataFrame): df = df.squeeze(axis=1) @@ -71,7 +73,7 @@ def try_cast_series(df): return by @classmethod - def inplace_applyier_builder(cls, key, func=None): + def inplace_applyier_builder(cls, key: callable, func: Optional[Union[callable, str]] =None) -> Callable: """ Bind actual aggregation function to the GroupBy aggregation method. @@ -96,7 +98,7 @@ def inplace_applyier(grp, *func_args, **func_kwargs): return inplace_applyier @classmethod - def get_func(cls, key, **kwargs): + def get_func(cls, key: Union[str, callable], **kwargs: Any) -> Callable: """ Extract aggregation function from groupby arguments. @@ -129,7 +131,7 @@ def get_func(cls, key, **kwargs): return cls.inplace_applyier_builder(key) @classmethod - def build_aggregate_method(cls, key): + def build_aggregate_method(cls, key: Union[str, callable]) -> Callable: """ Build function for `QueryCompiler.groupby_agg` that can be executed as default-to-pandas. @@ -146,7 +148,7 @@ def build_aggregate_method(cls, key): """ def fn( - df, + df: pandas.DataFrame, by, axis, groupby_kwargs, @@ -167,7 +169,7 @@ def fn( return fn @classmethod - def build_groupby_reduce_method(cls, agg_func): + def build_groupby_reduce_method(cls, agg_func: Union[callable, str]): """ Build function for `QueryCompiler.groupby_*` that can be executed as default-to-pandas. @@ -260,7 +262,7 @@ def is_aggregate(cls, key): # noqa: PR01 return key in cls.agg_aliases @classmethod - def build_groupby(cls, func): + def build_groupby(cls, func: Union[callable, str]) -> Callable: """ Build function that groups DataFrame and applies aggregation function to the every group. @@ -281,15 +283,15 @@ def build_groupby(cls, func): @staticmethod def handle_as_index_for_dataframe( - result, - internal_by_cols, - by_cols_dtypes=None, - by_length=None, - selection=None, - partition_idx=0, - drop=True, - method=None, - inplace=False, + result: pandas.DataFrame, + internal_by_cols: list, + by_cols_dtypes: Optional[list]=None, + by_length: Optional[int]=None, + selection: Optional[Union[str, list]]=None, + partition_idx: int =0, + drop: bool =True, + method: Optional[str]=None, + inplace: bool =False, ): """ Handle `as_index=False` parameter for the passed GroupBy aggregation result. @@ -353,16 +355,16 @@ def handle_as_index_for_dataframe( @staticmethod def handle_as_index( - result_cols, - result_index_names, - internal_by_cols, - by_cols_dtypes=None, - by_length=None, - selection=None, - partition_idx=0, - drop=True, - method=None, - ): + result_cols: pandas.Index, + result_index_names: list, + internal_by_cols: list, + by_cols_dtypes: Optional[list]=None, + by_length: Optional[int]=None, + selection: Optional[Union[str, list]]=None, + partition_idx: int=0, + drop: bool =True, + method: Optional[str] =None, + ) -> Union[bool, bool, list(str), list(int)]: """ Compute hints to process ``as_index=False`` parameter for the GroupBy result. @@ -523,7 +525,7 @@ class GroupByDefault(DefaultMethod): OBJECT_TYPE = "GroupBy" @classmethod - def register(cls, func, **kwargs): + def register(cls, func: Union[Callable, str], **kwargs: Any) -> Callable: """ Build default-to-pandas GroupBy aggregation function. 
@@ -558,7 +560,7 @@ def register(cls, func, **kwargs): } @classmethod - def get_aggregation_method(cls, how): + def get_aggregation_method(cls, how: Any) -> Callable: """ Return `pandas.DataFrameGroupBy` method that implements the passed `how` UDF applying strategy. diff --git a/modin/core/dataframe/algebra/default2pandas/resample.py b/modin/core/dataframe/algebra/default2pandas/resample.py index f3dca22fe55..8350945a67d 100644 --- a/modin/core/dataframe/algebra/default2pandas/resample.py +++ b/modin/core/dataframe/algebra/default2pandas/resample.py @@ -13,6 +13,7 @@ """Module houses default Resamle functions builder class.""" +from typing import Any, Callable from .default import DefaultMethod @@ -22,7 +23,7 @@ class Resampler: """Builder class for resampled aggregation functions.""" @classmethod - def build_resample(cls, func, squeeze_self): + def build_resample(cls, func: Callable, squeeze_self: bool) -> Callable: """ Build function that resamples time-series data and does aggregation. @@ -40,7 +41,7 @@ def build_resample(cls, func, squeeze_self): to resampled time-series data. """ - def fn(df, resample_kwargs, *args, **kwargs): + def fn(df: Any, resample_kwargs: Any, *args: Any, **kwargs: Any)-> Any: """Resample time-series data of the passed frame and apply specified aggregation.""" if squeeze_self: df = df.squeeze(axis=1) @@ -60,7 +61,7 @@ class ResampleDefault(DefaultMethod): OBJECT_TYPE = "Resampler" @classmethod - def register(cls, func, squeeze_self=False, **kwargs): + def register(cls, func: Callable, squeeze_self: bool =False, **kwargs: Any) -> Callable: """ Build function that do fallback to pandas and aggregate resampled data. diff --git a/modin/core/dataframe/algebra/default2pandas/rolling.py b/modin/core/dataframe/algebra/default2pandas/rolling.py index b5c07821def..b2796cc3a8e 100644 --- a/modin/core/dataframe/algebra/default2pandas/rolling.py +++ b/modin/core/dataframe/algebra/default2pandas/rolling.py @@ -13,6 +13,7 @@ """Module houses default Rolling functions builder class.""" +from typing import Any, Callable from .default import DefaultMethod @@ -22,7 +23,7 @@ class Rolling: """Builder for aggregation on a rolling window functions.""" @classmethod - def build_rolling(cls, func): + def build_rolling (cls , func: Callable) -> Callable: """ Build function that creates a rolling window and executes `func` on it. @@ -37,7 +38,7 @@ def build_rolling(cls, func): Function that takes pandas DataFrame and applies `func` on a rolling window. """ - def fn(df, rolling_args, *args, **kwargs): + def fn(df: Any, rolling_args: Any, *args: Any, **kwargs: Any) -> Any: """Create rolling window for the passed frame and execute specified `func` on it.""" roller = df.rolling(*rolling_args) @@ -55,7 +56,7 @@ class RollingDefault(DefaultMethod): OBJECT_TYPE = "Rolling" @classmethod - def register(cls, func, **kwargs): + def register(cls, func: Callable, **kwargs: Any) -> Callable: """ Build function that do fallback to pandas to apply `func` on a rolling window. 
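All of the default-to-pandas builders above follow the same contract that the added `-> Callable` annotations record: a classmethod receives a callable and returns a new callable. A minimal standalone sketch of that contract for the rolling case, runnable against plain pandas (`build_rolling` here is a simplified stand-in written for illustration, not the Modin class itself):

from typing import Any, Callable

import pandas

def build_rolling(func: Callable) -> Callable:
    """Return a callable that builds a rolling window and applies `func` to it."""

    def fn(df: pandas.DataFrame, rolling_args: tuple, *args: Any, **kwargs: Any) -> Any:
        roller = df.rolling(*rolling_args)  # create the window
        return func(roller, *args, **kwargs)  # aggregate over it

    return fn

rolling_mean = build_rolling(lambda roller: roller.mean())
print(rolling_mean(pandas.DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}), (2,)))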
diff --git a/modin/core/dataframe/algebra/default2pandas/series.py b/modin/core/dataframe/algebra/default2pandas/series.py index a3d3e84c58f..6a744304134 100644 --- a/modin/core/dataframe/algebra/default2pandas/series.py +++ b/modin/core/dataframe/algebra/default2pandas/series.py @@ -14,7 +14,7 @@ """Module houses default Series functions builder class.""" from .default import DefaultMethod - +import pandas class SeriesDefault(DefaultMethod): """Builder for default-to-pandas methods which is executed under Series.""" @@ -22,7 +22,7 @@ class SeriesDefault(DefaultMethod): OBJECT_TYPE = "Series" @classmethod - def frame_wrapper(cls, df): + def frame_wrapper(cls, df: pandas.DataFrame) -> pandas.Series: """ Squeeze passed DataFrame to be able to process Series-specific functions on it. diff --git a/modin/core/dataframe/algebra/default2pandas/str.py b/modin/core/dataframe/algebra/default2pandas/str.py index 002b7744fdd..01fa31a8fdc 100644 --- a/modin/core/dataframe/algebra/default2pandas/str.py +++ b/modin/core/dataframe/algebra/default2pandas/str.py @@ -14,13 +14,13 @@ """Module houses default applied-on-str functions builder class.""" from .series import SeriesDefault - +import pandas class StrDefault(SeriesDefault): """Builder for default-to-pandas methods which is executed under `str` accessor.""" @classmethod - def frame_wrapper(cls, df): + def frame_wrapper(cls, df : pandas.DataFrame) -> pandas.core.strings.accessor.StringMethods: """ Get `str` accessor of the passed frame. diff --git a/modin/core/dataframe/algebra/fold.py b/modin/core/dataframe/algebra/fold.py index 419a0b56903..5fdcb2cae54 100644 --- a/modin/core/dataframe/algebra/fold.py +++ b/modin/core/dataframe/algebra/fold.py @@ -13,6 +13,7 @@ """Module houses builder class for Fold operator.""" +from typing import Any, Callable, Iterable from .operator import Operator @@ -20,7 +21,7 @@ class Fold(Operator): """Builder class for Fold functions.""" @classmethod - def register(cls, fold_function): + def register(cls, fold_function: Callable) -> Callable: """ Build Fold operator that will be performed across rows/columns. @@ -35,7 +36,7 @@ def register(cls, fold_function): Function that takes query compiler and executes Fold function. """ - def caller(query_compiler, fold_axis=None, *args, **kwargs): + def caller(query_compiler: Any, fold_axis: int | None = None, *args: Iterable, **kwargs: Any) -> Any: """ Execute Fold function against passed query compiler. diff --git a/modin/core/dataframe/algebra/groupby.py b/modin/core/dataframe/algebra/groupby.py index 0a25c2f2a91..914e6df72bd 100644 --- a/modin/core/dataframe/algebra/groupby.py +++ b/modin/core/dataframe/algebra/groupby.py @@ -14,6 +14,7 @@ """Module houses builder class for GroupByReduce operator.""" from collections.abc import Container +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pandas from .tree_reduce import TreeReduce @@ -26,7 +27,7 @@ class GroupByReduce(TreeReduce): """Builder class for GroupBy aggregation functions.""" @classmethod - def register(cls, map_func, reduce_func=None, **call_kwds): + def register(cls, map_func: Union[str, dict, Callable], reduce_func: Union[str, dict, Callable]=None, **call_kwds: Any) -> Callable: """ Build template GroupBy aggregation function. 
@@ -72,15 +73,15 @@ def build_fn(name): @classmethod def map( cls, - df, - map_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - other=None, - by=None, - ): + df: pandas.DataFrame, + map_func: Union[dict, Callable], + axis: int, + groupby_kwargs: Dict, + agg_args: List, + agg_kwargs : dict, + other: Optional[pandas.DataFrame] =None, + by: Optional[Union[list, str]]=None, + ) -> pandas.DataFrame: """ Execute Map phase of GroupByReduce. @@ -146,15 +147,15 @@ def map( @classmethod def reduce( cls, - df, - reduce_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - partition_idx=0, - drop=False, - method=None, + df: pandas.DataFrame, + reduce_func: Callable, + axis: int, + groupby_kwargs: dict, + agg_args: list, + agg_kwargs: dict, + partition_idx: int =0, + drop: bool =False, + method: Optional[str]=None, ): """ Execute Reduce phase of GroupByReduce. @@ -237,18 +238,18 @@ def reduce( @classmethod def caller( cls, - query_compiler, - by, - map_func, - reduce_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - method=None, - default_to_pandas_func=None, - ): + query_compiler: Any, + by: Any, + map_func: Union[dict, Callable], + reduce_func: Union[dict, Callable], + axis: int, + groupby_kwargs: dict, + agg_args: list, + agg_kwargs: dict, + drop: bool=False, + method: Optional[str]=None, + default_to_pandas_func: Callable=None, + ) -> Any: """ Execute GroupBy aggregation with TreeReduce approach. @@ -355,7 +356,7 @@ def caller( return result @staticmethod - def try_filter_dict(agg_func, df): + def try_filter_dict(agg_func: Union[dict, Callable], df: pandas.DataFrame) -> Callable: """ Build aggregation function to apply to each group at this particular partition. @@ -382,16 +383,16 @@ def try_filter_dict(agg_func, df): @classmethod def build_map_reduce_functions( cls, - by, - axis, - groupby_kwargs, - map_func, - reduce_func, - agg_args, - agg_kwargs, - drop=False, - method=None, - ): + by: Any, + axis: int, + groupby_kwargs: dict, + map_func: Union[dict, Callable], + reduce_func: Union[dict, Callable], + agg_args:list, + agg_kwargs: dict, + drop: bool=False, + method: Optional[str]=None, + ) -> Tuple[Callable, Callable]: """ Bind appropriate arguments to map and reduce functions. @@ -428,8 +429,8 @@ def build_map_reduce_functions( if hasattr(by, "_modin_frame"): by = None - def _map(df, other=None, **kwargs): - def wrapper(df, other=None): + def _map(df: pandas.DataFrame, other=None, **kwargs) -> pandas.DataFrame: + def wrapper(df: pandas.DataFrame , other=None) -> pandas.DataFrame: return cls.map( df, other=other, @@ -450,8 +451,8 @@ def wrapper(df, other=None): result = wrapper(df.copy(), other if other is None else other.copy()) return result - def _reduce(df, **call_kwargs): - def wrapper(df): + def _reduce(df: pandas.DataFrame, **call_kwargs: Any) -> pandas.DataFrame: + def wrapper(df: pandas.DataFrame) -> pandas.DataFrame: return cls.reduce( df, axis=axis, @@ -488,7 +489,7 @@ def wrapper(df): } -def _is_reduce_function_with_depth(fn, depth: int = 0): +def _is_reduce_function_with_depth(fn: Any, depth: int = 0) -> bool: """ Check whether all functions defined by `fn` are groupby reductions. @@ -526,7 +527,7 @@ def _is_reduce_function_with_depth(fn, depth: int = 0): return isinstance(fn, str) and fn in groupby_reduce_functions -def is_reduce_function(fn): +def is_reduce_function(fn: Any) -> bool: """ Check whether all functions defined by `fn` are groupby reductions. 
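The Map/Reduce split annotated above relies on one invariant: re-grouping the concatenated per-partition results by the same key and applying the reduce function reproduces the global aggregation. A plain-pandas illustration with two hand-made partitions (the count-then-sum pairing is one of the reducible aggregations; the partitioning is faked here for brevity):

import pandas

df = pandas.DataFrame({"by": list("abab"), "x": [1, 2, 3, 4]})
parts = [df.iloc[:2], df.iloc[2:]]  # stand-ins for Modin's row partitions

mapped = [part.groupby("by").count() for part in parts]  # Map: count per partition
reduced = pandas.concat(mapped).groupby(level=0).sum()   # Reduce: sum the partial counts

assert reduced.equals(df.groupby("by").count())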
diff --git a/modin/core/dataframe/algebra/map.py b/modin/core/dataframe/algebra/map.py index f5922617b03..5521989b100 100644 --- a/modin/core/dataframe/algebra/map.py +++ b/modin/core/dataframe/algebra/map.py @@ -13,6 +13,7 @@ """Module houses builder class for Map operator.""" +from typing import Any, Callable from .operator import Operator @@ -20,7 +21,7 @@ class Map(Operator): """Builder class for Map operator.""" @classmethod - def register(cls, function, *call_args, **call_kwds): + def register(cls, function: Callable, *call_args: Any, **call_kwds: Any) -> Callable: """ Build Map operator that will be performed across each partition. @@ -41,7 +42,7 @@ def register(cls, function, *call_args, **call_kwds): Function that takes query compiler and executes map function. """ - def caller(query_compiler, *args, **kwargs): + def caller(query_compiler: Any, *args: Any, **kwargs: Any) -> Any: """Execute Map function against passed query compiler.""" return query_compiler.__constructor__( query_compiler._modin_frame.map( diff --git a/modin/core/dataframe/algebra/operator.py b/modin/core/dataframe/algebra/operator.py index cc093e6720b..0eaaf15f117 100644 --- a/modin/core/dataframe/algebra/operator.py +++ b/modin/core/dataframe/algebra/operator.py @@ -13,13 +13,13 @@ """Module contains an interface for operator builder classes.""" -from typing import Optional +from typing import Any, Callable, Optional class Operator(object): """Interface for building operators that can execute in parallel across partitions.""" - def __init__(self): + def __init__(self) -> None: raise ValueError( "Please use {}.register instead of the constructor".format( type(self).__name__ @@ -27,7 +27,7 @@ def __init__(self): ) @classmethod - def register(cls, func, **kwargs): + def register(cls, func: Callable, **kwargs: Any) -> Callable: """ Build operator that applies source function across the entire dataset. diff --git a/modin/core/dataframe/algebra/reduce.py b/modin/core/dataframe/algebra/reduce.py index 0f4fbe3667f..374087f632f 100644 --- a/modin/core/dataframe/algebra/reduce.py +++ b/modin/core/dataframe/algebra/reduce.py @@ -13,6 +13,7 @@ """Module houses builder class for Reduce operator.""" +from typing import Any, Callable, Optional from .operator import Operator @@ -20,7 +21,7 @@ class Reduce(Operator): """Builder class for Reduce operator.""" @classmethod - def register(cls, reduce_function, axis=None): + def register(cls, reduce_function: Callable, axis: Optional[int] =None) -> Callable: """ Build Reduce operator that will be performed across rows/columns. @@ -39,7 +40,7 @@ def register(cls, reduce_function, axis=None): Function that takes query compiler and executes Reduce function. 
""" - def caller(query_compiler, *args, **kwargs): + def caller(query_compiler: Any, *args: Any, **kwargs: Any) -> Any: """Execute Reduce function against passed query compiler.""" _axis = kwargs.get("axis") if axis is None else axis return query_compiler.__constructor__( diff --git a/modin/core/dataframe/algebra/tree_reduce.py b/modin/core/dataframe/algebra/tree_reduce.py index 671faa1ea0a..4135864732e 100644 --- a/modin/core/dataframe/algebra/tree_reduce.py +++ b/modin/core/dataframe/algebra/tree_reduce.py @@ -13,6 +13,7 @@ """Module houses builder class for TreeReduce operator.""" +from typing import Any, Callable, Optional from .operator import Operator @@ -20,7 +21,7 @@ class TreeReduce(Operator): """Builder class for TreeReduce operator.""" @classmethod - def register(cls, map_function, reduce_function=None, axis=None): + def register(cls, map_function: Callable, reduce_function: Optional[Callable] =None, axis: Optional[int] =None) -> Callable: """ Build TreeReduce operator. @@ -42,7 +43,7 @@ def register(cls, map_function, reduce_function=None, axis=None): if reduce_function is None: reduce_function = map_function - def caller(query_compiler, *args, **kwargs): + def caller(query_compiler: Any, *args: Any, **kwargs: Any) -> Any: """Execute TreeReduce function against passed query compiler.""" _axis = kwargs.get("axis") if axis is None else axis return query_compiler.__constructor__( From 561feccdd37f255d2278047be9b71cd86eba221d Mon Sep 17 00:00:00 2001 From: space monkey Date: Mon, 31 Oct 2022 21:38:03 +0530 Subject: [PATCH 2/2] formatted --- asv_bench/benchmarks/benchmarks.py | 7 +- asv_bench/benchmarks/hdk/benchmarks.py | 7 +- asv_bench/benchmarks/hdk/io.py | 7 +- asv_bench/benchmarks/io/parquet.py | 6 +- asv_bench/benchmarks/utils/common.py | 13 +- asv_bench/benchmarks/utils/data_shapes.py | 5 +- modin/_compat/core/py36/base_io.py | 23 +- modin/_compat/pandas_api/classes.py | 8 +- modin/_compat/pandas_api/latest/base.py | 22 +- modin/_compat/pandas_api/latest/window.py | 22 +- modin/_compat/pandas_api/py36/base.py | 13 +- modin/_compat/pandas_api/py36/io.py | 5 +- modin/_compat/pandas_api/py36/series.py | 7 +- modin/_compat/pandas_api/py36/utils.py | 7 +- modin/_compat/pandas_api/py36/window.py | 24 +- modin/conftest.py | 10 +- modin/core/dataframe/algebra/binary.py | 15 +- .../dataframe/algebra/default2pandas/cat.py | 4 +- .../algebra/default2pandas/datetime.py | 5 +- .../algebra/default2pandas/default.py | 2 +- .../algebra/default2pandas/groupby.py | 40 ++-- .../algebra/default2pandas/resample.py | 6 +- .../algebra/default2pandas/rolling.py | 2 +- .../algebra/default2pandas/series.py | 1 + .../dataframe/algebra/default2pandas/str.py | 5 +- modin/core/dataframe/algebra/fold.py | 7 +- modin/core/dataframe/algebra/groupby.py | 40 ++-- modin/core/dataframe/algebra/map.py | 4 +- modin/core/dataframe/algebra/reduce.py | 4 +- modin/core/dataframe/algebra/tree_reduce.py | 7 +- .../dataframe/base/dataframe/dataframe.py | 5 +- .../dataframe/pandas/dataframe/dataframe.py | 39 +--- .../dataframe_protocol/dataframe.py | 5 +- .../dataframe_protocol/from_dataframe.py | 7 +- .../pandas/partitioning/axis_partition.py | 4 +- .../pandas/partitioning/partition_manager.py | 9 +- .../execution/dask/common/engine_wrapper.py | 7 +- .../partitioning/virtual_partition.py | 5 +- modin/core/execution/ray/common/utils.py | 2 +- .../cudf_on_ray/partitioning/partition.py | 14 +- .../pandas_on_ray/partitioning/partition.py | 9 +- .../partitioning/partition_manager.py | 3 +- modin/core/io/__init__.py | 4 +- 
.../column_stores/column_store_dispatcher.py | 7 +- .../io/column_stores/feather_dispatcher.py | 5 +- .../io/column_stores/parquet_dispatcher.py | 16 +- modin/core/io/io.py | 63 +----- modin/core/io/pickle/pickle_dispatcher.py | 7 +- modin/core/io/text/csv_glob_dispatcher.py | 21 +- modin/core/io/text/excel_dispatcher.py | 4 +- modin/core/io/text/json_dispatcher.py | 9 +- modin/core/io/text/text_file_dispatcher.py | 18 +- modin/core/storage_formats/base/doc_utils.py | 5 +- .../storage_formats/base/query_compiler.py | 211 +++--------------- modin/core/storage_formats/cudf/parser.py | 6 +- modin/core/storage_formats/pandas/parsers.py | 24 +- .../storage_formats/pandas/query_compiler.py | 54 +---- .../dataframe/pandas/partitions.py | 6 +- modin/experimental/batch/pipeline.py | 3 +- .../experimental/batch/test/test_pipeline.py | 16 +- modin/experimental/cloud/rpyc_proxy.py | 21 +- modin/experimental/cloud/test/test_cloud.py | 10 +- .../hdk_on_native/base_worker.py | 4 +- .../hdk_on_native/dataframe/dataframe.py | 21 +- .../hdk_on_native/df_algebra.py | 13 +- .../interchange/dataframe_protocol/utils.py | 14 +- .../implementations/hdk_on_native/io/io.py | 23 +- .../hdk_on_native/partitioning/partition.py | 4 +- .../hdk_on_native/test/test_dataframe.py | 34 +-- .../partitioning/axis_partition.py | 15 +- .../storage_formats/hdk/query_compiler.py | 18 +- modin/experimental/pandas/test/test_io_exp.py | 19 +- .../experimental/xgboost/test/test_default.py | 3 +- .../experimental/xgboost/test/test_dmatrix.py | 7 +- .../experimental/xgboost/test/test_xgboost.py | 37 +-- modin/experimental/xgboost/xgboost.py | 4 +- modin/experimental/xgboost/xgboost_ray.py | 33 +-- modin/pandas/base.py | 93 ++------ modin/pandas/dataframe.py | 41 +--- modin/pandas/general.py | 6 +- modin/pandas/groupby.py | 31 +-- modin/pandas/resample.py | 50 +---- modin/pandas/series.py | 55 +---- modin/pandas/test/dataframe/test_binary.py | 2 +- modin/pandas/test/dataframe/test_default.py | 39 +--- modin/pandas/test/dataframe/test_indexing.py | 28 +-- modin/pandas/test/dataframe/test_iter.py | 3 +- modin/pandas/test/dataframe/test_join_sort.py | 48 ++-- modin/pandas/test/dataframe/test_reduce.py | 38 +--- modin/pandas/test/dataframe/test_window.py | 6 +- modin/pandas/test/test_concat.py | 5 +- modin/pandas/test/test_general.py | 41 +--- modin/pandas/test/test_groupby.py | 25 +-- modin/pandas/test/test_io.py | 99 ++------ modin/pandas/test/test_rolling.py | 19 +- modin/pandas/test/test_series.py | 101 +++------ modin/pandas/test/utils.py | 10 +- modin/pandas/utils.py | 6 +- modin/pandas/window.py | 11 +- .../dataframe_protocol/base/test_utils.py | 5 +- .../storage_formats/pandas/test_internals.py | 17 +- modin/test/test_envvar_npartitions.py | 4 +- modin/test/test_partition_api.py | 6 +- modin/test/test_utils.py | 6 +- scripts/doc_checker.py | 3 +- 105 files changed, 490 insertions(+), 1504 deletions(-) diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py index 7191912a14e..62c8f0e49f2 100644 --- a/asv_bench/benchmarks/benchmarks.py +++ b/asv_bench/benchmarks/benchmarks.py @@ -42,12 +42,7 @@ class BaseTimeGroupBy: def setup(self, shape, ngroups=5, groupby_ncols=1): ngroups = translator_groupby_ngroups(ngroups, shape) self.df, self.groupby_columns = generate_dataframe( - "int", - *shape, - RAND_LOW, - RAND_HIGH, - groupby_ncols, - count_groups=ngroups, + "int", *shape, RAND_LOW, RAND_HIGH, groupby_ncols, count_groups=ngroups, ) diff --git a/asv_bench/benchmarks/hdk/benchmarks.py 
b/asv_bench/benchmarks/hdk/benchmarks.py index f98680ff8c9..7252d101f62 100644 --- a/asv_bench/benchmarks/hdk/benchmarks.py +++ b/asv_bench/benchmarks/hdk/benchmarks.py @@ -442,12 +442,7 @@ class BaseTimeGroupBy: def setup(self, shape, ngroups=5, groupby_ncols=1): ngroups = translator_groupby_ngroups(ngroups, shape) self.df, self.groupby_columns = generate_dataframe( - "int", - *shape, - RAND_LOW, - RAND_HIGH, - groupby_ncols, - count_groups=ngroups, + "int", *shape, RAND_LOW, RAND_HIGH, groupby_ncols, count_groups=ngroups, ) # correct while we use 'col*' like name for non-groupby columns # and 'groupby_col*' like name for groupby columns diff --git a/asv_bench/benchmarks/hdk/io.py b/asv_bench/benchmarks/hdk/io.py index dccb55966ad..e169e6640df 100644 --- a/asv_bench/benchmarks/hdk/io.py +++ b/asv_bench/benchmarks/hdk/io.py @@ -54,10 +54,5 @@ def setup(self, cache, shape): self.filename, self.names, self.dtype = cache[file_id] def time_read_csv_names(self, cache, shape): - df = IMPL.read_csv( - self.filename, - names=self.names, - header=0, - dtype=self.dtype, - ) + df = IMPL.read_csv(self.filename, names=self.names, header=0, dtype=self.dtype,) trigger_import(df) diff --git a/asv_bench/benchmarks/io/parquet.py b/asv_bench/benchmarks/io/parquet.py index 5b2ffc9c470..021ae2a253a 100644 --- a/asv_bench/benchmarks/io/parquet.py +++ b/asv_bench/benchmarks/io/parquet.py @@ -44,8 +44,4 @@ def setup(self, test_filenames, shape): self.shape_id = get_shape_id(shape) def time_read_parquet(self, test_filenames, shape): - execute( - IMPL.read_parquet( - test_filenames[self.shape_id], - ) - ) + execute(IMPL.read_parquet(test_filenames[self.shape_id],)) diff --git a/asv_bench/benchmarks/utils/common.py b/asv_bench/benchmarks/utils/common.py index 6d028353b0f..d9423aa979b 100644 --- a/asv_bench/benchmarks/utils/common.py +++ b/asv_bench/benchmarks/utils/common.py @@ -244,11 +244,7 @@ def gen_true_false_int_data(nrows, ncols, rand_low, rand_high): def gen_data( - data_type: str, - nrows: int, - ncols: int, - rand_low: int, - rand_high: int, + data_type: str, nrows: int, ncols: int, rand_low: int, rand_high: int, ) -> dict: """ Generate data with caching. 
@@ -501,12 +497,7 @@ def execute( return # compatibility with old Modin versions - all( - map( - lambda partition: partition.drain_call_queue() or True, - partitions, - ) - ) + all(map(lambda partition: partition.drain_call_queue() or True, partitions,)) if ASV_USE_ENGINE == "ray": from ray import wait diff --git a/asv_bench/benchmarks/utils/data_shapes.py b/asv_bench/benchmarks/utils/data_shapes.py index 21aa935f805..284389e67de 100644 --- a/asv_bench/benchmarks/utils/data_shapes.py +++ b/asv_bench/benchmarks/utils/data_shapes.py @@ -142,10 +142,7 @@ "hdk.TimeReadCsvNames", ], ), - ( - HDK_BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE], - ["hdk.TimeMerge", "hdk.TimeAppend"], - ), + (HDK_BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE], ["hdk.TimeMerge", "hdk.TimeAppend"],), ( HDK_SERIES_DATA_SIZE[ASV_DATASET_SIZE], ["hdk.TimeBinaryOpSeries", "hdk.TimeValueCountsSeries"], diff --git a/modin/_compat/core/py36/base_io.py b/modin/_compat/core/py36/base_io.py index bb153911b61..13d2dc73c54 100644 --- a/modin/_compat/core/py36/base_io.py +++ b/modin/_compat/core/py36/base_io.py @@ -206,16 +206,9 @@ def read_json( returns=_doc_returns_qc, ) def read_feather( - cls, - path, - columns=None, - use_threads=True, + cls, path, columns=None, use_threads=True, ): # noqa: PR01 - return cls._read_feather( - path=path, - columns=columns, - use_threads=use_threads, - ) + return cls._read_feather(path=path, columns=columns, use_threads=use_threads,) @classmethod @_inherit_docstrings(pandas.read_stata, apilink="pandas.read_stata") @@ -258,13 +251,10 @@ def read_stata( returns=_doc_returns_qc, ) def read_pickle( - cls, - filepath_or_buffer, - compression="infer", + cls, filepath_or_buffer, compression="infer", ): # noqa: PR01 return cls._read_pickle( - filepath_or_buffer=filepath_or_buffer, - compression=compression, + filepath_or_buffer=filepath_or_buffer, compression=compression, ) @classmethod @@ -306,8 +296,5 @@ def to_pickle( protocol: int = 4, # older pandas only supports protocol <= 4 ): # noqa: PR01, D200 return cls._to_pickle( - obj, - filepath_or_buffer, - compression=compression, - protocol=protocol, + obj, filepath_or_buffer, compression=compression, protocol=protocol, ) diff --git a/modin/_compat/pandas_api/classes.py b/modin/_compat/pandas_api/classes.py index 63e59d9ccaa..927f7c33782 100644 --- a/modin/_compat/pandas_api/classes.py +++ b/modin/_compat/pandas_api/classes.py @@ -16,9 +16,7 @@ from modin._compat import PandasCompatVersion if PandasCompatVersion.CURRENT == PandasCompatVersion.PY36: - from .py36 import ( - Python36CompatibleBasePandasDataset as BasePandasDatasetCompat, - ) + from .py36 import Python36CompatibleBasePandasDataset as BasePandasDatasetCompat from .py36 import Python36CompatibleDataFrame as DataFrameCompat from .py36 import Python36CompatibleSeries as SeriesCompat from .py36 import Python36CompatibleDataFrameGroupBy as DataFrameGroupByCompat @@ -26,9 +24,7 @@ from .py36 import Python36CompatibleWindow as WindowCompat from .py36 import Python36CompatibleRolling as RollingCompat elif PandasCompatVersion.CURRENT == PandasCompatVersion.LATEST: - from .latest import ( - LatestCompatibleBasePandasDataset as BasePandasDatasetCompat, - ) + from .latest import LatestCompatibleBasePandasDataset as BasePandasDatasetCompat from .latest import LatestCompatibleDataFrame as DataFrameCompat from .latest import LatestCompatibleSeries as SeriesCompat from .latest import LatestCompatibleDataFrameGroupBy as DataFrameGroupByCompat diff --git a/modin/_compat/pandas_api/latest/base.py 
b/modin/_compat/pandas_api/latest/base.py index 81a462f3c34..52b0ecf6683 100644 --- a/modin/_compat/pandas_api/latest/base.py +++ b/modin/_compat/pandas_api/latest/base.py @@ -212,11 +212,7 @@ def rank( ) def reindex( - self, - index=None, - columns=None, - copy=True, - **kwargs, + self, index=None, columns=None, copy=True, **kwargs, ): return self._reindex(index=index, columns=columns, copy=copy, **kwargs) @@ -263,13 +259,7 @@ def sample( ) def sem( - self, - axis=None, - skipna=True, - level=None, - ddof=1, - numeric_only=None, - **kwargs, + self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs, ): return self._sem( axis=axis, @@ -320,13 +310,7 @@ def skew( ) def std( - self, - axis=None, - skipna=True, - level=None, - ddof=1, - numeric_only=None, - **kwargs, + self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs, ): return self._std( axis=axis, diff --git a/modin/_compat/pandas_api/latest/window.py b/modin/_compat/pandas_api/latest/window.py index a1cb3721f9c..af2a2b94ec0 100644 --- a/modin/_compat/pandas_api/latest/window.py +++ b/modin/_compat/pandas_api/latest/window.py @@ -36,16 +36,7 @@ def __init__( ): self._init( dataframe, - [ - window, - min_periods, - center, - win_type, - on, - axis, - closed, - method, - ], + [window, min_periods, center, win_type, on, axis, closed, method,], axis, ) @@ -67,15 +58,6 @@ def __init__( ): self._init( dataframe, - [ - window, - min_periods, - center, - win_type, - on, - axis, - closed, - method, - ], + [window, min_periods, center, win_type, on, axis, closed, method,], axis, ) diff --git a/modin/_compat/pandas_api/py36/base.py b/modin/_compat/pandas_api/py36/base.py index cfe5da81823..0ed6281a80f 100644 --- a/modin/_compat/pandas_api/py36/base.py +++ b/modin/_compat/pandas_api/py36/base.py @@ -149,18 +149,9 @@ def rank( ) def reindex( - self, - index=None, - columns=None, - copy=True, - **kwargs, + self, index=None, columns=None, copy=True, **kwargs, ): - return self._reindex( - index=index, - columns=columns, - copy=copy, - **kwargs, - ) + return self._reindex(index=index, columns=columns, copy=copy, **kwargs,) def rolling( self, diff --git a/modin/_compat/pandas_api/py36/io.py b/modin/_compat/pandas_api/py36/io.py index a99e3ff9360..353ac4d2b2b 100644 --- a/modin/_compat/pandas_api/py36/io.py +++ b/modin/_compat/pandas_api/py36/io.py @@ -155,10 +155,7 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): return DataFrame( query_compiler=FactoryDispatcher.read_parquet( - path=path, - engine=engine, - columns=columns, - **kwargs, + path=path, engine=engine, columns=columns, **kwargs, ) ) diff --git a/modin/_compat/pandas_api/py36/series.py b/modin/_compat/pandas_api/py36/series.py index a212c8d194c..ed5713ab046 100644 --- a/modin/_compat/pandas_api/py36/series.py +++ b/modin/_compat/pandas_api/py36/series.py @@ -38,12 +38,7 @@ def between(self, left, right, inclusive=True): # noqa: PR01, RT01, D200 return self._between(left, right, inclusive=inclusive) def kurt( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs, + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): # noqa: PR01, RT01, D200 return self._kurt( axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs diff --git a/modin/_compat/pandas_api/py36/utils.py b/modin/_compat/pandas_api/py36/utils.py index ec27515e048..98dfb50fcba 100644 --- a/modin/_compat/pandas_api/py36/utils.py +++ b/modin/_compat/pandas_api/py36/utils.py @@ -30,12 +30,7 @@ def 
create_stat_method(name): """ def stat_method( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs, + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): return self._stat_operation(name, axis, skipna, level, numeric_only, **kwargs) diff --git a/modin/_compat/pandas_api/py36/window.py b/modin/_compat/pandas_api/py36/window.py index 9ff64702b25..cef5196cdb9 100644 --- a/modin/_compat/pandas_api/py36/window.py +++ b/modin/_compat/pandas_api/py36/window.py @@ -34,17 +34,7 @@ def __init__( closed=None, ): self._init( - dataframe, - [ - window, - min_periods, - center, - win_type, - on, - axis, - closed, - ], - axis, + dataframe, [window, min_periods, center, win_type, on, axis, closed,], axis, ) @@ -63,15 +53,5 @@ def __init__( closed=None, ): self._init( - dataframe, - [ - window, - min_periods, - center, - win_type, - on, - axis, - closed, - ], - axis, + dataframe, [window, min_periods, center, win_type, on, axis, closed,], axis, ) diff --git a/modin/conftest.py b/modin/conftest.py index d875eb95b9f..1fe40b3f00c 100644 --- a/modin/conftest.py +++ b/modin/conftest.py @@ -372,9 +372,7 @@ def TestReadCSVFixture(): # each xdist worker spawned in separate process with separate namespace and dataset pytest.csvs_names = {file_id: get_unique_filename() for file_id in files_ids} # test_read_csv_col_handling, test_read_csv_parsing - _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_regular"], - ) + _make_csv_file(filenames)(filename=pytest.csvs_names["test_read_csv_regular"],) # test_read_csv_parsing _make_csv_file(filenames)( filename=pytest.csvs_names["test_read_csv_yes_no"], @@ -382,8 +380,7 @@ def TestReadCSVFixture(): ) # test_read_csv_col_handling _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_blank_lines"], - add_blank_lines=True, + filename=pytest.csvs_names["test_read_csv_blank_lines"], add_blank_lines=True, ) # test_read_csv_nans_handling _make_csv_file(filenames)( @@ -393,8 +390,7 @@ def TestReadCSVFixture(): ) # test_read_csv_error_handling _make_csv_file(filenames)( - filename=pytest.csvs_names["test_read_csv_bad_lines"], - add_bad_lines=True, + filename=pytest.csvs_names["test_read_csv_bad_lines"], add_bad_lines=True, ) yield diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index e6d6e4662d3..835103532f9 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -24,7 +24,9 @@ class Binary(Operator): """Builder class for Binary operator.""" @classmethod - def register(cls, func: Callable, join_type : str ="outer", labels : str ="replace") -> Callable: + def register( + cls, func: Callable, join_type: str = "outer", labels: str = "replace" + ) -> Callable: """ Build template binary operator. @@ -45,7 +47,13 @@ def register(cls, func: Callable, join_type : str ="outer", labels : str ="repla """ def caller( - query_compiler: Any, other: Any, broadcast: bool = False, *args: Any, dtypes: Any | None = None, **kwargs: Any) -> Any : + query_compiler: Any, + other: Any, + broadcast: bool = False, + *args: Any, + dtypes: Any | None = None, + **kwargs: Any + ) -> Any: """ Apply binary `func` to passed operands. 
@@ -112,8 +120,7 @@ def caller( ) else: new_modin_frame = query_compiler._modin_frame.map( - lambda df: func(df, other, *args, **kwargs), - dtypes=dtypes, + lambda df: func(df, other, *args, **kwargs), dtypes=dtypes, ) return query_compiler.__constructor__(new_modin_frame) diff --git a/modin/core/dataframe/algebra/default2pandas/cat.py b/modin/core/dataframe/algebra/default2pandas/cat.py index 22163586949..1ed5c898df8 100644 --- a/modin/core/dataframe/algebra/default2pandas/cat.py +++ b/modin/core/dataframe/algebra/default2pandas/cat.py @@ -21,7 +21,9 @@ class CatDefault(SeriesDefault): """Builder for default-to-pandas methods which is executed under category accessor.""" @classmethod - def frame_wrapper(cls, df: pandas.DataFrame) -> pandas.core.arrays.categorical.CategoricalAccessor: + def frame_wrapper( + cls, df: pandas.DataFrame + ) -> pandas.core.arrays.categorical.CategoricalAccessor: """ Get category accessor of the passed frame. diff --git a/modin/core/dataframe/algebra/default2pandas/datetime.py b/modin/core/dataframe/algebra/default2pandas/datetime.py index 55d60a49fd0..3e54a24dc7d 100644 --- a/modin/core/dataframe/algebra/default2pandas/datetime.py +++ b/modin/core/dataframe/algebra/default2pandas/datetime.py @@ -16,11 +16,14 @@ from .series import SeriesDefault import pandas + class DateTimeDefault(SeriesDefault): """Builder for default-to-pandas methods which is executed under datetime accessor.""" @classmethod - def frame_wrapper(cls, df: pandas.DataFrame) -> pandas.core.indexes.accessors.DatetimeProperties: + def frame_wrapper( + cls, df: pandas.DataFrame + ) -> pandas.core.indexes.accessors.DatetimeProperties: """ Get datetime accessor of the passed frame. diff --git a/modin/core/dataframe/algebra/default2pandas/default.py b/modin/core/dataframe/algebra/default2pandas/default.py index 4db86845f0c..d254cc83bdb 100644 --- a/modin/core/dataframe/algebra/default2pandas/default.py +++ b/modin/core/dataframe/algebra/default2pandas/default.py @@ -235,7 +235,7 @@ def build_default_to_pandas(cls, fn: Callable, fn_name: str) -> Callable: """ fn.__name__ = f"<function {cls.OBJECT_TYPE}.{fn_name}>" - def wrapper(self, *args: Any, **kwargs: Any): + def wrapper(self, *args: Any, **kwargs: Any) -> Any: """Do fallback to pandas for the specified function.""" return self.default_to_pandas(fn, *args, **kwargs) diff --git a/modin/core/dataframe/algebra/default2pandas/groupby.py b/modin/core/dataframe/algebra/default2pandas/groupby.py index 93fe3ba3046..c154ab502f2 100644 --- a/modin/core/dataframe/algebra/default2pandas/groupby.py +++ b/modin/core/dataframe/algebra/default2pandas/groupby.py @@ -36,7 +36,9 @@ class GroupBy: ] @classmethod - def validate_by(cls, by: Union[pandas.DataFrame, pandas.Series, list]) -> (list[pandas.Scalar | pandas.Series] | list[Any | pandas.Series] | Any): + def validate_by( + cls, by: Union[pandas.DataFrame, pandas.Series, list] + ) -> Any: """ Build valid `by` parameter for `pandas.DataFrame.groupby`. @@ -73,7 +75,9 @@ def try_cast_series(df: pandas.DataFrame): return by @classmethod - def inplace_applyier_builder(cls, key: Union[str, Callable], func: Optional[Union[Callable, str]] = None) -> Callable: + def inplace_applyier_builder( + cls, key: Union[str, Callable], func: Optional[Union[Callable, str]] = None + ) -> Callable: """ Bind actual aggregation function to the GroupBy aggregation method. 
@@ -193,11 +197,7 @@ def fn( by = cls.validate_by(by) grp = df.groupby(by, axis=axis, **groupby_kwargs) grp_agg_func = cls.get_func(agg_func, **kwargs) - return grp_agg_func( - grp, - *agg_args, - **agg_kwargs, - ) + return grp_agg_func(grp, *agg_args, **agg_kwargs,) if isinstance(by, pandas.DataFrame): by = by.squeeze(axis=1) @@ -285,13 +285,13 @@ def build_groupby(cls, func: Union[Callable, str]) -> Callable: def handle_as_index_for_dataframe( result: pandas.DataFrame, internal_by_cols: list, - by_cols_dtypes: Optional[list]=None, - by_length: Optional[int]=None, - selection: Optional[Union[str, list]]=None, - partition_idx: int =0, - drop: bool =True, - method: Optional[str]=None, - inplace: bool =False, + by_cols_dtypes: Optional[list] = None, + by_length: Optional[int] = None, + selection: Optional[Union[str, list]] = None, + partition_idx: int = 0, + drop: bool = True, + method: Optional[str] = None, + inplace: bool = False, ): """ Handle `as_index=False` parameter for the passed GroupBy aggregation result. @@ -358,12 +358,12 @@ def handle_as_index( result_cols: pandas.Index, result_index_names: list, internal_by_cols: list, - by_cols_dtypes: Optional[list]=None, - by_length: Optional[int]=None, - selection: Optional[Union[str, list]]=None, - partition_idx: int=0, - drop: bool =True, - method: Optional[str] =None, + by_cols_dtypes: Optional[list] = None, + by_length: Optional[int] = None, + selection: Optional[Union[str, list]] = None, + partition_idx: int = 0, + drop: bool = True, + method: Optional[str] = None, ) -> Tuple[bool, bool, List[str], List[int]]: """ Compute hints to process ``as_index=False`` parameter for the GroupBy result. diff --git a/modin/core/dataframe/algebra/default2pandas/resample.py b/modin/core/dataframe/algebra/default2pandas/resample.py index 8350945a67d..21f1a6b9a52 100644 --- a/modin/core/dataframe/algebra/default2pandas/resample.py +++ b/modin/core/dataframe/algebra/default2pandas/resample.py @@ -41,7 +41,7 @@ def build_resample(cls, func: Callable, squeeze_self: bool) -> Callable: to resampled time-series data. """ - def fn(df: Any, resample_kwargs: Any, *args: Any, **kwargs: Any)-> Any: + def fn(df: Any, resample_kwargs: Any, *args: Any, **kwargs: Any) -> Any: """Resample time-series data of the passed frame and apply specified aggregation.""" if squeeze_self: df = df.squeeze(axis=1) @@ -61,7 +61,9 @@ class ResampleDefault(DefaultMethod): OBJECT_TYPE = "Resampler" @classmethod - def register(cls, func: Callable, squeeze_self: bool =False, **kwargs: Any) -> Callable: + def register( + cls, func: Callable, squeeze_self: bool = False, **kwargs: Any + ) -> Callable: """ Build function that do fallback to pandas and aggregate resampled data. diff --git a/modin/core/dataframe/algebra/default2pandas/rolling.py b/modin/core/dataframe/algebra/default2pandas/rolling.py index b2796cc3a8e..8c69721add5 100644 --- a/modin/core/dataframe/algebra/default2pandas/rolling.py +++ b/modin/core/dataframe/algebra/default2pandas/rolling.py @@ -23,7 +23,7 @@ class Rolling: """Builder for aggregation on a rolling window functions.""" @classmethod - def build_rolling (cls , func: Callable) -> Callable: + def build_rolling(cls, func: Callable) -> Callable: """ Build function that creates a rolling window and executes `func` on it. 
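A quick interpreter check of why the annotations above must use `typing.Callable`, `Tuple[...]` and `List[...]` rather than the `callable` builtin or call expressions like `list(str)`: annotations on a `def` line are evaluated when the module is imported, and the rejected spellings raise immediately:

from typing import Callable, List, Tuple, Union

Union[Callable, str]                     # valid type expression
Tuple[bool, bool, List[str], List[int]]  # valid type expression

for make_bad in (lambda: Union[callable, str], lambda: list(str), lambda: Tuple(Callable)):
    try:
        make_bad()
    except TypeError as err:
        print("rejected:", err)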
diff --git a/modin/core/dataframe/algebra/default2pandas/series.py b/modin/core/dataframe/algebra/default2pandas/series.py index 6a744304134..601cd4b6a61 100644 --- a/modin/core/dataframe/algebra/default2pandas/series.py +++ b/modin/core/dataframe/algebra/default2pandas/series.py @@ -16,6 +16,7 @@ from .default import DefaultMethod import pandas + class SeriesDefault(DefaultMethod): """Builder for default-to-pandas methods which is executed under Series.""" diff --git a/modin/core/dataframe/algebra/default2pandas/str.py b/modin/core/dataframe/algebra/default2pandas/str.py index 01fa31a8fdc..cfba6a62162 100644 --- a/modin/core/dataframe/algebra/default2pandas/str.py +++ b/modin/core/dataframe/algebra/default2pandas/str.py @@ -16,11 +16,14 @@ from .series import SeriesDefault import pandas + class StrDefault(SeriesDefault): """Builder for default-to-pandas methods which is executed under `str` accessor.""" @classmethod - def frame_wrapper(cls, df : pandas.DataFrame) -> pandas.core.strings.accessor.StringMethods: + def frame_wrapper( + cls, df: pandas.DataFrame + ) -> pandas.core.strings.accessor.StringMethods: """ Get `str` accessor of the passed frame. diff --git a/modin/core/dataframe/algebra/fold.py b/modin/core/dataframe/algebra/fold.py index 5fdcb2cae54..122a489d93c 100644 --- a/modin/core/dataframe/algebra/fold.py +++ b/modin/core/dataframe/algebra/fold.py @@ -36,7 +36,12 @@ def register(cls, fold_function: Callable) -> Callable: Function that takes query compiler and executes Fold function. """ - def caller(query_compiler: Any, fold_axis: int | None = None, *args: Iterable, **kwargs: Any) -> Any: + def caller( + query_compiler: Any, + fold_axis: int | None = None, + *args: Iterable, + **kwargs: Any + ) -> Any: """ Execute Fold function against passed query compiler. diff --git a/modin/core/dataframe/algebra/groupby.py b/modin/core/dataframe/algebra/groupby.py index 914e6df72bd..77a0f3850fb 100644 --- a/modin/core/dataframe/algebra/groupby.py +++ b/modin/core/dataframe/algebra/groupby.py @@ -27,7 +27,12 @@ class GroupByReduce(TreeReduce): """Builder class for GroupBy aggregation functions.""" @classmethod - def register(cls, map_func: Union[str, dict, Callable], reduce_func: Union[str, dict, Callable]=None, **call_kwds: Any) -> Callable: + def register( + cls, + map_func: Union[str, dict, Callable], + reduce_func: Union[str, dict, Callable] = None, + **call_kwds: Any, + ) -> Callable: """ Build template GroupBy aggregation function. @@ -78,9 +83,9 @@ def map( axis: int, groupby_kwargs: Dict, agg_args: List, - agg_kwargs : dict, - other: Optional[pandas.DataFrame] =None, - by: Optional[Union[list, str]]=None, + agg_kwargs: dict, + other: Optional[pandas.DataFrame] = None, + by: Optional[Union[list, str]] = None, ) -> pandas.DataFrame: """ Execute Map phase of GroupByReduce. @@ -130,8 +135,7 @@ def map( other = other.squeeze(axis=axis ^ 1) if isinstance(other, pandas.DataFrame): df = pandas.concat( - [df] + [other[[o for o in other if o not in df]]], - axis=1, + [df] + [other[[o for o in other if o not in df]]], axis=1, ) other = list(other.columns) by_part = other @@ -153,9 +157,9 @@ def reduce( groupby_kwargs: dict, agg_args: list, agg_kwargs: dict, - partition_idx: int =0, - drop: bool =False, - method: Optional[str]=None, + partition_idx: int = 0, + drop: bool = False, + method: Optional[str] = None, ): """ Execute Reduce phase of GroupByReduce. 
@@ -246,9 +250,9 @@ def caller( groupby_kwargs: dict, agg_args: list, agg_kwargs: dict, - drop: bool=False, - method: Optional[str]=None, - default_to_pandas_func: Callable=None, + drop: bool = False, + method: Optional[str] = None, + default_to_pandas_func: Optional[Callable] = None, ) -> Any: """ Execute GroupBy aggregation with TreeReduce approach. @@ -356,7 +360,7 @@ def caller( return result @staticmethod - def try_filter_dict(agg_func: Union[dict, Callable], df: pandas.DataFrame) -> Callable: + def try_filter_dict( + agg_func: Union[dict, Callable], df: pandas.DataFrame + ) -> Callable: """ Build aggregation function to apply to each group at this particular partition. @@ -388,10 +394,10 @@ def build_map_reduce_functions( groupby_kwargs: dict, map_func: Union[dict, Callable], reduce_func: Union[dict, Callable], - agg_args:list, + agg_args: list, agg_kwargs: dict, - drop: bool=False, - method: Optional[str]=None, + drop: bool = False, + method: Optional[str] = None, ) -> Tuple[Callable, Callable]: """ Bind appropriate arguments to map and reduce functions. @@ -430,7 +436,7 @@ def build_map_reduce_functions( by = None def _map(df: pandas.DataFrame, other=None, **kwargs) -> pandas.DataFrame: - def wrapper(df: pandas.DataFrame , other=None) -> pandas.DataFrame: + def wrapper(df: pandas.DataFrame, other=None) -> pandas.DataFrame: return cls.map( df, other=other, diff --git a/modin/core/dataframe/algebra/map.py b/modin/core/dataframe/algebra/map.py index 5521989b100..3a45156bbf2 100644 --- a/modin/core/dataframe/algebra/map.py +++ b/modin/core/dataframe/algebra/map.py @@ -21,7 +21,9 @@ class Map(Operator): """Builder class for Map operator.""" @classmethod - def register(cls, function: Callable, *call_args: Any, **call_kwds: Any) -> Callable: + def register( + cls, function: Callable, *call_args: Any, **call_kwds: Any + ) -> Callable: """ Build Map operator that will be performed across each partition. diff --git a/modin/core/dataframe/algebra/reduce.py b/modin/core/dataframe/algebra/reduce.py index 374087f632f..2089192a00d 100644 --- a/modin/core/dataframe/algebra/reduce.py +++ b/modin/core/dataframe/algebra/reduce.py @@ -21,7 +21,9 @@ class Reduce(Operator): """Builder class for Reduce operator.""" @classmethod - def register(cls, reduce_function: Callable, axis: Optional[int] =None) -> Callable: + def register( + cls, reduce_function: Callable, axis: Optional[int] = None + ) -> Callable: """ Build Reduce operator that will be performed across rows/columns. diff --git a/modin/core/dataframe/algebra/tree_reduce.py b/modin/core/dataframe/algebra/tree_reduce.py index 4135864732e..94cb1aaeebf 100644 --- a/modin/core/dataframe/algebra/tree_reduce.py +++ b/modin/core/dataframe/algebra/tree_reduce.py @@ -21,7 +21,12 @@ class TreeReduce(Operator): """Builder class for TreeReduce operator.""" @classmethod - def register(cls, map_function: Callable, reduce_function: Optional[Callable] =None, axis: Optional[int] =None) -> Callable: + def register( + cls, + map_function: Callable, + reduce_function: Optional[Callable] = None, + axis: Optional[int] = None, + ) -> Callable: """ Build TreeReduce operator. 
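The `reduce_function: Optional[Callable] = None` default in the TreeReduce builder above encodes the algebraic case where the reduce step reuses the map function (a max of partition maxes is still the max). A self-contained sketch of that pattern, with the partitioning again faked through `iloc` slices (illustration only, not the Modin implementation):

from typing import Callable, Optional

import pandas

def tree_reduce(map_function: Callable, reduce_function: Optional[Callable] = None) -> Callable:
    if reduce_function is None:
        reduce_function = map_function  # e.g. max-of-maxes

    def caller(df: pandas.DataFrame) -> pandas.Series:
        parts = [df.iloc[:2], df.iloc[2:]]                # pretend partitions
        mapped = [map_function(part) for part in parts]   # local map
        return reduce_function(pandas.DataFrame(mapped))  # final reduce

    return caller

col_max = tree_reduce(lambda d: d.max())
print(col_max(pandas.DataFrame({"a": [1, 5, 3, 2]})))  # a    5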
diff --git a/modin/core/dataframe/base/dataframe/dataframe.py b/modin/core/dataframe/base/dataframe/dataframe.py index 536d43ceaea..6595a852dbd 100644 --- a/modin/core/dataframe/base/dataframe/dataframe.py +++ b/modin/core/dataframe/base/dataframe/dataframe.py @@ -255,10 +255,7 @@ def groupby( @abstractmethod def reduce( - self, - axis: Union[int, Axis], - function: Callable, - dtypes: Optional[str] = None, + self, axis: Union[int, Axis], function: Callable, dtypes: Optional[str] = None, ) -> "ModinDataframe": """ Perform a user-defined aggregation on the specified axis, where the axis reduces down to a singleton. diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 1b3dcda21a3..ca4536e842d 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1019,11 +1019,7 @@ def from_labels_executor(df, **kwargs): return result new_parts = self._partition_mgr_cls.apply_func_to_select_indices( - 0, - self._partitions, - from_labels_executor, - [0], - keep_remaining=True, + 0, self._partitions, from_labels_executor, [0], keep_remaining=True, ) new_column_widths = [ self.index.nlevels + self.column_widths[0] @@ -1574,19 +1570,13 @@ def _compute_tree_reduce_metadata(self, axis, new_parts): new_dtypes = None result = self.__constructor__( - new_parts, - *new_axes, - *new_axes_lengths, - new_dtypes, + new_parts, *new_axes, *new_axes_lengths, new_dtypes, ) return result @lazy_metadata_decorator(apply_axis="both") def reduce( - self, - axis: Union[int, Axis], - function: Callable, - dtypes: Optional[str] = None, + self, axis: Union[int, Axis], function: Callable, dtypes: Optional[str] = None, ) -> "PandasDataframe": """ Perform a user-defined aggregation on the specified axis, where the axis reduces down to a singleton. Requires knowledge of the full axis for the reduction. @@ -2037,12 +2027,7 @@ def explode(self, axis: Union[int, Axis], func: Callable) -> "PandasDataframe": @lazy_metadata_decorator(apply_axis="both") def apply_full_axis( - self, - axis, - func, - new_index=None, - new_columns=None, - dtypes=None, + self, axis, func, new_index=None, new_columns=None, dtypes=None, ): """ Perform a function across an entire axis. 
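The `reduce` signature reformatted in the hunks above is easiest to read with a concrete call. A sketch under the stated annotations (`axis: Union[int, Axis]`, `function: Callable`), not code taken from the diff:

    # Collapse each column to a single value; `function` receives a
    # pandas DataFrame spanning the full axis and returns the reduced
    # result, so the output frame has singleton length along `axis`.
    reduced = modin_frame.reduce(0, lambda df: df.sum())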
@@ -2128,14 +2113,8 @@ def apply_full_axis_select_indices( # Get the indices for the axis being applied to (it is the opposite of axis # being applied over) dict_indices = self._get_dict_of_block_index(axis ^ 1, numeric_indices) - new_partitions = ( - self._partition_mgr_cls.apply_func_to_select_indices_along_full_axis( - axis, - self._partitions, - func, - dict_indices, - keep_remaining=keep_remaining, - ) + new_partitions = self._partition_mgr_cls.apply_func_to_select_indices_along_full_axis( + axis, self._partitions, func, dict_indices, keep_remaining=keep_remaining, ) # TODO Infer columns and index from `keep_remaining` and `apply_indices` if new_index is None: @@ -2714,11 +2693,7 @@ def n_ary_op(self, op, right_frames: list, join_type="outer"): ) return self.__constructor__( - new_frame, - joined_index, - joined_columns, - row_lengths, - column_widths, + new_frame, joined_index, joined_columns, row_lengths, column_widths, ) @lazy_metadata_decorator(apply_axis="both") diff --git a/modin/core/dataframe/pandas/interchange/dataframe_protocol/dataframe.py b/modin/core/dataframe/pandas/interchange/dataframe_protocol/dataframe.py index e330e78512b..61fc21ebf95 100644 --- a/modin/core/dataframe/pandas/interchange/dataframe_protocol/dataframe.py +++ b/modin/core/dataframe/pandas/interchange/dataframe_protocol/dataframe.py @@ -70,10 +70,7 @@ class PandasProtocolDataframe(ProtocolDataframe): """ def __init__( - self, - df: PandasDataframe, - nan_as_null: bool = False, - allow_copy: bool = True, + self, df: PandasDataframe, nan_as_null: bool = False, allow_copy: bool = True, ) -> None: self._df = df self._nan_as_null = nan_as_null diff --git a/modin/core/dataframe/pandas/interchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/interchange/dataframe_protocol/from_dataframe.py index 2b312bd77fd..35dfb300272 100644 --- a/modin/core/dataframe/pandas/interchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/pandas/interchange/dataframe_protocol/from_dataframe.py @@ -125,12 +125,7 @@ def unpack_protocol_column( which keeps memory referenced by the column alive. 
""" dtype = col.dtype[0] - if dtype in ( - DTypeKind.INT, - DTypeKind.UINT, - DTypeKind.FLOAT, - DTypeKind.BOOL, - ): + if dtype in (DTypeKind.INT, DTypeKind.UINT, DTypeKind.FLOAT, DTypeKind.BOOL,): return primitive_column_to_ndarray(col) elif dtype == DTypeKind.CATEGORICAL: return categorical_column_to_series(col) diff --git a/modin/core/dataframe/pandas/partitioning/axis_partition.py b/modin/core/dataframe/pandas/partitioning/axis_partition.py index 55a36b28c15..2177fefacb5 100644 --- a/modin/core/dataframe/pandas/partitioning/axis_partition.py +++ b/modin/core/dataframe/pandas/partitioning/axis_partition.py @@ -250,9 +250,7 @@ def deploy_func_between_two_axis_partitions( # reshaping flattened `rt_parts` array into a frame with shape `other_shape` combined_axis = [ pandas.concat( - rt_parts[other_shape[i - 1] : other_shape[i]], - axis=axis, - copy=False, + rt_parts[other_shape[i - 1] : other_shape[i]], axis=axis, copy=False, ) for i in range(1, len(other_shape)) ] diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index ecd2dcb9042..475b7d06019 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -1395,14 +1395,9 @@ def rebalance_partitions(cls, partitions): new_partitions = np.array( [ cls.column_partitions( - partitions[i : i + chunk_size], - full_axis=False, - ) - for i in range( - 0, - num_existing_partitions, - chunk_size, + partitions[i : i + chunk_size], full_axis=False, ) + for i in range(0, num_existing_partitions, chunk_size,) ] ) return new_partitions, None diff --git a/modin/core/execution/dask/common/engine_wrapper.py b/modin/core/execution/dask/common/engine_wrapper.py index 03e8871b279..72433cf9ca2 100644 --- a/modin/core/execution/dask/common/engine_wrapper.py +++ b/modin/core/execution/dask/common/engine_wrapper.py @@ -21,12 +21,7 @@ class DaskWrapper: @classmethod def deploy( - cls, - func, - f_args=None, - f_kwargs=None, - num_returns=1, - pure=True, + cls, func, f_args=None, f_kwargs=None, num_returns=1, pure=True, ): """ Deploy a function in a worker process. 
diff --git a/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py b/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py index 10ce592766a..2c3e27ad946 100644 --- a/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py +++ b/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py @@ -201,10 +201,7 @@ def deploy_axis_func( maintain_partitioning, *partitions, ), - f_kwargs={ - "lengths": lengths, - "manual_partition": manual_partition, - }, + f_kwargs={"lengths": lengths, "manual_partition": manual_partition,}, num_returns=result_num_splits * 4, pure=False, ) diff --git a/modin/core/execution/ray/common/utils.py b/modin/core/execution/ray/common/utils.py index 9b51e916955..a3285bf299d 100644 --- a/modin/core/execution/ray/common/utils.py +++ b/modin/core/execution/ray/common/utils.py @@ -40,7 +40,7 @@ # This constant should be in sync with the limit in ray, which is private, # not exposed to users, and not documented: # https://github.com/ray-project/ray/blob/4692e8d8023e789120d3f22b41ffb136b50f70ea/python/ray/_private/ray_constants.py#L57-L62 -_MAC_OBJECT_STORE_LIMIT_BYTES = 2 * 2**30 +_MAC_OBJECT_STORE_LIMIT_BYTES = 2 * 2 ** 30 ObjectIDType = ray.ObjectRef if version.parse(ray.__version__) >= version.parse("1.2.0"): diff --git a/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py b/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py index b34e08eeaaa..882f7b2e345 100644 --- a/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py +++ b/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py @@ -265,11 +265,7 @@ def iloc(df, row_labels, col_labels): iloc = cuDFOnRayDataframePartition.preprocess_func(iloc) return self.gpu_manager.apply.remote( - self.key, - None, - iloc, - col_labels=col_labels, - row_labels=row_labels, + self.key, None, iloc, col_labels=col_labels, row_labels=row_labels, ) def get_gpu_manager(self): @@ -357,8 +353,7 @@ def convert(df): # FIXME: Can't find `gpu_manager.apply_result_not_dataframe` method. return self.gpu_manager.apply_result_not_dataframe.remote( - self.get_key(), - convert, + self.get_key(), convert, ) def free(self): @@ -373,10 +368,7 @@ def copy(self): ------- cuDFOnRayDataframePartition """ - new_key = self.gpu_manager.apply.remote( - self.get_key(), - lambda x: x, - ) + new_key = self.gpu_manager.apply.remote(self.get_key(), lambda x: x,) new_key = RayWrapper.materialize(new_key) return self.__constructor__(self.gpu_manager, new_key) diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py index 6561fcda963..455d8edb4db 100644 --- a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py +++ b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py @@ -178,12 +178,9 @@ def drain_call_queue(self): # this dramatically improves performance. 
func, f_args, f_kwargs = call_queue[0] logger.debug(f"SUBMIT::_apply_func::{self._identity}") - ( - self._data, - new_length, - new_width, - self._ip_cache, - ) = _apply_func.remote(data, func, *f_args, **f_kwargs) + (self._data, new_length, new_width, self._ip_cache,) = _apply_func.remote( + data, func, *f_args, **f_kwargs + ) logger.debug(f"EXIT::Partition.drain_call_queue::{self._identity}") self.call_queue = [] diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py index 91a720bcb3e..9652e128c1f 100644 --- a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py +++ b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py @@ -65,8 +65,7 @@ def magic(*args, **kwargs): ) = inspect.getframeinfo(current_frame) current_frame = current_frame.f_back t = threading.Thread( - target=call_progress_bar, - args=(result_parts, line_number), + target=call_progress_bar, args=(result_parts, line_number), ) t.start() # We need to know whether or not we are in a jupyter notebook diff --git a/modin/core/io/__init__.py b/modin/core/io/__init__.py index 9b7c8fc2d11..a5ffa4f2f1f 100644 --- a/modin/core/io/__init__.py +++ b/modin/core/io/__init__.py @@ -18,9 +18,7 @@ from .text.csv_glob_dispatcher import CSVGlobDispatcher from .text.fwf_dispatcher import FWFDispatcher from .text.json_dispatcher import JSONDispatcher -from .text.custom_text_dispatcher import ( - CustomTextExperimentalDispatcher, -) +from .text.custom_text_dispatcher import CustomTextExperimentalDispatcher from .text.excel_dispatcher import ExcelDispatcher from .file_dispatcher import FileDispatcher from .text.text_file_dispatcher import TextFileDispatcher diff --git a/modin/core/io/column_stores/column_store_dispatcher.py b/modin/core/io/column_stores/column_store_dispatcher.py index a49ce859eb3..a2b4e795f30 100644 --- a/modin/core/io/column_stores/column_store_dispatcher.py +++ b/modin/core/io/column_stores/column_store_dispatcher.py @@ -227,12 +227,7 @@ def build_query_compiler(cls, path, columns, **kwargs): ) new_query_compiler = cls.query_compiler_cls( cls.frame_cls( - remote_parts, - index, - columns, - row_lens, - column_widths, - dtypes=dtypes, + remote_parts, index, columns, row_lens, column_widths, dtypes=dtypes, ) ) return new_query_compiler diff --git a/modin/core/io/column_stores/feather_dispatcher.py b/modin/core/io/column_stores/feather_dispatcher.py index 933e26918e4..15fe8e73b5f 100644 --- a/modin/core/io/column_stores/feather_dispatcher.py +++ b/modin/core/io/column_stores/feather_dispatcher.py @@ -54,10 +54,7 @@ def _read(cls, path, columns=None, **kwargs): ) from pyarrow.feather import read_feather - with OpenFile( - path, - **(kwargs.get("storage_options", None) or {}), - ) as file: + with OpenFile(path, **(kwargs.get("storage_options", None) or {}),) as file: df = read_feather(file) # pyarrow.feather.read_feather doesn't support columns as pandas.Index columns = list(df.columns) diff --git a/modin/core/io/column_stores/parquet_dispatcher.py b/modin/core/io/column_stores/parquet_dispatcher.py index ec0874464c5..636f06d38e5 100644 --- a/modin/core/io/column_stores/parquet_dispatcher.py +++ b/modin/core/io/column_stores/parquet_dispatcher.py @@ -218,8 +218,7 @@ def files(self): return self._files def to_pandas_dataframe( - self, - columns, + self, columns, ): from pyarrow.parquet import read_table @@ -370,11 +369,7 @@ def 
call_deploy(cls, dataset, col_partitions, storage_options, **kwargs): parquet_files = dataset.files # step determines how many row groups are going to be in a partition - step = compute_chunksize( - num_row_groups, - NPartitions.get(), - min_block_size=1, - ) + step = compute_chunksize(num_row_groups, NPartitions.get(), min_block_size=1,) current_partition_size = 0 file_index = 0 partition_files = [] # 2D array - each element contains list of chunks to read @@ -467,9 +462,7 @@ def build_partition(cls, partition_ids, column_widths): [ [ cls.frame_partition_cls( - part_id[0], - length=part_id[2], - width=col_width, + part_id[0], length=part_id[2], width=col_width, ) for part_id, col_width in zip(part_ids, column_widths) ] @@ -602,8 +595,7 @@ def _read(cls, path, engine, columns, **kwargs): https://arrow.apache.org/docs/python/parquet.html """ import_optional_dependency( - "pyarrow", - "pyarrow is required to read parquet files.", + "pyarrow", "pyarrow is required to read parquet files.", ) from modin.pandas.io import PQ_INDEX_REGEX diff --git a/modin/core/io/io.py b/modin/core/io/io.py index 346a2500dd0..737b6a601e5 100644 --- a/modin/core/io/io.py +++ b/modin/core/io/io.py @@ -116,11 +116,7 @@ def from_dataframe(cls, df): ) def _read_parquet(cls, **kwargs): # noqa: PR01 ErrorMessage.default_to_pandas("`read_parquet`") - return cls.from_pandas( - pandas.read_parquet( - **kwargs, - ) - ) + return cls.from_pandas(pandas.read_parquet(**kwargs,)) @classmethod @_inherit_docstrings(pandas.read_csv, apilink="pandas.read_csv") @@ -130,9 +126,7 @@ def _read_parquet(cls, **kwargs): # noqa: PR01 returns=_doc_returns_qc_or_parser, ) def _read_csv( - cls, - filepath_or_buffer, - **kwargs, + cls, filepath_or_buffer, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_csv`") return cls._read(filepath_or_buffer=filepath_or_buffer, **kwargs) @@ -172,8 +166,7 @@ def _read(cls, **kwargs): returns=_doc_returns_qc, ) def _read_json( - cls, - **kwargs, + cls, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_json`") return cls.from_pandas(pandas.read_json(**kwargs)) @@ -408,17 +401,10 @@ def read_hdf( returns=_doc_returns_qc, ) def _read_feather( - cls, - path, - **kwargs, + cls, path, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_feather`") - return cls.from_pandas( - pandas.read_feather( - path, - **kwargs, - ) - ) + return cls.from_pandas(pandas.read_feather(path, **kwargs,)) @classmethod @_inherit_docstrings(pandas.read_stata, apilink="pandas.read_stata") @@ -428,9 +414,7 @@ def _read_feather( returns=_doc_returns_qc, ) def _read_stata( - cls, - filepath_or_buffer, - **kwargs, + cls, filepath_or_buffer, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_stata`") return cls.from_pandas(pandas.read_stata(filepath_or_buffer, **kwargs)) @@ -471,17 +455,10 @@ def read_sas( returns=_doc_returns_qc, ) def _read_pickle( - cls, - filepath_or_buffer, - **kwargs, + cls, filepath_or_buffer, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_pickle`") - return cls.from_pandas( - pandas.read_pickle( - filepath_or_buffer, - **kwargs, - ) - ) + return cls.from_pandas(pandas.read_pickle(filepath_or_buffer, **kwargs,)) @classmethod @_inherit_docstrings(pandas.read_sql, apilink="pandas.read_sql") @@ -586,19 +563,10 @@ def read_sql_table( returns=_doc_returns_qc, ) def _read_sql_query( - cls, - sql, - con, - **kwargs, + cls, sql, con, **kwargs, ): # noqa: PR01 ErrorMessage.default_to_pandas("`read_sql_query`") - return cls.from_pandas( - pandas.read_sql_query( 
- sql, - con, - **kwargs, - ) - ) + return cls.from_pandas(pandas.read_sql_query(sql, con, **kwargs,)) @classmethod @_inherit_docstrings(pandas.read_spss, apilink="pandas.read_spss") @@ -650,10 +618,7 @@ def to_sql( pandas.DataFrame.to_pickle, apilink="pandas.DataFrame.to_pickle" ) def _to_pickle( - cls, - obj: Any, - filepath_or_buffer, - **kwargs, + cls, obj: Any, filepath_or_buffer, **kwargs, ): # noqa: PR01, D200 """ Pickle (serialize) object to file. @@ -662,11 +627,7 @@ def _to_pickle( if isinstance(obj, BaseQueryCompiler): obj = obj.to_pandas() - return pandas.to_pickle( - obj, - filepath_or_buffer=filepath_or_buffer, - **kwargs, - ) + return pandas.to_pickle(obj, filepath_or_buffer=filepath_or_buffer, **kwargs,) @classmethod @_inherit_docstrings(pandas.DataFrame.to_csv, apilink="pandas.DataFrame.to_csv") diff --git a/modin/core/io/pickle/pickle_dispatcher.py b/modin/core/io/pickle/pickle_dispatcher.py index 32feae81f65..49ec8eb1097 100644 --- a/modin/core/io/pickle/pickle_dispatcher.py +++ b/modin/core/io/pickle/pickle_dispatcher.py @@ -70,12 +70,7 @@ def _read(cls, filepath_or_buffer, **kwargs): for idx, file_name in enumerate(filepath_or_buffer): *partition_ids[idx], lengths_ids[idx], widths_ids[idx] = cls.deploy( - func=cls.parse, - f_kwargs={ - "fname": file_name, - **kwargs, - }, - num_returns=3, + func=cls.parse, f_kwargs={"fname": file_name, **kwargs,}, num_returns=3, ) lengths = cls.materialize(lengths_ids) widths = cls.materialize(widths_ids) diff --git a/modin/core/io/text/csv_glob_dispatcher.py b/modin/core/io/text/csv_glob_dispatcher.py index 5a687da4d6c..109dca3703a 100644 --- a/modin/core/io/text/csv_glob_dispatcher.py +++ b/modin/core/io/text/csv_glob_dispatcher.py @@ -72,9 +72,7 @@ def _read(cls, filepath_or_buffer, **kwargs): filepath_or_buffer = cls.get_path(filepath_or_buffer) elif not cls.pathlib_or_pypath(filepath_or_buffer): return cls.single_worker_read( - filepath_or_buffer, - reason=cls.BUFFER_UNSUPPORTED_MSG, - **kwargs, + filepath_or_buffer, reason=cls.BUFFER_UNSUPPORTED_MSG, **kwargs, ) # We read multiple csv files when the file path is a list of absolute file paths. We assume that all of the files will be essentially replicas of the @@ -161,8 +159,7 @@ def _read(cls, filepath_or_buffer, **kwargs): if usecols is not None and usecols_md[1] != "integer": del kwargs["usecols"] all_cols = pandas.read_csv( - filepath_or_buffer, - **dict(kwargs, nrows=0, skipfooter=0), + filepath_or_buffer, **dict(kwargs, nrows=0, skipfooter=0), ).columns usecols = all_cols.get_indexer_for(list(usecols_md[0])) parse_dates = kwargs.pop("parse_dates", False) @@ -237,9 +234,7 @@ def _read(cls, filepath_or_buffer, **kwargs): for idx, chunks in enumerate(splits): args.update({"chunks": chunks}) *partition_ids[idx], index_ids[idx], dtypes_ids[idx] = cls.deploy( - func=cls.parse, - f_kwargs=args, - num_returns=num_splits + 2, + func=cls.parse, f_kwargs=args, num_returns=num_splits + 2, ) # Compute the index based on a sum of the lengths of each partition (by default) @@ -477,10 +472,7 @@ def partitioned_file( # TODO(williamma12): Handle when skiprows > number of rows in file. Currently returns empty df. 
outside_quotes, read_rows = cls._read_rows( - f, - nrows=skip_amount, - quotechar=quotechar, - is_quoting=is_quoting, + f, nrows=skip_amount, quotechar=quotechar, is_quoting=is_quoting, ) if skiprows: skiprows -= read_rows @@ -513,10 +505,7 @@ def partitioned_file( read_size = nrows - read_rows_counter outside_quotes, read_rows = cls._read_rows( - f, - nrows=read_size, - quotechar=quotechar, - is_quoting=is_quoting, + f, nrows=read_size, quotechar=quotechar, is_quoting=is_quoting, ) split_size += read_rows read_rows_counter += read_rows diff --git a/modin/core/io/text/excel_dispatcher.py b/modin/core/io/text/excel_dispatcher.py index 4b30d171ee7..9e51e2b3ab9 100644 --- a/modin/core/io/text/excel_dispatcher.py +++ b/modin/core/io/text/excel_dispatcher.py @@ -199,9 +199,7 @@ def _read(cls, io, **kwargs): if b"" not in chunk and b"" in chunk: break remote_results_list = cls.deploy( - func=cls.parse, - f_kwargs=args, - num_returns=num_splits + 2, + func=cls.parse, f_kwargs=args, num_returns=num_splits + 2, ) data_ids.append(remote_results_list[:-2]) index_ids.append(remote_results_list[-2]) diff --git a/modin/core/io/text/json_dispatcher.py b/modin/core/io/text/json_dispatcher.py index d8bb3e33026..bd36d7dcf52 100644 --- a/modin/core/io/text/json_dispatcher.py +++ b/modin/core/io/text/json_dispatcher.py @@ -65,19 +65,14 @@ def _read(cls, path_or_buf, **kwargs): with OpenFile(path_or_buf, "rb", kwargs.get("compression", "infer")) as f: column_widths, num_splits = cls._define_metadata(empty_pd_df, columns) args = {"fname": path_or_buf, "num_splits": num_splits, **kwargs} - splits = cls.partitioned_file( - f, - num_partitions=NPartitions.get(), - ) + splits = cls.partitioned_file(f, num_partitions=NPartitions.get(),) partition_ids = [None] * len(splits) index_ids = [None] * len(splits) dtypes_ids = [None] * len(splits) for idx, (start, end) in enumerate(splits): args.update({"start": start, "end": end}) *partition_ids[idx], index_ids[idx], dtypes_ids[idx], _ = cls.deploy( - func=cls.parse, - f_kwargs=args, - num_returns=num_splits + 3, + func=cls.parse, f_kwargs=args, num_returns=num_splits + 3, ) # partition_id[-1] contains the columns for each partition, which will be useful # for implementing when `lines=False`. diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py index 05dd922e3dd..17846b4dd81 100644 --- a/modin/core/io/text/text_file_dispatcher.py +++ b/modin/core/io/text/text_file_dispatcher.py @@ -535,9 +535,7 @@ def _define_header_size( @classmethod def _define_metadata( - cls, - df: pandas.DataFrame, - column_names: ColumnNamesTypes, + cls, df: pandas.DataFrame, column_names: ColumnNamesTypes, ) -> Tuple[list, int]: """ Define partitioning metadata. @@ -805,9 +803,7 @@ def skiprows_func(x): @classmethod def _define_index( - cls, - index_ids: list, - index_name: str, + cls, index_ids: list, index_name: str, ) -> Tuple[IndexColType, list]: """ Compute the resulting DataFrame index and index lengths for each of partitions. 
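For clarity, the `_define_index` contract annotated above amounts to the following (a sketch assuming the documented `Tuple[IndexColType, list]` return):

    # index_ids holds one future per partition; the dispatcher resolves
    # them into the final frame index plus per-partition row counts.
    new_index, row_lengths = cls._define_index(index_ids, index_name)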
@@ -981,10 +977,7 @@ def _read(cls, filepath_or_buffer, **kwargs): # Define header size for further skipping (Header can be skipped because header # information will be obtained further from empty_df, so no need to handle it # by workers) - header_size = cls._define_header_size( - header, - names, - ) + header_size = cls._define_header_size(header, names,) ( skiprows_md, pre_reading, @@ -995,10 +988,7 @@ def _read(cls, filepath_or_buffer, **kwargs): ) (use_modin_impl, fallback_reason) = cls.check_parameters_support( - filepath_or_buffer_md, - kwargs, - skiprows_md, - header_size, + filepath_or_buffer_md, kwargs, skiprows_md, header_size, ) if not use_modin_impl: return cls.single_worker_read( diff --git a/modin/core/storage_formats/base/doc_utils.py b/modin/core/storage_formats/base/doc_utils.py index b538c47c92b..2fe591a1c11 100644 --- a/modin/core/storage_formats/base/doc_utils.py +++ b/modin/core/storage_formats/base/doc_utils.py @@ -272,10 +272,7 @@ def doc_reduce_agg(method, refer_to, params=None, extra_params=None): ] ) return doc_qc_method( - template, - params=params, - method=method, - refer_to=f"DataFrame.{refer_to}", + template, params=params, method=method, refer_to=f"DataFrame.{refer_to}", ) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index daf6f74505b..8c1fc28f69c 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -746,11 +746,7 @@ def series_update(self, other, **kwargs): # noqa: PR02 New QueryCompiler with updated values. """ return BinaryDefault.register(pandas.Series.update, inplace=True)( - self, - other=other, - squeeze_self=True, - squeeze_other=True, - **kwargs, + self, other=other, squeeze_self=True, squeeze_other=True, **kwargs, ) @doc_utils.add_refer_to("DataFrame.clip") @@ -1068,10 +1064,7 @@ def prod(self, **kwargs): # noqa: PR02 return DataFrameDefault.register(pandas.DataFrame.prod)(self, **kwargs) @doc_utils.doc_reduce_agg( - method="sum", - refer_to="sum", - extra_params=["**kwargs"], - params="axis : {0, 1}", + method="sum", refer_to="sum", extra_params=["**kwargs"], params="axis : {0, 1}", ) def sum(self, **kwargs): # noqa: PR02 return DataFrameDefault.register(pandas.DataFrame.sum)(self, **kwargs) @@ -2295,10 +2288,7 @@ def apply_on_series(self, func, *args, **kwargs): assert self.is_series_like() return SeriesDefault.register(pandas.Series.apply)( - self, - func=func, - *args, - **kwargs, + self, func=func, *args, **kwargs, ) def explode(self, column): @@ -2336,13 +2326,7 @@ def explode(self, column): refer_to="count", ) def groupby_count( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.count)( self, @@ -2360,13 +2344,7 @@ def groupby_count( refer_to="any", ) def groupby_any( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.any)( self, @@ -2382,13 +2360,7 @@ def groupby_any( action="get the minimum value", result="minimum value", refer_to="min" ) def groupby_min( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return 
GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.min)( self, @@ -2402,13 +2374,7 @@ def groupby_min( @doc_utils.doc_groupby_method(result="product", refer_to="prod") def groupby_prod( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.prod)( self, @@ -2424,13 +2390,7 @@ def groupby_prod( action="get the maximum value", result="maximum value", refer_to="max" ) def groupby_max( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.max)( self, @@ -2448,13 +2408,7 @@ def groupby_max( refer_to="all", ) def groupby_all( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.all)( self, @@ -2468,13 +2422,7 @@ def groupby_all( @doc_utils.doc_groupby_method(result="sum", refer_to="sum") def groupby_sum( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.sum)( self, @@ -2492,13 +2440,7 @@ def groupby_sum( refer_to="size", ) def groupby_size( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.size)( self, @@ -2577,13 +2519,7 @@ def groupby_agg( action="compute the mean value", result="mean value", refer_to="mean" ) def groupby_mean( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2599,13 +2535,7 @@ def groupby_mean( action="compute unbiased skew", result="unbiased skew", refer_to="skew" ) def groupby_skew( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2623,13 +2553,7 @@ def groupby_skew( refer_to="cumsum", ) def groupby_cumsum( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2647,13 +2571,7 @@ def groupby_cumsum( refer_to="cummax", ) def groupby_cummax( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2671,13 +2589,7 @@ def groupby_cummax( refer_to="cummin", ) def groupby_cummin( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2695,13 +2607,7 @@ def groupby_cummin( refer_to="cumprod", ) def groupby_cumprod( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2717,13 +2623,7 @@ def groupby_cumprod( action="compute standard deviation", result="standard 
deviation", refer_to="std" ) def groupby_std( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2739,13 +2639,7 @@ def groupby_std( action="compute numerical rank", result="numerical rank", refer_to="rank" ) def groupby_rank( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2761,13 +2655,7 @@ def groupby_rank( action="compute variance", result="variance", refer_to="var" ) def groupby_var( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2785,13 +2673,7 @@ def groupby_var( refer_to="nunique", ) def groupby_nunique( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2807,13 +2689,7 @@ def groupby_nunique( action="get the median value", result="median value", refer_to="median" ) def groupby_median( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2831,13 +2707,7 @@ def groupby_median( refer_to="quantile", ) def groupby_quantile( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2855,13 +2725,7 @@ def groupby_quantile( refer_to="fillna", ) def groupby_fillna( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2877,13 +2741,7 @@ def groupby_fillna( action="get data types", result="data type", refer_to="dtypes" ) def groupby_dtypes( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2901,13 +2759,7 @@ def groupby_dtypes( refer_to="shift", ) def groupby_shift( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -3970,9 +3822,7 @@ def resample_pipe(self, resample_kwargs, func, *args, **kwargs): ) @doc_utils.doc_resample_reduce( - result="product", - params="min_count : int", - refer_to="prod", + result="product", params="min_count : int", refer_to="prod", ) def resample_prod(self, resample_kwargs, min_count, *args, **kwargs): return ResampleDefault.register(pandas.core.resample.Resampler.prod)( @@ -3988,8 +3838,7 @@ def resample_quantile(self, resample_kwargs, q, *args, **kwargs): ) @doc_utils.doc_resample_reduce( - result="standard error of the mean", - refer_to="sem", + result="standard error of the mean", refer_to="sem", ) def resample_sem(self, resample_kwargs, *args, **kwargs): return ResampleDefault.register(pandas.core.resample.Resampler.sem)( @@ -4013,9 +3862,7 @@ def resample_std(self, resample_kwargs, ddof, *args, **kwargs): ) @doc_utils.doc_resample_reduce( - result="sum", - params="min_count : int", - refer_to="sum", + result="sum", params="min_count : int", refer_to="sum", ) def 
resample_sum(self, resample_kwargs, min_count, *args, **kwargs): return ResampleDefault.register(pandas.core.resample.Resampler.sum)( diff --git a/modin/core/storage_formats/cudf/parser.py b/modin/core/storage_formats/cudf/parser.py index 9daac986e13..12bedb2a5fc 100644 --- a/modin/core/storage_formats/cudf/parser.py +++ b/modin/core/storage_formats/cudf/parser.py @@ -76,10 +76,8 @@ def single_worker_read(cls, fname, *, reason, **kwargs): pandas_frame = cls.parse(fname, **kwargs) if isinstance(pandas_frame, pandas.io.parsers.TextFileReader): pd_read = pandas_frame.read - pandas_frame.read = ( - lambda *args, **kwargs: cls.query_compiler_cls.from_pandas( - pd_read(*args, **kwargs), cls.frame_cls - ) + pandas_frame.read = lambda *args, **kwargs: cls.query_compiler_cls.from_pandas( + pd_read(*args, **kwargs), cls.frame_cls ) return pandas_frame elif isinstance(pandas_frame, (OrderedDict, dict)): diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py index e08f4da8451..39c473e5fff 100644 --- a/modin/core/storage_formats/pandas/parsers.py +++ b/modin/core/storage_formats/pandas/parsers.py @@ -130,10 +130,7 @@ def find_common_type_cat(types): if all(isinstance(t, pandas.CategoricalDtype) for t in types): if all(t.ordered for t in types): categories = np.sort(np.unique([c for t in types for c in t.categories])) - return pandas.CategoricalDtype( - categories, - ordered=True, - ) + return pandas.CategoricalDtype(categories, ordered=True,) return union_categoricals( [pandas.Categorical([], dtype=t) for t in types], sort_categories=all(t.ordered for t in types), @@ -247,8 +244,7 @@ def get_dtypes(cls, dtypes_ids): # concat all elements of `partitions_dtypes` and find common dtype # for each of the column among all partitions frame_dtypes = combined_part_dtypes.apply( - lambda row: find_common_type_cat(row.values), - axis=1, + lambda row: find_common_type_cat(row.values), axis=1, ).squeeze(axis=0) return frame_dtypes @@ -281,10 +277,8 @@ def single_worker_read(cls, fname, *, reason: str, **kwargs): pandas_frame = cls.parse(fname, **kwargs) if isinstance(pandas_frame, pandas.io.parsers.TextFileReader): pd_read = pandas_frame.read - pandas_frame.read = ( - lambda *args, **kwargs: cls.query_compiler_cls.from_pandas( - pd_read(*args, **kwargs), cls.frame_cls - ) + pandas_frame.read = lambda *args, **kwargs: cls.query_compiler_cls.from_pandas( + pd_read(*args, **kwargs), cls.frame_cls ) return pandas_frame elif isinstance(pandas_frame, (OrderedDict, dict)): @@ -673,10 +667,7 @@ def _read_row_group_chunk( return ( ParquetFile(f) .read_row_groups( - range( - row_group_start, - row_group_end, - ), + range(row_group_start, row_group_end,), columns=columns, use_pandas_metadata=True, ) @@ -762,10 +753,7 @@ def parse(fname, **kwargs): if num_splits is None: return pandas.read_feather(fname, **kwargs) - with OpenFile( - fname, - **(kwargs.pop("storage_options", None) or {}), - ) as file: + with OpenFile(fname, **(kwargs.pop("storage_options", None) or {}),) as file: df = feather.read_feather(file, **kwargs) # Append the length of the index here to build it externally return _split_result_for_readers(0, num_splits, df) + [len(df.index), df.dtypes] diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index de442670abc..2b294a3935e 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -431,10 +431,7 @@ def where(self, cond, other, 
**kwargs): # the same row and column labels as `self`. new_modin_frame = self._modin_frame.n_ary_op( lambda df, cond, other: df.where(cond, other, **kwargs), - [ - cond._modin_frame, - other._modin_frame, - ], + [cond._modin_frame, other._modin_frame,], join_type=None, ) # This will be a Series of scalars to be applied based on the condition @@ -786,10 +783,7 @@ def reduce_fn(df, **kwargs): count_cols = count_cols.sum(axis=axis, skipna=False) return sum_cols / count_cols - return TreeReduce.register( - map_fn, - reduce_fn, - )(self, axis=axis, **kwargs) + return TreeReduce.register(map_fn, reduce_fn,)(self, axis=axis, **kwargs) # END TreeReduce operations @@ -1000,11 +994,7 @@ def resample_ohlc_df(self, resample_kwargs, *args, **kwargs): def resample_prod(self, resample_kwargs, min_count, *args, **kwargs): return self._resample_func( - resample_kwargs, - "prod", - min_count=min_count, - *args, - **kwargs, + resample_kwargs, "prod", min_count=min_count, *args, **kwargs, ) def resample_size(self, resample_kwargs): @@ -1020,11 +1010,7 @@ def resample_std(self, resample_kwargs, ddof, *args, **kwargs): def resample_sum(self, resample_kwargs, min_count, *args, **kwargs): return self._resample_func( - resample_kwargs, - "sum", - min_count=min_count, - *args, - **kwargs, + resample_kwargs, "sum", min_count=min_count, *args, **kwargs, ) def resample_var(self, resample_kwargs, ddof, *args, **kwargs): @@ -1423,9 +1409,7 @@ def stack(self, level, dropna): def unique(self): new_modin_frame = self._modin_frame.apply_full_axis( - 0, - lambda x: x.squeeze(axis=1).unique(), - new_columns=self.columns, + 0, lambda x: x.squeeze(axis=1).unique(), new_columns=self.columns, ) return self.__constructor__(new_modin_frame) @@ -2137,9 +2121,7 @@ def applyier(df, internal_indices, other=[], internal_other_indices=[]): ) new_index = pandas.RangeIndex(len(self.index) * len(value_vars)) new_modin_frame = self._modin_frame.__constructor__( - new_parts, - index=new_index, - columns=id_vars + [var_name, value_name], + new_parts, index=new_index, columns=id_vars + [var_name, value_name], ) result = self.__constructor__(new_modin_frame) # this assigment needs to propagate correct indices into partitions @@ -2588,9 +2570,7 @@ def _groupby_mean_reduce(dfgb, **kwargs): result = GroupByReduce.register( lambda dfgb, **kwargs: pandas.concat( - [dfgb.sum(**kwargs), dfgb.count()], - axis=1, - copy=False, + [dfgb.sum(**kwargs), dfgb.count()], axis=1, copy=False, ), _groupby_mean_reduce, default_to_pandas_func=lambda dfgb, **kwargs: dfgb.mean(**kwargs), @@ -2609,13 +2589,7 @@ def _groupby_mean_reduce(dfgb, **kwargs): return result def groupby_size( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): result = self._groupby_dict_reduce( by=by, @@ -2725,13 +2699,7 @@ def _groupby_dict_reduce( ) def groupby_dtypes( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.groupby_agg( by=by, @@ -2846,9 +2814,7 @@ def groupby_agg_builder(df, by=None, drop=False, partition_idx=None): missed_by_cols = internal_by_df.columns.difference(df.columns) if len(missed_by_cols) > 0: df = pandas.concat( - [df, internal_by_df[missed_by_cols]], - axis=1, - copy=False, + [df, internal_by_df[missed_by_cols]], axis=1, copy=False, ) internal_by_cols = internal_by_df.columns diff --git a/modin/distributed/dataframe/pandas/partitions.py 
b/modin/distributed/dataframe/pandas/partitions.py index 45c044203a1..5bc911e1026 100644 --- a/modin/distributed/dataframe/pandas/partitions.py +++ b/modin/distributed/dataframe/pandas/partitions.py @@ -199,11 +199,7 @@ def from_partitions( column_widths = [len(idx) for idx in internal_indices] frame = partition_frame_class( - parts, - index, - columns, - row_lengths=row_lengths, - column_widths=column_widths, + parts, index, columns, row_lengths=row_lengths, column_widths=column_widths, ) if labels_axis_to_sync != -1: diff --git a/modin/experimental/batch/pipeline.py b/modin/experimental/batch/pipeline.py index 1030e88589f..2eb444bdb44 100644 --- a/modin/experimental/batch/pipeline.py +++ b/modin/experimental/batch/pipeline.py @@ -235,8 +235,7 @@ def _complete_nodes(self, list_of_nodes, partitions): for i in range(1, self.num_partitions): new_dfs.append( type(partitions[0])( - partition_list, - full_axis=partitions[0].full_axis, + partition_list, full_axis=partitions[0].full_axis, ).add_to_apply_calls(node.func, i) ) new_dfs[-1].drain_call_queue(num_splits=1) diff --git a/modin/experimental/batch/test/test_pipeline.py b/modin/experimental/batch/test/test_pipeline.py index 3248ca7b94f..2a03da90604 100644 --- a/modin/experimental/batch/test/test_pipeline.py +++ b/modin/experimental/batch/test/test_pipeline.py @@ -24,8 +24,7 @@ @pytest.mark.skipif( - Engine.get() != "Ray", - reason="Only Ray supports the Batch Pipeline API", + Engine.get() != "Ray", reason="Only Ray supports the Batch Pipeline API", ) class TestPipelineRayEngine: def test_warnings(self): @@ -163,8 +162,7 @@ def test_output_id(self): ): pipeline.compute_batch(postprocessor=lambda df: df, pass_output_id=True) with pytest.raises( - ValueError, - match="Output ID cannot be specified for non-output node.", + ValueError, match="Output ID cannot be specified for non-output node.", ): pipeline.add_query(lambda df: df, output_id=22) assert ( @@ -485,8 +483,7 @@ def reducer(dfs): output_id=20, ) pipeline.add_query( - lambda df: pandas.concat([df] * 1000), - repartition_after=True, + lambda df: pandas.concat([df] * 1000), repartition_after=True, ) def to_csv(df, partition_id): @@ -501,9 +498,7 @@ def post_proc(df, o_id, partition_id): return df new_dfs = pipeline.compute_batch( - postprocessor=post_proc, - pass_partition_id=True, - pass_output_id=True, + postprocessor=post_proc, pass_partition_id=True, pass_output_id=True, ) correct_df = pd.DataFrame([[0, 1, 2]]) correct_df["new_col"] = 0 @@ -543,8 +538,7 @@ def post_proc(df, o_id, partition_id): @pytest.mark.skipif( - Engine.get() == "Ray", - reason="Ray supports the Batch Pipeline API", + Engine.get() == "Ray", reason="Ray supports the Batch Pipeline API", ) def test_pipeline_unsupported_engine(): """Ensure that trying to use the Pipeline API with an unsupported Engine raises errors.""" diff --git a/modin/experimental/cloud/rpyc_proxy.py b/modin/experimental/cloud/rpyc_proxy.py index d77c419ba8a..7673048e969 100644 --- a/modin/experimental/cloud/rpyc_proxy.py +++ b/modin/experimental/cloud/rpyc_proxy.py @@ -641,13 +641,16 @@ def _preprocess_init_args( ): (data,) = conn.deliver((data,), {})[0] - return (), dict( - data=data, - index=index, - columns=columns, - dtype=dtype, - copy=copy, - query_compiler=query_compiler, + return ( + (), + dict( + data=data, + index=index, + columns=columns, + dtype=dtype, + copy=copy, + query_compiler=query_compiler, + ), ) @property @@ -694,9 +697,7 @@ def make_dataframe_groupby_wrapper(DataFrameGroupBy): Look for deatils in make_dataframe_wrapper() 
and _deliveringWrapper(). """ DeliveringDataFrameGroupBy = _deliveringWrapper( - DataFrameGroupBy, - ["agg", "aggregate", "apply"], - target_name="DataFrameGroupBy", + DataFrameGroupBy, ["agg", "aggregate", "apply"], target_name="DataFrameGroupBy", ) return DeliveringDataFrameGroupBy diff --git a/modin/experimental/cloud/test/test_cloud.py b/modin/experimental/cloud/test/test_cloud.py index 08d7ccad192..89844d039c5 100644 --- a/modin/experimental/cloud/test/test_cloud.py +++ b/modin/experimental/cloud/test/test_cloud.py @@ -63,8 +63,7 @@ def ray_cluster(conda_packages=None): make_bootstrap_config_mock, ): ray_cluster = RayCluster( - Provider(name="aws"), - add_conda_packages=conda_packages, + Provider(name="aws"), add_conda_packages=conda_packages, ) return ray_cluster @@ -111,13 +110,10 @@ def test_create_or_update_cluster(make_ray_cluster, make_create_or_update_cluste ], ) @pytest.mark.parametrize( - "user_packages", - [["scikit-learn>=0.23", "modin==0.8.0"], None], + "user_packages", [["scikit-learn>=0.23", "modin==0.8.0"], None], ) def test_update_conda_requirements( - make_ray_cluster, - setup_commands_source, - user_packages, + make_ray_cluster, setup_commands_source, user_packages, ): fake_version = namedtuple("FakeVersion", "major minor micro")(7, 12, 45) with mock.patch("sys.version_info", fake_version): diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py index 095ceac5c44..ba7c384c3eb 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py @@ -206,8 +206,8 @@ def compute_fragment_size(cls, table): cpu_count = os.cpu_count() if cpu_count is not None: fragment_size = table.num_rows // cpu_count - fragment_size = min(fragment_size, 2**25) - fragment_size = max(fragment_size, 2**18) + fragment_size = min(fragment_size, 2 ** 25) + fragment_size = max(fragment_size, 2 ** 18) else: fragment_size = 0 else: diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py index 41f576ee135..08c41eaf23c 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py @@ -869,13 +869,7 @@ def join( condition = self._build_equi_join_condition(other, left_on, right_on) - op = JoinNode( - self, - other, - how=how.value, - exprs=exprs, - condition=condition, - ) + op = JoinNode(self, other, how=how.value, exprs=exprs, condition=condition,) new_columns = Index.__new__(Index, data=new_columns) res = self.__constructor__( @@ -1096,13 +1090,7 @@ def _join_by_index(self, other_modin_frames, how, sort, ignore_index): exprs[new_col_name] = rhs.ref(col) new_columns.append(new_col_name) - op = JoinNode( - lhs, - rhs, - how=how, - exprs=exprs, - condition=condition, - ) + op = JoinNode(lhs, rhs, how=how, exprs=exprs, condition=condition,) new_columns = Index.__new__( Index, data=new_columns, dtype=self.columns.dtype @@ -1117,10 +1105,7 @@ def _join_by_index(self, other_modin_frames, how, sort, ignore_index): if sort: lhs = lhs.sort_rows( - lhs._index_cols, - ascending=True, - ignore_index=False, - na_position="last", + lhs._index_cols, ascending=True, 
ignore_index=False, na_position="last", ) if reset_index_names: diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py index f64627fc79f..d34eacec10c 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py @@ -657,12 +657,7 @@ class JoinNode(DFAlgNode): """ def __init__( - self, - left, - right, - how="inner", - exprs=None, - condition=None, + self, left, right, how="inner", exprs=None, condition=None, ): self.input = [left, right] self.how = how @@ -678,11 +673,7 @@ def copy(self): JoinNode """ return JoinNode( - self.input[0], - self.input[1], - self.how, - self.exprs, - self.condition, + self.input[0], self.input[1], self.how, self.exprs, self.condition, ) def _prints(self, prefix): diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py index 35e87bce803..46fe456ab3c 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/utils.py @@ -27,18 +27,8 @@ arrow_types_map = { DTypeKind.BOOL: {8: pa.bool_()}, - DTypeKind.INT: { - 8: pa.int8(), - 16: pa.int16(), - 32: pa.int32(), - 64: pa.int64(), - }, - DTypeKind.UINT: { - 8: pa.uint8(), - 16: pa.uint16(), - 32: pa.uint32(), - 64: pa.uint64(), - }, + DTypeKind.INT: {8: pa.int8(), 16: pa.int16(), 32: pa.int32(), 64: pa.int64(),}, + DTypeKind.UINT: {8: pa.uint8(), 16: pa.uint16(), 32: pa.uint32(), 64: pa.uint64(),}, DTypeKind.FLOAT: {16: pa.float16(), 32: pa.float32(), 64: pa.float64()}, DTypeKind.STRING: {8: pa.string()}, } diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py index cd4841d5d95..1ab5514c5b8 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/io/io.py @@ -40,18 +40,7 @@ from pandas.io.common import is_url ReadCsvKwargsType = Dict[ - str, - Union[ - str, - int, - bool, - dict, - object, - Sequence, - Callable, - Dialect, - None, - ], + str, Union[str, int, bool, dict, object, Sequence, Callable, Dialect, None,], ] @@ -235,9 +224,7 @@ def read_csv( return cls._read(**mykwargs) cls._validate_read_csv_kwargs(mykwargs) - use_modin_impl, error_message = cls._read_csv_check_support( - mykwargs, - ) + use_modin_impl, error_message = cls._read_csv_check_support(mykwargs,) if not use_modin_impl: raise ArrowEngineException(error_message) if isinstance(dtype, dict): @@ -401,8 +388,7 @@ def _prepare_pyarrow_usecols(cls, read_csv_kwargs): @classmethod def _read_csv_check_support( - cls, - read_csv_kwargs: ReadCsvKwargsType, + cls, read_csv_kwargs: ReadCsvKwargsType, ) -> Tuple[bool, str]: """ Check if passed parameters are supported by current ``modin.pandas.read_csv`` implementation. 
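The support check above is consumed exactly as in the `read_csv` hunk earlier in this file's diff; restated as a plain snippet with comments:

    # Returns (supported, reason); unsupported kwargs abort the Arrow
    # fast path with an explanatory error instead of silently degrading.
    use_modin_impl, error_message = cls._read_csv_check_support(mykwargs)
    if not use_modin_impl:
        raise ArrowEngineException(error_message)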
@@ -548,8 +534,7 @@ def _read_csv_check_support( @classmethod def _validate_read_csv_kwargs( - cls, - read_csv_kwargs: ReadCsvKwargsType, + cls, read_csv_kwargs: ReadCsvKwargsType, ): """ Validate `read_csv` keyword arguments. diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py index 87515318bb7..e73001b3ac9 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py @@ -144,7 +144,5 @@ def put_arrow(cls, obj): The new partition. """ return HdkOnNativeDataframePartition( - arrow_table=obj, - length=len(obj), - width=len(obj.columns), + arrow_table=obj, length=len(obj), width=len(obj.columns), ) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py index b5504cc0df0..6bfc792e130 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py @@ -285,10 +285,7 @@ def test_float32(self): ) @pytest.mark.parametrize("names", [None, [f"c{x}" for x in range(1, 7)]]) def test_read_csv_datetime( - self, - engine, - parse_dates, - names, + self, engine, parse_dates, names, ): parse_dates_unsupported = isinstance(parse_dates, dict) or ( @@ -332,9 +329,7 @@ def test_read_csv_datetime( ], ) def test_read_csv_col_handling( - self, - engine, - usecols, + self, engine, usecols, ): eval_io( fn_name="read_csv", @@ -644,11 +639,7 @@ def concat(lib, df, join, sort, ignore_index): return lib.concat([df], join=join, sort=sort, ignore_index=ignore_index) run_and_compare( - concat, - data=self.data, - join=join, - sort=sort, - ignore_index=ignore_index, + concat, data=self.data, join=join, sort=sort, ignore_index=ignore_index, ) def test_groupby_concat_single(self): @@ -657,8 +648,7 @@ def concat(lib, df): return df.groupby("a").agg({"b": "min"}) run_and_compare( - concat, - data=self.data, + concat, data=self.data, ) @pytest.mark.parametrize("join", ["inner"]) @@ -1838,9 +1828,7 @@ def sort(df, ascending, **kwargs): return df.sort_values(["a", "b"], ascending=ascending) run_and_compare( - sort, - data=self.data, - ascending=ascending, + sort, data=self.data, ascending=ascending, ) @pytest.mark.parametrize("ascending", ascending_values) @@ -1849,9 +1837,7 @@ def sort(df, ascending, **kwargs): return df.sort_values("d", ascending=ascending) run_and_compare( - sort, - data=self.data, - ascending=ascending, + sort, data=self.data, ascending=ascending, ) @pytest.mark.parametrize("cols", cols_values) @@ -1962,15 +1948,15 @@ def test_uint(self, md_df_constructor): { "uint8_in_int_bounds": np.array([1, 2, 3], dtype="uint8"), "uint8_out-of_int_bounds": np.array( - [(2**8) - 1, (2**8) - 2, (2**8) - 3], dtype="uint8" + [(2 ** 8) - 1, (2 ** 8) - 2, (2 ** 8) - 3], dtype="uint8" ), "uint16_in_int_bounds": np.array([1, 2, 3], dtype="uint16"), "uint16_out-of_int_bounds": np.array( - [(2**16) - 1, (2**16) - 2, (2**16) - 3], dtype="uint16" + [(2 ** 16) - 1, (2 ** 16) - 2, (2 ** 16) - 3], dtype="uint16" ), "uint32_in_int_bounds": np.array([1, 2, 3], dtype="uint32"), "uint32_out-of_int_bounds": np.array( - [(2**32) - 1, (2**32) - 2, (2**32) 
- 3], dtype="uint32" + [(2 ** 32) - 1, (2 ** 32) - 2, (2 ** 32) - 3], dtype="uint32" ), "uint64_in_int_bounds": np.array([1, 2, 3], dtype="uint64"), } @@ -2005,7 +1991,7 @@ def test_uint_overflow(self, md_df_constructor): pandas.DataFrame( { "col": np.array( - [(2**64) - 1, (2**64) - 2, (2**64) - 3], dtype="uint64" + [(2 ** 64) - 1, (2 ** 64) - 2, (2 ** 64) - 3], dtype="uint64" ) } ) diff --git a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py index 2336a9d5e41..35221400cde 100644 --- a/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py +++ b/modin/experimental/core/execution/ray/implementations/pyarrow_on_ray/partitioning/axis_partition.py @@ -90,12 +90,7 @@ def apply(self, func, *args, num_splits=None, other_axis_partition=None, **kwarg return [ PyarrowOnRayDataframePartition(obj) for obj in deploy_ray_axis_func.options(num_returns=num_splits).remote( - self.axis, - func, - args, - kwargs, - num_splits, - *self.list_of_blocks, + self.axis, func, args, kwargs, num_splits, *self.list_of_blocks, ) ] @@ -255,13 +250,7 @@ def deploy_ray_axis_func(axis, func, f_args, f_kwargs, num_splits, *partitions): @ray.remote def deploy_ray_func_between_two_axis_partitions( - axis, - func, - f_args, - f_kwargs, - num_splits, - len_of_left, - *partitions, + axis, func, f_args, f_kwargs, num_splits, len_of_left, *partitions, ): """ Deploy a function along a full axis between two data sets in Ray. diff --git a/modin/experimental/core/storage_formats/hdk/query_compiler.py b/modin/experimental/core/storage_formats/hdk/query_compiler.py index e28fd679ad0..59a5e3188aa 100644 --- a/modin/experimental/core/storage_formats/hdk/query_compiler.py +++ b/modin/experimental/core/storage_formats/hdk/query_compiler.py @@ -132,9 +132,7 @@ def bind_wrappers(cls): if callable(method): setattr( - cls, - name, - build_method_wrapper(name, method), + cls, name, build_method_wrapper(name, method), ) return cls @@ -292,13 +290,7 @@ def take_2d(self, index=None, columns=None): ) def groupby_size( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): # Grouping on empty frame or on index level. 
         if len(self.columns) == 0:
@@ -524,11 +516,7 @@ def fillna(
     ):
         assert not inplace, "inplace=True should be handled on upper level"
         new_frame = self._modin_frame.fillna(
-            value=value,
-            method=method,
-            axis=axis,
-            limit=limit,
-            downcast=downcast,
+            value=value, method=method, axis=axis, limit=limit, downcast=downcast,
         )
         return self.__constructor__(new_frame, self._shape_hint)
diff --git a/modin/experimental/pandas/test/test_io_exp.py b/modin/experimental/pandas/test/test_io_exp.py
index ef265a0ef31..8acb291ce4f 100644
--- a/modin/experimental/pandas/test/test_io_exp.py
+++ b/modin/experimental/pandas/test/test_io_exp.py
@@ -33,8 +33,7 @@


 @pytest.mark.skipif(
-    Engine.get() == "Dask",
-    reason="Dask does not have experimental API",
+    Engine.get() == "Dask", reason="Dask does not have experimental API",
 )
 def test_from_sql_distributed(make_sql_connection):  # noqa: F811
     if Engine.get() == "Ray":
@@ -67,8 +66,7 @@ def test_from_sql_distributed(make_sql_connection):  # noqa: F811


 @pytest.mark.skipif(
-    Engine.get() == "Dask",
-    reason="Dask does not have experimental API",
+    Engine.get() == "Dask", reason="Dask does not have experimental API",
 )
 def test_from_sql_defaults(make_sql_connection):  # noqa: F811
     with ensure_clean_dir() as dirname:
@@ -215,8 +213,7 @@ def _pandas_read_csv_glob(path, storage_options):
         pandas_df = pandas.concat(
             [
                 pandas.read_csv(
-                    f"{path}test_data{i}.csv",
-                    storage_options=storage_options,
+                    f"{path}test_data{i}.csv", storage_options=storage_options,
                 )
                 for i in range(2)
             ],
@@ -234,8 +231,7 @@ def _pandas_read_csv_glob(path, storage_options):


 @pytest.mark.skipif(
-    not Engine.get() == "Ray",
-    reason=f"{Engine.get()} does not have experimental API",
+    not Engine.get() == "Ray", reason=f"{Engine.get()} does not have experimental API",
 )
 @pytest.mark.parametrize("compression", [None, "gzip"])
 @pytest.mark.parametrize(
@@ -301,8 +297,7 @@ def _custom_parser(io_input, **kwargs):


 @pytest.mark.skipif(
-    not Engine.get() == "Ray",
-    reason=f"{Engine.get()} does not have experimental API",
+    not Engine.get() == "Ray", reason=f"{Engine.get()} does not have experimental API",
 )
 def test_read_evaluated_dict():
     def _generate_evaluated_dict(file_name, nrows, ncols):
@@ -338,9 +333,7 @@ def columns_callback(io_input, **kwargs):
         _generate_evaluated_dict(filename, 64, 8)

         df1 = pd.read_custom_text(
-            filename,
-            columns=["col1", "col2"],
-            custom_parser=_custom_parser,
+            filename, columns=["col1", "col2"], custom_parser=_custom_parser,
         )
         assert df1.shape == (64, 2)
diff --git a/modin/experimental/xgboost/test/test_default.py b/modin/experimental/xgboost/test/test_default.py
index 9504e3680c5..9c2d9d8a6fc 100644
--- a/modin/experimental/xgboost/test/test_default.py
+++ b/modin/experimental/xgboost/test/test_default.py
@@ -20,8 +20,7 @@


 @pytest.mark.skipif(
-    Engine.get() == "Ray",
-    reason="This test doesn't make sense on Ray engine.",
+    Engine.get() == "Ray", reason="This test doesn't make sense on Ray engine.",
 )
 @pytest.mark.skipif(
     Engine.get() == "Python",
diff --git a/modin/experimental/xgboost/test/test_dmatrix.py b/modin/experimental/xgboost/test/test_dmatrix.py
index 3f7f7d681c6..695ffec9c71 100644
--- a/modin/experimental/xgboost/test/test_dmatrix.py
+++ b/modin/experimental/xgboost/test/test_dmatrix.py
@@ -69,8 +69,7 @@ def check_dmatrix(data, label=None, **kwargs):
     ],
 )
 @pytest.mark.parametrize(
-    "feature_types",
-    [None, "q", list("qiqiq")],
+    "feature_types", [None, "q", list("qiqiq")],
 )
 def test_dmatrix_feature_names_and_feature_types(data, feature_names, feature_types):
     check_dmatrix(data, feature_names=feature_names, feature_types=feature_types)
@@ -83,9 +82,7 @@ def test_feature_names():
     feature_names = [f"feat{i}" for i in range(X.shape[1])]

     check_dmatrix(
-        X,
-        y,
-        feature_names=feature_names,
+        X, y, feature_names=feature_names,
     )

     dmatrix = xgb.DMatrix(X, label=y, feature_names=feature_names)
diff --git a/modin/experimental/xgboost/test/test_xgboost.py b/modin/experimental/xgboost/test/test_xgboost.py
index 1863a268379..2f7008cfd06 100644
--- a/modin/experimental/xgboost/test/test_xgboost.py
+++ b/modin/experimental/xgboost/test/test_xgboost.py
@@ -40,12 +40,10 @@


 @pytest.mark.parametrize(
-    "modin_type_y",
-    [pd.DataFrame, pd.Series],
+    "modin_type_y", [pd.DataFrame, pd.Series],
 )
 @pytest.mark.parametrize(
-    "num_actors",
-    [1, num_cpus, None, modin.config.NPartitions.get() + 1],
+    "num_actors", [1, num_cpus, None, modin.config.NPartitions.get() + 1],
 )
 @pytest.mark.parametrize(
     "data",
@@ -114,28 +112,17 @@ def test_xgb_with_binary_classification_datasets(data, num_actors, modin_type_y)


 @pytest.mark.parametrize(
-    "modin_type_y",
-    [pd.DataFrame, pd.Series],
+    "modin_type_y", [pd.DataFrame, pd.Series],
 )
 @pytest.mark.parametrize(
-    "num_actors",
-    [1, num_cpus, None, modin.config.NPartitions.get() + 1],
+    "num_actors", [1, num_cpus, None, modin.config.NPartitions.get() + 1],
 )
 @pytest.mark.parametrize(
     "data",
     [
-        (
-            load_iris(),
-            {"num_class": 3},
-        ),
-        (
-            load_digits(),
-            {"num_class": 10},
-        ),
-        (
-            load_wine(),
-            {"num_class": 3},
-        ),
+        (load_iris(), {"num_class": 3},),
+        (load_digits(), {"num_class": 10},),
+        (load_wine(), {"num_class": 3},),
     ],
     ids=["load_iris", "load_digits", "load_wine"],
 )
@@ -199,17 +186,13 @@ def test_xgb_with_multiclass_classification_datasets(data, num_actors, modin_typ


 @pytest.mark.parametrize(
-    "modin_type_y",
-    [pd.DataFrame, pd.Series],
+    "modin_type_y", [pd.DataFrame, pd.Series],
 )
 @pytest.mark.parametrize(
-    "num_actors",
-    [1, num_cpus, None, modin.config.NPartitions.get() + 1],
+    "num_actors", [1, num_cpus, None, modin.config.NPartitions.get() + 1],
 )
 @pytest.mark.parametrize(
-    "data",
-    [(load_diabetes(), {"eta": 0.01})],
-    ids=["load_diabetes"],
+    "data", [(load_diabetes(), {"eta": 0.01})], ids=["load_diabetes"],
 )
 def test_xgb_with_regression_datasets(data, num_actors, modin_type_y):
     dataset, param = data
diff --git a/modin/experimental/xgboost/xgboost.py b/modin/experimental/xgboost/xgboost.py
index ce940129f54..520bdf497a4 100644
--- a/modin/experimental/xgboost/xgboost.py
+++ b/modin/experimental/xgboost/xgboost.py
@@ -305,9 +305,7 @@ def __init__(self, params=None, cache=(), model_file=None):  # noqa: MD01
         super(Booster, self).__init__(params=params, cache=cache, model_file=model_file)

     def predict(
-        self,
-        data: DMatrix,
-        **kwargs,
+        self, data: DMatrix, **kwargs,
     ):
         """
         Run distributed prediction with a trained booster.
diff --git a/modin/experimental/xgboost/xgboost_ray.py b/modin/experimental/xgboost/xgboost_ray.py
index ddd05232204..d11ef7201fe 100644
--- a/modin/experimental/xgboost/xgboost_ray.py
+++ b/modin/experimental/xgboost/xgboost_ray.py
@@ -287,10 +287,7 @@ def create_actors(num_actors):


 def _split_data_across_actors(
-    actors: List,
-    set_func,
-    X_parts,
-    y_parts,
+    actors: List, set_func, X_parts, y_parts,
 ):
     """
     Split row partitions of data between actors.
@@ -306,15 +303,10 @@ def _split_data_across_actors(
     y_parts : list
         Row partitions of y data.
     """
-    X_parts_by_actors = _assign_row_partitions_to_actors(
-        actors,
-        X_parts,
-    )
+    X_parts_by_actors = _assign_row_partitions_to_actors(actors, X_parts,)

     y_parts_by_actors = _assign_row_partitions_to_actors(
-        actors,
-        y_parts,
-        data_for_aligning=X_parts_by_actors,
+        actors, y_parts, data_for_aligning=X_parts_by_actors,
     )

     for rank, (_, actor) in enumerate(actors):
@@ -322,9 +314,7 @@ def _split_data_across_actors(


 def _assign_row_partitions_to_actors(
-    actors: List,
-    row_partitions,
-    data_for_aligning=None,
+    actors: List, row_partitions, data_for_aligning=None,
 ):
     """
     Assign row_partitions to actors.
@@ -455,12 +445,7 @@ def _assign_row_partitions_to_actors(


 def _train(
-    dtrain,
-    params: Dict,
-    *args,
-    num_actors=None,
-    evals=(),
-    **kwargs,
+    dtrain, params: Dict, *args, num_actors=None, evals=(), **kwargs,
 ):
     """
     Run distributed training of XGBoost model on Ray engine.
@@ -592,17 +577,13 @@ def _map_predict(booster, part, columns, dmatrix_kwargs={}, **kwargs):
     """
     dmatrix = xgb.DMatrix(part, **dmatrix_kwargs)
     prediction = pandas.DataFrame(
-        booster.predict(dmatrix, **kwargs),
-        index=part.index,
-        columns=columns,
+        booster.predict(dmatrix, **kwargs), index=part.index, columns=columns,
     )
     return prediction


 def _predict(
-    booster,
-    data,
-    **kwargs,
+    booster, data, **kwargs,
 ):
     """
     Run distributed prediction with a trained booster on Ray engine.
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index c68e9dfd7a5..1035c7989c0 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -213,11 +213,7 @@ def _update_inplace(self, new_query_compiler):
         old_query_compiler.free()

     def _validate_other(
-        self,
-        other,
-        axis,
-        dtype_check=False,
-        compare_index=False,
+        self, other, axis, dtype_check=False, compare_index=False,
     ):
         """
         Help to check validity of other in inter-df operations.
@@ -849,12 +845,7 @@ def error_raiser(msg, exception):
             stacklevel=2,
         )
         query_compiler = self._query_compiler.apply(
-            func,
-            axis,
-            args=args,
-            raw=raw,
-            result_type=result_type,
-            **kwds,
+            func, axis, args=args, raw=raw, result_type=result_type, **kwds,
         )
         return query_compiler
@@ -1809,11 +1800,7 @@ def _stat_operation(
             else self
         )
         result_qc = getattr(data._query_compiler, op_name)(
-            axis=axis,
-            skipna=skipna,
-            level=level,
-            numeric_only=numeric_only,
-            **kwargs,
+            axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs,
         )
         return self._reduce_dimension(result_qc)
@@ -1826,12 +1813,7 @@ def memory_usage(self, index=True, deep=False):  # noqa: PR01, RT01, D200
         )

     def _min(
-        self,
-        axis,
-        skipna,
-        level,
-        numeric_only,
-        **kwargs,
+        self, axis, skipna, level, numeric_only, **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Return the minimum of the values over the requested axis.
@@ -2015,13 +1997,7 @@ def check_dtype(t):

     @_inherit_docstrings(pandas.DataFrame.rank, apilink="pandas.DataFrame.rank")
     def _rank(
-        self,
-        axis,
-        method,
-        numeric_only,
-        na_option,
-        ascending,
-        pct,
+        self, axis, method, numeric_only, na_option, ascending, pct,
     ):
         axis = self._get_axis_number(axis)
         return self.__constructor__(
@@ -2064,11 +2040,7 @@ def _ensure_index(self, index_like, axis=0):  # noqa: PR01, RT01, D200
         return ensure_index(index_like)

     def _reindex(
-        self,
-        index,
-        columns,
-        copy,
-        **kwargs,
+        self, index, columns, copy, **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Conform `BasePandasDataset` to new index with optional filling logic.
@@ -2253,10 +2225,7 @@ def reset_index(
                 raise ValueError("cannot insert level_0, already exists")
         else:
             new_query_compiler = self._query_compiler.reset_index(
-                drop=drop,
-                level=level,
-                col_level=col_level,
-                col_fill=col_fill,
+                drop=drop, level=level, col_level=col_level, col_fill=col_fill,
             )
         return self._create_or_update_from_compiler(new_query_compiler, inplace)
@@ -2365,14 +2334,7 @@ def rtruediv(
     rdiv = rtruediv

     def _sample(
-        self,
-        n,
-        frac,
-        replace,
-        weights,
-        random_state,
-        axis,
-        **kwargs,
+        self, n, frac, replace, weights, random_state, axis, **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Return a random sample of items from an axis of object.
@@ -2491,13 +2453,7 @@ def _sample(
         return self.__constructor__(query_compiler=query_compiler)

     def _sem(
-        self,
-        axis,
-        skipna,
-        level,
-        ddof,
-        numeric_only,
-        **kwargs,
+        self, axis, skipna, level, ddof, numeric_only, **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Return unbiased standard error of the mean over requested axis.
@@ -2601,12 +2557,7 @@ def _shift(self, periods, freq, axis, fill_value):  # noqa: PR01, RT01, D200
         return self.tshift(periods, freq)

     def _skew(
-        self,
-        axis,
-        skipna,
-        level,
-        numeric_only,
-        **kwargs,
+        self, axis, skipna, level, numeric_only, **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Return unbiased skew over requested axis.
@@ -2686,13 +2637,7 @@ def sort_values(
         return self._create_or_update_from_compiler(result, inplace)

     def _std(
-        self,
-        axis,
-        skipna,
-        level,
-        ddof,
-        numeric_only,
-        **kwargs,
+        self, axis, skipna, level, ddof, numeric_only, **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Return sample standard deviation over requested axis.
@@ -2772,11 +2717,7 @@ def to_hdf(
         )

     def to_numpy(self, dtype=None, copy=False, na_value=no_default):
-        return self._query_compiler.to_numpy(
-            dtype=dtype,
-            copy=copy,
-            na_value=na_value,
-        )
+        return self._query_compiler.to_numpy(dtype=dtype, copy=copy, na_value=na_value,)

     # TODO(williamma12): When this gets implemented, have the series one call this.
     def to_period(
@@ -3333,19 +3274,13 @@ def __nonzero__(self):
     __bool__ = __nonzero__

     @_doc_binary_op(
-        operation="disjunction",
-        bin_op="or",
-        right="other",
-        **_doc_binary_op_kwargs,
+        operation="disjunction", bin_op="or", right="other", **_doc_binary_op_kwargs,
     )
     def __or__(self, other):
         return self._binary_op("__or__", other, axis=0)

     @_doc_binary_op(
-        operation="disjunction",
-        bin_op="ror",
-        right="other",
-        **_doc_binary_op_kwargs,
+        operation="disjunction", bin_op="ror", right="other", **_doc_binary_op_kwargs,
     )
     def __ror__(self, other):
         return self._binary_op("__ror__", other, axis=0)
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 172431d6717..35f1ca12b38 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -108,13 +108,7 @@ class DataFrame(DataFrameCompat, BasePandasDataset):

     @append_to_docstring(__doc__)
     def _init(
-        self,
-        data,
-        index,
-        columns,
-        dtype,
-        copy,
-        query_compiler,
+        self, data, index, columns, dtype, copy, query_compiler,
     ):
         # Siblings are other dataframes that share the same query compiler. We
         # use this list to update inplace when there is a shallow copy.
@@ -693,8 +687,7 @@ def corr(self, method="pearson", min_periods=1):  # noqa: PR01, RT01, D200
         """
         return self.__constructor__(
             query_compiler=self._query_compiler.corr(
-                method=method,
-                min_periods=min_periods,
+                method=method, min_periods=min_periods,
             )
         )
@@ -1637,13 +1630,7 @@ def pow(
         )

     def _prod(
-        self,
-        axis,
-        skipna,
-        level,
-        numeric_only,
-        min_count,
-        **kwargs,
+        self, axis, skipna, level, numeric_only, min_count, **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Return the product of the values over the requested axis.
@@ -1758,13 +1745,7 @@ def rename(
         return obj

     def _replace(
-        self,
-        to_replace,
-        value,
-        inplace,
-        limit,
-        regex,
-        method,
+        self, to_replace, value, inplace, limit, regex, method,
     ):  # noqa: PR01, RT01, D200
         """
         Replace values given in `to_replace` with `value`.
@@ -2031,13 +2012,7 @@ def sub(
     subtract = sub

     def _sum(
-        self,
-        axis,
-        skipna,
-        level,
-        numeric_only,
-        min_count,
-        **kwargs,
+        self, axis, skipna, level, numeric_only, min_count, **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Return the sum of the values over the requested axis.
@@ -2475,11 +2450,7 @@ def __setitem__(self, key, value):
                 if len(key) != value.shape[-1]:
                     raise ValueError("Columns must be same length as key")
                 item = broadcast_item(
-                    self,
-                    slice(None),
-                    key,
-                    value,
-                    need_columns_reindex=False,
+                    self, slice(None), key, value, need_columns_reindex=False,
                 )
             new_qc = self._query_compiler.write_items(
                 slice(None), self.columns.get_indexer_for(key), item
diff --git a/modin/pandas/general.py b/modin/pandas/general.py
index a34dac3a2e5..914d8c3d26e 100644
--- a/modin/pandas/general.py
+++ b/modin/pandas/general.py
@@ -378,11 +378,7 @@ def value_counts(
     Series
     """
     return Series(values).value_counts(
-        sort=sort,
-        ascending=ascending,
-        normalize=normalize,
-        bins=bins,
-        dropna=dropna,
+        sort=sort, ascending=ascending, normalize=normalize, bins=bins, dropna=dropna,
     )
diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py
index c50a9bfaa54..300d23dbd6e 100644
--- a/modin/pandas/groupby.py
+++ b/modin/pandas/groupby.py
@@ -340,8 +340,7 @@ def dtypes(self):
             raise ValueError("Cannot call dtypes on groupby with axis=1")
         return self._check_index(
             self._wrap_aggregation(
-                type(self._query_compiler).groupby_dtypes,
-                numeric_only=False,
+                type(self._query_compiler).groupby_dtypes, numeric_only=False,
             )
         )
@@ -439,11 +438,7 @@ def __getitem__(self, key):
             )
             cols_to_grab = internal_by.union(key)
             key = [col for col in self._df.columns if col in cols_to_grab]
-            return DataFrameGroupBy(
-                self._df[key],
-                drop=self._drop,
-                **kwargs,
-            )
+            return DataFrameGroupBy(self._df[key], drop=self._drop, **kwargs,)
         if (
             self._is_multi_by
             and isinstance(self._by, list)
@@ -453,11 +448,7 @@ def __getitem__(self, key):
                 "Column lookups on GroupBy with arbitrary Series in by"
                 + " is not yet supported."
             )
-        return SeriesGroupBy(
-            self._df[key],
-            drop=False,
-            **kwargs,
-        )
+        return SeriesGroupBy(self._df[key], drop=False, **kwargs,)

     def cummin(self, axis=0, **kwargs):
         return self._check_index_name(
@@ -669,8 +660,7 @@ def size(self):
             **self._kwargs,
         )
         result = work_object._wrap_aggregation(
-            type(work_object._query_compiler).groupby_size,
-            numeric_only=False,
+            type(work_object._query_compiler).groupby_size, numeric_only=False,
         )
         if not isinstance(result, Series):
             result = result.squeeze(axis=1)
@@ -743,8 +733,7 @@ def resample(self, rule, *args, **kwargs):
     def median(self, numeric_only=None):
         return self._check_index(
             self._wrap_aggregation(
-                type(self._query_compiler).groupby_median,
-                numeric_only=numeric_only,
+                type(self._query_compiler).groupby_median, numeric_only=numeric_only,
             )
         )
@@ -805,8 +794,7 @@ def fillna(self, *args, **kwargs):

     def count(self):
         result = self._wrap_aggregation(
-            type(self._query_compiler).groupby_count,
-            numeric_only=False,
+            type(self._query_compiler).groupby_count, numeric_only=False,
         )
         # pandas do it in case of Series
         if isinstance(result, Series):
@@ -1024,12 +1012,7 @@ def _compute_index_grouped(self, numerical=False):
         return groupby_obj.indices if numerical else groupby_obj.groups

     def _wrap_aggregation(
-        self,
-        qc_method,
-        numeric_only=None,
-        agg_args=None,
-        agg_kwargs=None,
-        **kwargs,
+        self, qc_method, numeric_only=None, agg_args=None, agg_kwargs=None, **kwargs,
     ):
         """
         Perform common metadata transformations and apply groupby functions.
diff --git a/modin/pandas/resample.py b/modin/pandas/resample.py
index 803af86f68e..84f4fda7edb 100644
--- a/modin/pandas/resample.py
+++ b/modin/pandas/resample.py
@@ -162,12 +162,7 @@ def apply(self, func, *args, **kwargs):
             query_comp_op = self._query_compiler.resample_app_ser

         dataframe = DataFrame(
-            query_compiler=query_comp_op(
-                self.resample_kwargs,
-                func,
-                *args,
-                **kwargs,
-            )
+            query_compiler=query_comp_op(self.resample_kwargs, func, *args, **kwargs,)
         )
         if is_list_like(func) or isinstance(self._dataframe, DataFrame):
             return dataframe
@@ -186,12 +181,7 @@ def aggregate(self, func, *args, **kwargs):
             query_comp_op = self._query_compiler.resample_agg_ser

         dataframe = DataFrame(
-            query_compiler=query_comp_op(
-                self.resample_kwargs,
-                func,
-                *args,
-                **kwargs,
-            )
+            query_compiler=query_comp_op(self.resample_kwargs, func, *args, **kwargs,)
         )
         if is_list_like(func) or isinstance(self._dataframe, DataFrame):
             return dataframe
@@ -304,54 +294,42 @@ def nunique(self, *args, **kwargs):
     def first(self, *args, **kwargs):
         return self._dataframe.__constructor__(
             query_compiler=self._query_compiler.resample_first(
-                self.resample_kwargs,
-                *args,
-                **kwargs,
+                self.resample_kwargs, *args, **kwargs,
             )
         )

     def last(self, *args, **kwargs):
         return self._dataframe.__constructor__(
             query_compiler=self._query_compiler.resample_last(
-                self.resample_kwargs,
-                *args,
-                **kwargs,
+                self.resample_kwargs, *args, **kwargs,
             )
         )

     def max(self, *args, **kwargs):
         return self._dataframe.__constructor__(
             query_compiler=self._query_compiler.resample_max(
-                self.resample_kwargs,
-                *args,
-                **kwargs,
+                self.resample_kwargs, *args, **kwargs,
             )
         )

     def mean(self, *args, **kwargs):
         return self._dataframe.__constructor__(
             query_compiler=self._query_compiler.resample_mean(
-                self.resample_kwargs,
-                *args,
-                **kwargs,
+                self.resample_kwargs, *args, **kwargs,
             )
         )

     def median(self, *args, **kwargs):
         return self._dataframe.__constructor__(
             query_compiler=self._query_compiler.resample_median(
-                self.resample_kwargs,
-                *args,
-                **kwargs,
+                self.resample_kwargs, *args, **kwargs,
             )
         )

     def min(self, *args, **kwargs):
         return self._dataframe.__constructor__(
             query_compiler=self._query_compiler.resample_min(
-                self.resample_kwargs,
-                *args,
-                **kwargs,
+                self.resample_kwargs, *args, **kwargs,
             )
         )
@@ -361,17 +339,13 @@ def ohlc(self, *args, **kwargs):
         if isinstance(self._dataframe, DataFrame):
             return DataFrame(
                 query_compiler=self._query_compiler.resample_ohlc_df(
-                    self.resample_kwargs,
-                    *args,
-                    **kwargs,
+                    self.resample_kwargs, *args, **kwargs,
                 )
             )
         else:
             return DataFrame(
                 query_compiler=self._query_compiler.resample_ohlc_ser(
-                    self.resample_kwargs,
-                    *args,
-                    **kwargs,
+                    self.resample_kwargs, *args, **kwargs,
                 )
             )
@@ -392,9 +366,7 @@ def size(self):
     def sem(self, *args, **kwargs):
         return self._dataframe.__constructor__(
             query_compiler=self._query_compiler.resample_sem(
-                self.resample_kwargs,
-                *args,
-                **kwargs,
+                self.resample_kwargs, *args, **kwargs,
             )
         )
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index 0495c1e3ae7..77f22dad53a 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -1194,12 +1194,7 @@ def keys(self):  # noqa: RT01, D200
         return self.index

     def _kurt(
-        self,
-        axis,
-        skipna,
-        level,
-        numeric_only,
-        **kwargs,
+        self, axis, skipna, level, numeric_only, **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Return unbiased kurtosis over requested axis.
@@ -1245,14 +1240,7 @@ def arg(s):

     @_inherit_docstrings(pandas.Series.mask, apilink="pandas.Series.mask")
     def _mask(
-        self,
-        cond,
-        other,
-        inplace,
-        axis,
-        level,
-        errors,
-        try_cast,
+        self, cond, other, inplace, axis, level, errors, try_cast,
     ):
         return self._default_to_pandas(
             pandas.Series.mask,
@@ -1411,13 +1399,7 @@ def pow(self, other, level=None, fill_value=None, axis=0):  # noqa: PR01, RT01,

     @_inherit_docstrings(pandas.Series.prod, apilink="pandas.Series.prod")
     def _prod(
-        self,
-        axis,
-        skipna,
-        level,
-        numeric_only,
-        min_count,
-        **kwargs,
+        self, axis, skipna, level, numeric_only, min_count, **kwargs,
     ):
         axis = self._get_axis_number(axis)
         if level is not None:
@@ -1658,13 +1640,7 @@ def reorder_levels(self, order):  # noqa: PR01, RT01, D200
         return super(Series, self).reorder_levels(order)

     def _replace(
-        self,
-        to_replace,
-        value,
-        inplace,
-        limit,
-        regex,
-        method,
+        self, to_replace, value, inplace, limit, regex, method,
     ):  # noqa: PR01, RT01, D200
         """
         Replace values given in `to_replace` with `value`.
@@ -1775,13 +1751,7 @@ def sub(self, other, level=None, fill_value=None, axis=0):  # noqa: PR01, RT01,
     subtract = sub

     def _sum(
-        self,
-        axis,
-        skipna,
-        level,
-        numeric_only,
-        min_count,
-        **kwargs,
+        self, axis, skipna, level, numeric_only, min_count, **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Return the sum of the values.
@@ -1877,11 +1847,7 @@ def to_numpy(
         """
         return (
             super(Series, self)
-            .to_numpy(
-                dtype=dtype,
-                copy=copy,
-                na_value=na_value,
-            )
+            .to_numpy(dtype=dtype, copy=copy, na_value=na_value,)
             .flatten()
         )
@@ -2018,14 +1984,7 @@ def view(self, dtype=None):  # noqa: PR01, RT01, D200
         )

     def _where(
-        self,
-        cond,
-        other,
-        inplace,
-        axis,
-        level,
-        errors,
-        try_cast,
+        self, cond, other, inplace, axis, level, errors, try_cast,
     ):  # noqa: PR01, RT01, D200
         """
         Replace values where the condition is False.
diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py
index 25f36f54de8..7e464915c68 100644
--- a/modin/pandas/test/dataframe/test_binary.py
+++ b/modin/pandas/test/dataframe/test_binary.py
@@ -75,7 +75,7 @@ def test_math_functions(other, axis, op):

 @pytest.mark.parametrize(
     "other",
-    [lambda df: df[: -(2**4)], lambda df: df[df.columns[0]].reset_index(drop=True)],
+    [lambda df: df[: -(2 ** 4)], lambda df: df[df.columns[0]].reset_index(drop=True)],
     ids=["check_missing_value", "check_different_index"],
 )
 @pytest.mark.parametrize("fill_value", [None, 3.0])
diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py
index 710eb4b152a..d4ef9de5c76 100644
--- a/modin/pandas/test/dataframe/test_default.py
+++ b/modin/pandas/test/dataframe/test_default.py
@@ -186,8 +186,7 @@ def test_between_time():
         pandas_df.between_time("12:00", "17:00"),
     )
     df_equals(
-        modin_df.between_time("3:00", "4:00"),
-        pandas_df.between_time("3:00", "4:00"),
+        modin_df.between_time("3:00", "4:00"), pandas_df.between_time("3:00", "4:00"),
     )
     df_equals(
         modin_df.T.between_time("12:00", "17:00", axis=1),
@@ -431,9 +430,7 @@ def test_kurt_kurtosis_level(level):
     df_pandas.columns = index

     eval_general(
-        df_modin,
-        df_pandas,
-        lambda df: df.kurtosis(axis=1, level=level),
+        df_modin, df_pandas, lambda df: df.kurtosis(axis=1, level=level),
     )
@@ -470,9 +467,7 @@ def test_mad_level(level):
     modin_df.columns = index
     pandas_df.columns = index
     eval_general(
-        modin_df,
-        pandas_df,
-        lambda df: df.mad(axis=1, level=level),
+        modin_df, pandas_df, lambda df: df.mad(axis=1, level=level),
     )
@@ -583,12 +578,7 @@ def test_pivot_table_data(data, index, columns, values):
 )
 @pytest.mark.parametrize("margins_name", ["Custom name", None])
 def test_pivot_table_margins(
-    data,
-    index,
-    columns,
-    values,
-    aggfunc,
-    margins_name,
+    data, index, columns, values, aggfunc, margins_name,
 ):
     eval_general(
         *create_test_dfs(data),
@@ -770,18 +760,10 @@ def test_resample_specific(rule, closed, label, on, level):
         modin_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T")

     pandas_resampler = pandas_df.resample(
-        rule,
-        closed=closed,
-        label=label,
-        on=on,
-        level=level,
+        rule, closed=closed, label=label, on=on, level=level,
     )
     modin_resampler = modin_df.resample(
-        rule,
-        closed=closed,
-        label=label,
-        on=on,
-        level=level,
+        rule, closed=closed, label=label, on=on, level=level,
     )
     df_equals(modin_resampler.var(0), pandas_resampler.var(0))
     if on is None and level is None:
@@ -972,8 +954,7 @@ def test_swaplevel():
         ),
     )
     df_equals(
-        modin_df.swaplevel("Number", "Color"),
-        pandas_df.swaplevel("Number", "Color"),
+        modin_df.swaplevel("Number", "Color"), pandas_df.swaplevel("Number", "Color"),
     )
     df_equals(modin_df.swaplevel(), pandas_df.swaplevel())
     df_equals(modin_df.swaplevel(0, 1), pandas_df.swaplevel(0, 1))
@@ -1007,16 +988,14 @@ def test_take():
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_to_records(request, data):
     eval_general(
-        *create_test_dfs(data),
-        lambda df: df.dropna().to_records(),
+        *create_test_dfs(data), lambda df: df.dropna().to_records(),
     )


 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_to_string(data):
     eval_general(
-        *create_test_dfs(data),
-        lambda df: df.to_string(),
+        *create_test_dfs(data), lambda df: df.to_string(),
     )
diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py
index ba2c7ce0ffd..64d7517caca 100644
--- a/modin/pandas/test/dataframe/test_indexing.py
+++ b/modin/pandas/test/dataframe/test_indexing.py
@@ -136,8 +136,7 @@ def test_asof_without_nan(dates, subset):


 @pytest.mark.parametrize(
-    "lookup",
-    [[60, 70, 90], [60.5, 70.5, 100]],
+    "lookup", [[60, 70, 90], [60.5, 70.5, 100]],
 )
 @pytest.mark.parametrize("subset", ["col2", "col1", ["col1", "col2"], None])
 def test_asof_large(lookup, subset):
@@ -534,20 +533,15 @@ def test_loc_multi_index():
     pandas_index = pandas.MultiIndex.from_tuples(tuples, names=["first", "second"])
     frame_data = np.random.randint(0, 100, size=(16, 100))
     modin_df = pd.DataFrame(
-        frame_data,
-        index=modin_index,
-        columns=["col{}".format(i) for i in range(100)],
+        frame_data, index=modin_index, columns=["col{}".format(i) for i in range(100)],
     )
     pandas_df = pandas.DataFrame(
-        frame_data,
-        index=pandas_index,
-        columns=["col{}".format(i) for i in range(100)],
+        frame_data, index=pandas_index, columns=["col{}".format(i) for i in range(100)],
     )
     df_equals(modin_df.loc["bar", "col1"], pandas_df.loc["bar", "col1"])
     assert modin_df.loc[("bar", "one"), "col1"] == pandas_df.loc[("bar", "one"), "col1"]
     df_equals(
-        modin_df.loc["bar", ("col1", "col2")],
-        pandas_df.loc["bar", ("col1", "col2")],
+        modin_df.loc["bar", ("col1", "col2")], pandas_df.loc["bar", ("col1", "col2")],
     )

     # From issue #1456
@@ -1308,8 +1302,7 @@ def test_reset_index_multiindex_groupby(data):
     "none_in_index_names",
     [
         pytest.param(
-            False,
-            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
+            False, marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
         ),
         True,
         "mixed_1st_None",
@@ -1452,8 +1445,7 @@ def test_reset_index_with_multi_index_no_drop(
     "none_in_index_names",
     [
         pytest.param(
-            False,
-            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
+            False, marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
         ),
         True,
         "mixed_1st_None",
@@ -1542,9 +1534,7 @@ def test_sample(data, axis):

     with pytest.raises(ValueError):
         modin_df.sample(
-            frac=0.5,
-            weights=[0.5 for _ in range(len(modin_df.columns[:-1]))],
-            axis=1,
+            frac=0.5, weights=[0.5 for _ in range(len(modin_df.columns[:-1]))], axis=1,
         )

     with pytest.raises(ValueError):
@@ -1951,9 +1941,7 @@ def test___setitem__mask():
     ids=["empty_frame", "empty_cols", "1_length_cols", "2_length_cols"],
 )
 @pytest.mark.parametrize(
-    "value",
-    [[11, 22], [11, 22, 33]],
-    ids=["2_length_val", "3_length_val"],
+    "value", [[11, 22], [11, 22, 33]], ids=["2_length_val", "3_length_val"],
 )
 @pytest.mark.parametrize("convert_to_series", [False, True])
 @pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"])
diff --git a/modin/pandas/test/dataframe/test_iter.py b/modin/pandas/test/dataframe/test_iter.py
index 461a6f6c675..177f7d7f7ea 100644
--- a/modin/pandas/test/dataframe/test_iter.py
+++ b/modin/pandas/test/dataframe/test_iter.py
@@ -379,8 +379,7 @@ def test_constructor_columns_and_index():
         pandas.DataFrame(pandas_df, columns=["max_speed", "health"]),
     )
     df_equals(
-        pd.DataFrame(modin_df, index=[1, 2]),
-        pandas.DataFrame(pandas_df, index=[1, 2]),
+        pd.DataFrame(modin_df, index=[1, 2]), pandas.DataFrame(pandas_df, index=[1, 2]),
     )
     df_equals(
         pd.DataFrame(modin_df, index=[1, 2], columns=["health"]),
diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py
index 6ef0371bbee..9df809c24d3 100644
--- a/modin/pandas/test/dataframe/test_join_sort.py
+++ b/modin/pandas/test/dataframe/test_join_sort.py
@@ -67,20 +67,20 @@ def test_combine(data):
     "test_data, test_data2",
     [
         (
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
-            np.random.uniform(0, 100, size=(2**7, 2**6)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)),
+            np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)),
         ),
         (
-            np.random.uniform(0, 100, size=(2**7, 2**6)),
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
+            np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)),
         ),
         (
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
-            np.random.uniform(0, 100, size=(2**6, 2**7)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)),
         ),
         (
-            np.random.uniform(0, 100, size=(2**6, 2**7)),
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)),
         ),
     ],
 )
@@ -165,20 +165,20 @@ def test_join(test_data, test_data2):
     "test_data, test_data2",
     [
         (
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
-            np.random.uniform(0, 100, size=(2**7, 2**6)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)),
+            np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)),
         ),
         (
-            np.random.uniform(0, 100, size=(2**7, 2**6)),
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
+            np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)),
         ),
         (
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
-            np.random.uniform(0, 100, size=(2**6, 2**7)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)),
         ),
         (
-            np.random.uniform(0, 100, size=(2**6, 2**7)),
-            np.random.uniform(0, 100, size=(2**6, 2**6)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)),
+            np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)),
         ),
     ],
 )
@@ -218,18 +218,10 @@ def test_merge(test_data, test_data2):
             df_equals(modin_result, pandas_result)

             modin_result = modin_df.merge(
-                modin_df2,
-                how=hows[i],
-                left_on="key",
-                right_on="key",
-                sort=sorts[j],
+                modin_df2, how=hows[i], left_on="key", right_on="key", sort=sorts[j],
             )
             pandas_result = pandas_df.merge(
-                pandas_df2,
-                how=hows[i],
-                left_on="key",
-                right_on="key",
-                sort=sorts[j],
+                pandas_df2, how=hows[i], left_on="key", right_on="key", sort=sorts[j],
             )
             df_equals(modin_result, pandas_result)
@@ -436,9 +428,7 @@ def test_sort_multiindex(sort_remaining):
 )
 @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"])
 @pytest.mark.parametrize(
-    "ignore_index",
-    bool_arg_values,
-    ids=arg_keys("ignore_index", bool_arg_keys),
+    "ignore_index", bool_arg_values, ids=arg_keys("ignore_index", bool_arg_keys),
 )
 @pytest.mark.parametrize("key", [None, rotate_decimal_digits_or_symbols])
 def test_sort_values(
diff --git a/modin/pandas/test/dataframe/test_reduce.py b/modin/pandas/test/dataframe/test_reduce.py
index 93f7be95fe4..03cada3f57f 100644
--- a/modin/pandas/test/dataframe/test_reduce.py
+++ b/modin/pandas/test/dataframe/test_reduce.py
@@ -94,9 +94,7 @@ def test_all_any_level(data, axis, level, method):
         pandas_df.columns = new_col

     eval_general(
-        modin_df,
-        pandas_df,
-        lambda df: getattr(df, method)(axis=axis, level=level),
+        modin_df, pandas_df, lambda df: getattr(df, method)(axis=axis, level=level),
     )
@@ -106,8 +104,7 @@ def test_all_any_level(data, axis, level, method):
 )
 def test_count(data, axis):
     eval_general(
-        *create_test_dfs(data),
-        lambda df: df.count(axis=axis),
+        *create_test_dfs(data), lambda df: df.count(axis=axis),
     )
@@ -135,9 +132,7 @@ def test_count_level(data, axis, level):
         pandas_df.columns = new_col

     eval_general(
-        modin_df,
-        pandas_df,
-        lambda df: df.count(axis=axis, level=level),
+        modin_df, pandas_df, lambda df: df.count(axis=axis, level=level),
     )
@@ -146,9 +141,7 @@ def test_count_dtypes(data):
     modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

     eval_general(
-        modin_df,
-        pandas_df,
-        lambda df: df.isna().count(axis=0),
+        modin_df, pandas_df, lambda df: df.isna().count(axis=0),
     )
@@ -156,8 +149,7 @@ def test_count_dtypes(data):
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_describe(data, percentiles):
     eval_general(
-        *create_test_dfs(data),
-        lambda df: df.describe(percentiles=percentiles),
+        *create_test_dfs(data), lambda df: df.describe(percentiles=percentiles),
     )
@@ -165,12 +157,12 @@ def test_describe(data, percentiles):
 @pytest.mark.parametrize("datetime_is_numeric", [True, False, None])
 def test_2195(datetime_is_numeric, has_numeric_column):
     data = {
-        "categorical": pd.Categorical(["d"] * 10**2),
-        "date": [np.datetime64("2000-01-01")] * 10**2,
+        "categorical": pd.Categorical(["d"] * 10 ** 2),
+        "date": [np.datetime64("2000-01-01")] * 10 ** 2,
     }

     if has_numeric_column:
-        data.update({"numeric": [5] * 10**2})
+        data.update({"numeric": [5] * 10 ** 2})

     modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
@@ -292,17 +284,12 @@ def test_min_max_mean(data, axis, skipna, numeric_only, is_transposed, method):
 @pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
 @pytest.mark.parametrize("data", [test_data["float_nan_data"]])
 def test_prod(
-    data,
-    axis,
-    skipna,
-    is_transposed,
-    method,
+    data, axis, skipna, is_transposed, method,
 ):
     eval_general(
         *create_test_dfs(data),
         lambda df, *args, **kwargs: getattr(df.T if is_transposed else df, method)(
-            axis=axis,
-            skipna=skipna,
+            axis=axis, skipna=skipna,
         ),
     )
@@ -328,10 +315,7 @@ def test_prod(
 def test_sum(data, axis, skipna, is_transposed):
     eval_general(
         *create_test_dfs(data),
-        lambda df: (df.T if is_transposed else df).sum(
-            axis=axis,
-            skipna=skipna,
-        ),
+        lambda df: (df.T if is_transposed else df).sum(axis=axis, skipna=skipna,),
     )

     # test for issue #1953
diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py
index f1f7e103bbd..30903d2b86e 100644
--- a/modin/pandas/test/dataframe/test_window.py
+++ b/modin/pandas/test/dataframe/test_window.py
@@ -91,8 +91,7 @@ def test_diff(axis, periods):
 @pytest.mark.parametrize("axis", ["rows", "columns"])
 def test_diff_transposed(axis):
     eval_general(
-        *create_test_dfs(test_data["int_data"]),
-        lambda df: df.T.diff(axis=axis),
+        *create_test_dfs(test_data["int_data"]), lambda df: df.T.diff(axis=axis),
     )
@@ -356,8 +355,7 @@ def test_fillna_dict_series():
     df_equals(modin_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5}))

     df_equals(
-        modin_df.fillna({"a": 0, "b": 5, "d": 7}),
-        df.fillna({"a": 0, "b": 5, "d": 7}),
+        modin_df.fillna({"a": 0, "b": 5, "d": 7}), df.fillna({"a": 0, "b": 5, "d": 7}),
     )

     # Series treated same as dict
diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py
index 12d23892b2e..56b6d91ed51 100644
--- a/modin/pandas/test/test_concat.py
+++ b/modin/pandas/test/test_concat.py
@@ -149,7 +149,7 @@ def test_ignore_index_concat():


 def test_concat_non_subscriptable_keys():
-    frame_data = np.random.randint(0, 100, size=(2**10, 2**6))
+    frame_data = np.random.randint(0, 100, size=(2 ** 10, 2 ** 6))
     df = pd.DataFrame(frame_data).add_prefix("col")
     pdf = pandas.DataFrame(frame_data).add_prefix("col")
@@ -232,7 +232,6 @@ def test_sort_order(sort, join, axis):
     pandas_concat = pandas.concat([pandas_df, pandas_df2], join=join, sort=sort)
     modin_concat = pd.concat([modin_df, modin_df2], join=join, sort=sort)
     df_equals(
-        pandas_concat,
-        modin_concat,
+        pandas_concat, modin_concat,
     )
     assert list(pandas_concat.columns) == list(modin_concat.columns)
diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py
index 6d075076267..447bdeaf3a7 100644
--- a/modin/pandas/test/test_general.py
+++ b/modin/pandas/test/test_general.py
@@ -418,11 +418,7 @@ def test_merge_asof_merge_options():
     # left_by + right_by
     with warns_that_defaulting_to_pandas():
         modin_result = pd.merge_asof(
-            modin_quotes,
-            modin_trades,
-            on="time",
-            left_by="ticker",
-            right_by="ticker2",
+            modin_quotes, modin_trades, on="time", left_by="ticker", right_by="ticker2",
         )
     df_equals(
         pandas.merge_asof(
@@ -440,18 +436,10 @@ def test_merge_asof_merge_options():
     modin_trades["ticker"] = modin_trades["ticker2"]
     with warns_that_defaulting_to_pandas():
         modin_result = pd.merge_asof(
-            modin_quotes,
-            modin_trades,
-            on="time",
-            by="ticker",
+            modin_quotes, modin_trades, on="time", by="ticker",
         )
     df_equals(
-        pandas.merge_asof(
-            pandas_quotes,
-            pandas_trades,
-            on="time",
-            by="ticker",
-        ),
+        pandas.merge_asof(pandas_quotes, pandas_trades, on="time", by="ticker",),
         modin_result,
     )
@@ -478,19 +466,11 @@ def test_merge_asof_merge_options():
     # Direction
     with warns_that_defaulting_to_pandas():
         modin_result = pd.merge_asof(
-            modin_quotes,
-            modin_trades,
-            on="time",
-            by="ticker",
-            direction="forward",
+            modin_quotes, modin_trades, on="time", by="ticker", direction="forward",
         )
     df_equals(
         pandas.merge_asof(
-            pandas_quotes,
-            pandas_trades,
-            on="time",
-            by="ticker",
-            direction="forward",
+            pandas_quotes, pandas_trades, on="time", by="ticker", direction="forward",
         ),
         modin_result,
     )
@@ -744,11 +724,7 @@ def test_to_pandas_indices(data):
 def test_create_categorical_dataframe_with_duplicate_column_name():
     # This tests for https://github.com/modin-project/modin/issues/4312
     pd_df = pandas.DataFrame(
-        {
-            "a": pandas.Categorical([1, 2]),
-            "b": [4, 5],
-            "c": pandas.Categorical([7, 8]),
-        }
+        {"a": pandas.Categorical([1, 2]), "b": [4, 5], "c": pandas.Categorical([7, 8]),}
     )
     pd_df.columns = ["a", "b", "a"]
     md_df = pd.DataFrame(pd_df)
@@ -777,10 +753,7 @@ def test_create_categorical_dataframe_with_duplicate_column_name():
         (lambda df: df.mean(level=0), r"DataFrame\.mean"),
         (lambda df: df + df, r"DataFrame\.add"),
         (lambda df: df.index, r"DataFrame\.get_axis\(0\)"),
-        (
-            lambda df: df.drop(columns="col1").squeeze().repeat(2),
-            r"Series\.repeat",
-        ),
+        (lambda df: df.drop(columns="col1").squeeze().repeat(2), r"Series\.repeat",),
         (lambda df: df.groupby("col1").prod(), r"GroupBy\.prod"),
         (lambda df: df.rolling(1).count(), r"Rolling\.count"),
     ],
diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
index 5819aced627..78bc85e803e 100644
--- a/modin/pandas/test/test_groupby.py
+++ b/modin/pandas/test/test_groupby.py
@@ -124,7 +124,7 @@ def wrapper(obj1, obj2, *args, **kwargs):

 @pytest.mark.parametrize("as_index", [True, False])
 def test_mixed_dtypes_groupby(as_index):
-    frame_data = np.random.randint(97, 198, size=(2**6, 2**4))
+    frame_data = np.random.randint(97, 198, size=(2 ** 6, 2 ** 4))
     pandas_df = pandas.DataFrame(frame_data).add_prefix("col")
     # Convert every other column to string
     for col in pandas_df.iloc[
@@ -880,13 +880,13 @@ def test_simple_col_groupby():


 @pytest.mark.parametrize(
-    "by", [np.random.randint(0, 100, size=2**8), lambda x: x % 3, None]
+    "by", [np.random.randint(0, 100, size=2 ** 8), lambda x: x % 3, None]
 )
 @pytest.mark.parametrize("as_index_series_or_dataframe", [0, 1, 2])
 def test_series_groupby(by, as_index_series_or_dataframe):
     if as_index_series_or_dataframe <= 1:
         as_index = as_index_series_or_dataframe == 1
-        series_data = np.random.randint(97, 198, size=2**8)
+        series_data = np.random.randint(97, 198, size=2 ** 8)
         modin_series = pd.Series(series_data)
         pandas_series = pandas.Series(series_data)
     else:
@@ -1257,19 +1257,13 @@ def eval_groups(modin_groupby, pandas_groupby):
 @_copy_pandas_groupby_if_needed
 def eval_shift(modin_groupby, pandas_groupby):
     eval_general(
-        modin_groupby,
-        pandas_groupby,
-        lambda groupby: groupby.shift(),
+        modin_groupby, pandas_groupby, lambda groupby: groupby.shift(),
     )
     eval_general(
-        modin_groupby,
-        pandas_groupby,
-        lambda groupby: groupby.shift(periods=0),
+        modin_groupby, pandas_groupby, lambda groupby: groupby.shift(periods=0),
     )
     eval_general(
-        modin_groupby,
-        pandas_groupby,
-        lambda groupby: groupby.shift(periods=-3),
+        modin_groupby, pandas_groupby, lambda groupby: groupby.shift(periods=-3),
     )

     # Disabled for `BaseOnPython` because of the issue with `getitem_array`.
@@ -1298,7 +1292,7 @@ def eval_shift(modin_groupby, pandas_groupby):


 def test_groupby_on_index_values_with_loop():
-    length = 2**6
+    length = 2 ** 6
     data = {
         "a": np.random.randint(0, 100, size=length),
         "b": np.random.randint(0, 100, size=length),
@@ -1338,7 +1332,7 @@ def test_groupby_on_index_values_with_loop():
     ],
 )
 def test_groupby_multiindex(groupby_kwargs):
-    frame_data = np.random.randint(0, 100, size=(2**6, 2**4))
+    frame_data = np.random.randint(0, 100, size=(2 ** 6, 2 ** 4))
     modin_df = pd.DataFrame(frame_data)
     pandas_df = pandas.DataFrame(frame_data)
@@ -1952,8 +1946,7 @@ def test_not_str_by(by, as_index):
         # 0 and -1 are considered to be the indices of the columns to group on.
         pytest.param({1: "sum", 2: "nunique"}, id="dict_agg_no_intersection_with_by"),
         pytest.param(
-            {0: "mean", 1: "sum", 2: "nunique"},
-            id="dict_agg_has_intersection_with_by",
+            {0: "mean", 1: "sum", 2: "nunique"}, id="dict_agg_has_intersection_with_by",
         ),
         pytest.param(
             {1: "sum", 2: "nunique", -1: "nunique"},
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index e1bb7707184..63c54acd43d 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -260,13 +260,7 @@ def test_read_csv_delimiters(
     )
     @pytest.mark.parametrize("skip_blank_lines", [True, False])
     def test_read_csv_col_handling(
-        self,
-        header,
-        index_col,
-        prefix,
-        names,
-        usecols,
-        skip_blank_lines,
+        self, header, index_col, prefix, names, usecols, skip_blank_lines,
     ):
         if names is lib.no_default:
             pytest.skip("some parameter combinations fail: issue #2312")
@@ -309,11 +303,7 @@ def test_from_csv_with_callable_usecols(self, usecols):
     )
     @pytest.mark.parametrize("skipfooter", [0, 10])
     def test_read_csv_parsing_1(
-        self,
-        dtype,
-        engine,
-        converters,
-        skipfooter,
+        self, dtype, engine, converters, skipfooter,
     ):

         if dtype:
@@ -360,14 +350,7 @@ def test_read_csv_parsing_1(
     )
     @pytest.mark.parametrize("encoding", ["latin1", "windows-1251", None])
     def test_read_csv_parsing_2(
-        self,
-        make_csv_file,
-        request,
-        header,
-        skiprows,
-        nrows,
-        names,
-        encoding,
+        self, make_csv_file, request, header, skiprows, nrows, names, encoding,
     ):
         xfail_case = (
             StorageFormat.get() == "Hdk"
@@ -389,8 +372,7 @@ def test_read_csv_parsing_2(
         with ensure_clean(".csv") as unique_filename:
             if encoding:
                 make_csv_file(
-                    filename=unique_filename,
-                    encoding=encoding,
+                    filename=unique_filename, encoding=encoding,
                 )
             kwargs = {
                 "filepath_or_buffer": unique_filename
@@ -424,11 +406,7 @@ def test_read_csv_parsing_2(
     @pytest.mark.parametrize("skipfooter", [0, 10])
     @pytest.mark.parametrize("nrows", [35, None])
     def test_read_csv_parsing_3(
-        self,
-        true_values,
-        false_values,
-        skipfooter,
-        nrows,
+        self, true_values, false_values, skipfooter, nrows,
     ):
         xfail_case = (
             (false_values or true_values)
@@ -462,8 +440,7 @@ def test_read_csv_skipinitialspace(self):
         eval_io_from_str(str_initial_spaces, unique_filename, skipinitialspace=True)

     @pytest.mark.parametrize(
-        "test_case",
-        ["single_element", "single_column", "multiple_columns"],
+        "test_case", ["single_element", "single_column", "multiple_columns"],
     )
     def test_read_csv_squeeze(self, request, test_case):
         if request.config.getoption("--simulate-cloud").lower() != "off":
@@ -503,12 +480,7 @@ def test_read_csv_mangle_dupe_cols(self):
     @pytest.mark.parametrize("verbose", [True, False])
     @pytest.mark.parametrize("skip_blank_lines", [True, False])
     def test_read_csv_nans_handling(
-        self,
-        na_values,
-        keep_default_na,
-        na_filter,
-        verbose,
-        skip_blank_lines,
+        self, na_values, keep_default_na, na_filter, verbose, skip_blank_lines,
     ):
         eval_io(
             fn_name="read_csv",
@@ -692,13 +664,7 @@ def test_read_csv_encoding(self, make_csv_file, encoding):
     @pytest.mark.parametrize("escapechar", [None, "d", "x"])
     @pytest.mark.parametrize("dialect", ["test_csv_dialect", None])
     def test_read_csv_file_format(
-        self,
-        make_csv_file,
-        thousands,
-        decimal,
-        lineterminator,
-        escapechar,
-        dialect,
+        self, make_csv_file, thousands, decimal, lineterminator, escapechar, dialect,
     ):
         if Engine.get() != "Python" and lineterminator == "x":
             pytest.xfail("read_csv with Ray engine outputs empty frame - issue #2493")
@@ -753,12 +719,7 @@ def test_read_csv_file_format(
     @pytest.mark.parametrize("doublequote", [True, False])
    @pytest.mark.parametrize("comment", [None, "#", "x"])
     def test_read_csv_quoting(
-        self,
-        make_csv_file,
-        quoting,
-        quotechar,
-        doublequote,
-        comment,
+        self, make_csv_file, quoting, quotechar, doublequote, comment,
     ):
         # in these cases escapechar should be set, otherwise error occurs
         # _csv.Error: need to escape, but no escapechar set"
@@ -796,10 +757,7 @@ def test_read_csv_quoting(
         reason="In compat mode, some error handling tests are failing due to https://github.com/modin-project/modin/issues/2845",
     )
     def test_read_csv_error_handling(
-        self,
-        warn_bad_lines,
-        error_bad_lines,
-        on_bad_lines,
+        self, warn_bad_lines, error_bad_lines, on_bad_lines,
     ):
         # in that case exceptions are raised both by Modin and pandas
         # and tests pass
@@ -883,8 +841,7 @@ def test_read_csv_internal(
                 )
             else:
                 make_csv_file(
-                    filename=unique_filename,
-                    delimiter=delimiter,
+                    filename=unique_filename, delimiter=delimiter,
                 )

             eval_io(
@@ -1301,12 +1258,10 @@ def test_to_csv_with_index(self):
             ]
         ).transpose()
         modin_df = pd.DataFrame(
-            values,
-            columns=["key"] + ["avalue" + str(i) for i in range(1, 1 + cols)],
+            values, columns=["key"] + ["avalue" + str(i) for i in range(1, 1 + cols)],
         ).set_index("key")
         pandas_df = pandas.DataFrame(
-            values,
-            columns=["key"] + ["avalue" + str(i) for i in range(1, 1 + cols)],
+            values, columns=["key"] + ["avalue" + str(i) for i in range(1, 1 + cols)],
         ).set_index("key")
         eval_to_file(modin_df, pandas_df, "to_csv", "csv")
@@ -1837,24 +1792,21 @@ def test_excel_empty_line(self):
     def test_read_excel_empty_rows(self):
         # Test parsing empty rows in middle of excel dataframe as NaN values
         eval_io(
-            fn_name="read_excel",
-            io="modin/pandas/test/data/test_empty_rows.xlsx",
+            fn_name="read_excel", io="modin/pandas/test/data/test_empty_rows.xlsx",
         )

     @check_file_leaks
     def test_read_excel_border_rows(self):
         # Test parsing border rows as NaN values in excel dataframe
         eval_io(
-            fn_name="read_excel",
-            io="modin/pandas/test/data/test_border_rows.xlsx",
+            fn_name="read_excel", io="modin/pandas/test/data/test_border_rows.xlsx",
         )

     @check_file_leaks
     def test_read_excel_every_other_nan(self):
         # Test for reading excel dataframe with every other row as a NaN value
         eval_io(
-            fn_name="read_excel",
-            io="modin/pandas/test/data/every_other_row_nan.xlsx",
+            fn_name="read_excel", io="modin/pandas/test/data/every_other_row_nan.xlsx",
         )

     @pytest.mark.parametrize(
@@ -2086,16 +2038,13 @@ def test_read_sql_from_sql_server(self):
             "mssql+pymssql://sa:Strong.Pwd-123@0.0.0.0:1433/master"
         )
         pandas_df_to_read = pandas.DataFrame(
-            np.arange(
-                1000 * 256,
-            ).reshape(1000, 256)
+            np.arange(1000 * 256,).reshape(1000, 256)
         ).add_prefix("col")
         pandas_df_to_read.to_sql(
             table_name, sqlalchemy_connection_string, if_exists="replace"
         )
         modin_df = pd.read_sql(
-            query,
-            ModinDatabaseConnection("sqlalchemy", sqlalchemy_connection_string),
+            query, ModinDatabaseConnection("sqlalchemy", sqlalchemy_connection_string),
         )
         pandas_df = pandas.read_sql(query, sqlalchemy_connection_string)
         df_equals(modin_df, pandas_df)
@@ -2109,15 +2058,10 @@ def test_read_sql_from_postgres(self):
         query = f"SELECT * FROM {table_name}"
         connection = "postgresql://sa:Strong.Pwd-123@localhost:2345/postgres"
         pandas_df_to_read = pandas.DataFrame(
-            np.arange(
-                1000 * 256,
-            ).reshape(1000, 256)
+            np.arange(1000 * 256,).reshape(1000, 256)
         ).add_prefix("col")
         pandas_df_to_read.to_sql(table_name, connection, if_exists="replace")
-        modin_df = pd.read_sql(
-            query,
-            ModinDatabaseConnection("psycopg2", connection),
-        )
+        modin_df = pd.read_sql(query, ModinDatabaseConnection("psycopg2", connection),)
         pandas_df = pandas.read_sql(query, connection)

         df_equals(modin_df, pandas_df)
@@ -2464,8 +2408,7 @@ def test_read_feather_s3(self, storage_options):

     def test_read_feather_path_object(self, make_feather_file):
         eval_io(
-            fn_name="read_feather",
-            path=Path(make_feather_file()),
+            fn_name="read_feather", path=Path(make_feather_file()),
         )

     @pytest.mark.xfail(
diff --git a/modin/pandas/test/test_rolling.py b/modin/pandas/test/test_rolling.py
index 689720ce180..257669777b7 100644
--- a/modin/pandas/test/test_rolling.py
+++ b/modin/pandas/test/test_rolling.py
@@ -152,8 +152,7 @@ def test_dataframe_dt_index(axis, on, closed, window):
         df_equals(modin_rolled.count(), pandas_rolled.count())
         df_equals(modin_rolled.skew(), pandas_rolled.skew())
         df_equals(
-            modin_rolled.apply(np.sum, raw=True),
-            pandas_rolled.apply(np.sum, raw=True),
+            modin_rolled.apply(np.sum, raw=True), pandas_rolled.apply(np.sum, raw=True),
         )
         df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum))
         df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1))
@@ -168,16 +167,10 @@ def test_series(data, window, min_periods, win_type):
     if window > len(pandas_series):
         window = len(pandas_series)
     pandas_rolled = pandas_series.rolling(
-        window=window,
-        min_periods=min_periods,
-        win_type=win_type,
-        center=True,
+        window=window, min_periods=min_periods, win_type=win_type, center=True,
     )
     modin_rolled = modin_series.rolling(
-        window=window,
-        min_periods=min_periods,
-        win_type=win_type,
-        center=True,
+        window=window, min_periods=min_periods, win_type=win_type, center=True,
     )
     # Testing of Window class
     if win_type is not None:
@@ -196,8 +189,7 @@ def test_series(data, window, min_periods, win_type):
         df_equals(modin_rolled.min(), pandas_rolled.min())
         df_equals(modin_rolled.max(), pandas_rolled.max())
         df_equals(
-            modin_rolled.corr(modin_series),
-            pandas_rolled.corr(pandas_series),
+            modin_rolled.corr(modin_series), pandas_rolled.corr(pandas_series),
         )
         df_equals(
             modin_rolled.cov(modin_series, True), pandas_rolled.cov(pandas_series, True)
@@ -211,8 +203,7 @@ def test_series(data, window, min_periods, win_type):
         df_equals(modin_rolled.apply(np.sum), pandas_rolled.apply(np.sum))
         df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum))
         df_equals(
-            modin_rolled.agg([np.sum, np.mean]),
-            pandas_rolled.agg([np.sum, np.mean]),
+            modin_rolled.agg([np.sum, np.mean]), pandas_rolled.agg([np.sum, np.mean]),
         )
         df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1))
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 339ffe788e0..4c7dcd5bf28 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -502,9 +502,7 @@ def test___pow__(data):
     "dt_index", [True, False], ids=["dt_index_true", "dt_index_false"]
 )
 @pytest.mark.parametrize(
-    "data",
-    [*test_data_values, "empty"],
-    ids=[*test_data_keys, "empty"],
+    "data", [*test_data_values, "empty"], ids=[*test_data_keys, "empty"],
 )
 def test___repr__(name, dt_index, data):
     if data == "empty":
@@ -645,8 +643,7 @@ def test_agg(data, func):
             "Older pandas raises TypeError but Modin conforms to AssertionError"
         )
     eval_general(
-        *create_test_series(data),
-        lambda df: df.agg(func),
+        *create_test_series(data), lambda df: df.agg(func),
     )
@@ -657,8 +654,7 @@ def test_agg_except(data, func):
     # See details in pandas issue 36036.
     with pytest.raises(SpecificationError):
         eval_general(
-            *create_test_series(data),
-            lambda df: df.agg(func),
+            *create_test_series(data), lambda df: df.agg(func),
         )
@@ -670,8 +666,7 @@ def test_agg_numeric(request, data, func):
     ):
         axis = 0
         eval_general(
-            *create_test_series(data),
-            lambda df: df.agg(func, axis),
+            *create_test_series(data), lambda df: df.agg(func, axis),
         )
@@ -686,8 +681,7 @@ def test_agg_numeric_except(request, data, func):
     # See details in pandas issue 36036.
     with pytest.raises(SpecificationError):
         eval_general(
-            *create_test_series(data),
-            lambda df: df.agg(func, axis),
+            *create_test_series(data), lambda df: df.agg(func, axis),
         )
@@ -703,8 +697,7 @@ def test_aggregate(data, func):
     )
     axis = 0
     eval_general(
-        *create_test_series(data),
-        lambda df: df.aggregate(func, axis),
+        *create_test_series(data), lambda df: df.aggregate(func, axis),
     )
@@ -716,8 +709,7 @@ def test_aggregate_except(data, func):
     # See details in pandas issues 36036.
     with pytest.raises(SpecificationError):
         eval_general(
-            *create_test_series(data),
-            lambda df: df.aggregate(func, axis),
+            *create_test_series(data), lambda df: df.aggregate(func, axis),
         )
@@ -729,8 +721,7 @@ def test_aggregate_numeric(request, data, func):
     ):
         axis = 0
         eval_general(
-            *create_test_series(data),
-            lambda df: df.agg(func, axis),
+            *create_test_series(data), lambda df: df.agg(func, axis),
         )
@@ -745,8 +736,7 @@ def test_aggregate_numeric_except(request, data, func):
     # See details in pandas issues 36036.
     with pytest.raises(SpecificationError):
         eval_general(
-            *create_test_series(data),
-            lambda df: df.agg(func, axis),
+            *create_test_series(data), lambda df: df.agg(func, axis),
         )
@@ -870,8 +860,7 @@ def test_append(data):
 @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys)
 def test_apply(data, func):
     eval_general(
-        *create_test_series(data),
-        lambda df: df.apply(func),
+        *create_test_series(data), lambda df: df.apply(func),
     )
@@ -882,8 +871,7 @@ def test_apply_except(data, func):
     # See details in pandas issues 36036.
     with pytest.raises(SpecificationError):
         eval_general(
-            *create_test_series(data),
-            lambda df: df.apply(func),
+            *create_test_series(data), lambda df: df.apply(func),
        )
@@ -912,8 +900,7 @@ def test_apply_external_lib():
 def test_apply_numeric(request, data, func):
     if name_contains(request.node.name, numeric_dfs):
         eval_general(
-            *create_test_series(data),
-            lambda df: df.apply(func),
+            *create_test_series(data), lambda df: df.apply(func),
         )
@@ -925,8 +912,7 @@ def test_apply_numeric_except(request, data, func):
     # See details in pandas issues 36036.
     with pytest.raises(SpecificationError):
         eval_general(
-            *create_test_series(data),
-            lambda df: df.apply(func),
+            *create_test_series(data), lambda df: df.apply(func),
         )
@@ -1018,8 +1004,7 @@ def test_asof(where):


 @pytest.mark.parametrize(
-    "where",
-    [20, 30, [10.5, 40.5], [10], pandas.Index([20, 30]), pandas.Index([10.5])],
+    "where", [20, 30, [10.5, 40.5], [10], pandas.Index([20, 30]), pandas.Index([10.5])],
 )
 def test_asof_large(where):
     values = test_data["float_nan_data"]["col1"]
@@ -1192,9 +1177,7 @@ def test_bool(data):
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 @pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"])
 def test_clip_scalar(request, data, bound_type):
-    modin_series, pandas_series = create_test_series(
-        data,
-    )
+    modin_series, pandas_series = create_test_series(data,)

     if name_contains(request.node.name, numeric_dfs):
         # set bounds
@@ -1214,9 +1197,7 @@ def test_clip_scalar(request, data, bound_type):
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 @pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"])
 def test_clip_sequence(request, data, bound_type):
-    modin_series, pandas_series = create_test_series(
-        data,
-    )
+    modin_series, pandas_series = create_test_series(data,)

     if name_contains(request.node.name, numeric_dfs):
         lower = random_state.random_integers(RAND_LOW, RAND_HIGH, len(pandas_series))
@@ -1690,8 +1671,7 @@ def test_dt(timezone):
         modin_series.dt.to_pydatetime(), pandas_series.dt.to_pydatetime()
     )
     df_equals(
-        modin_series.dt.tz_localize(None),
-        pandas_series.dt.tz_localize(None),
+        modin_series.dt.tz_localize(None), pandas_series.dt.tz_localize(None),
     )
     if timezone:
         df_equals(
@@ -2121,9 +2101,7 @@ def test_kurtosis_level(level):
     pandas_s.columns = index

     eval_general(
-        modin_s,
-        pandas_s,
-        lambda s: s.kurtosis(axis=1, level=level),
+        modin_s, pandas_s, lambda s: s.kurtosis(axis=1, level=level),
     )
@@ -2149,8 +2127,7 @@ def test_index_order(func):
         s_pandas.index = index

     df_equals(
-        getattr(s_modin, func)(level=0).index,
-        getattr(s_pandas, func)(level=0).index,
+        getattr(s_modin, func)(level=0).index, getattr(s_pandas, func)(level=0).index,
     )
@@ -2733,9 +2710,7 @@ def test_repeat(data, repeats):
 )
 def test_repeat_lists(data, repeats):
     eval_general(
-        pd.Series(data),
-        pandas.Series(data),
-        lambda df: df.repeat(repeats),
+        pd.Series(data), pandas.Series(data), lambda df: df.repeat(repeats),
     )
@@ -2800,12 +2775,10 @@ def test_resample(closed, label, level):
         pandas_resampler.transform(lambda x: (x - x.mean()) / x.std()),
     )
     df_equals(
-        modin_resampler.aggregate("max"),
-        pandas_resampler.aggregate("max"),
+        modin_resampler.aggregate("max"), pandas_resampler.aggregate("max"),
     )
     df_equals(
-        modin_resampler.apply("sum"),
-        pandas_resampler.apply("sum"),
+        modin_resampler.apply("sum"), pandas_resampler.apply("sum"),
     )
     df_equals(
         modin_resampler.get_group(name=list(modin_resampler.groups)[0]),
@@ -2817,8 +2790,7 @@ def test_resample(closed, label, level):
     # Upsampling from level= or on= selection is not supported
     if level is None:
         df_equals(
-            modin_resampler.interpolate(),
-            pandas_resampler.interpolate(),
+            modin_resampler.interpolate(), pandas_resampler.interpolate(),
         )
         df_equals(modin_resampler.asfreq(), pandas_resampler.asfreq())
         df_equals(
@@ -3018,8 +2990,7 @@ def test_sem_float_nan_only(skipna, ddof):
 @pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
 def test_sem_int_only(ddof):
     eval_general(
-        *create_test_series(test_data["int_data"]),
-        lambda df: df.sem(ddof=ddof),
+        *create_test_series(test_data["int_data"]), lambda df: df.sem(ddof=ddof),
     )
@@ -3065,8 +3036,7 @@ def test_shift_slice_shift(data, index, periods):
     ]

     df_equals(
-        modin_series.shift(periods=periods),
-        pandas_series.shift(periods=periods),
+        modin_series.shift(periods=periods), pandas_series.shift(periods=periods),
     )
     df_equals(
         modin_series.shift(periods=periods, fill_value=777),
@@ -3097,9 +3067,7 @@ def test_sort_index(data, ascending, sort_remaining, na_position):
         modin_series,
         pandas_series,
         lambda df: df.sort_index(
-            ascending=ascending,
-            sort_remaining=sort_remaining,
-            na_position=na_position,
+            ascending=ascending, sort_remaining=sort_remaining, na_position=na_position,
         ),
     )
@@ -3335,8 +3303,7 @@ def test_series_empty_values():
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_to_string(request, data):
     eval_general(
-        *create_test_series(data),
-        lambda df: df.to_string(),
+        *create_test_series(data), lambda df: df.to_string(),
     )
@@ -3365,8 +3332,7 @@ def test_tolist(data):
 @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys)
 def test_transform(data, func):
     eval_general(
-        *create_test_series(data),
-        lambda df: df.transform(func),
+        *create_test_series(data), lambda df: df.transform(func),
     )
@@ -3374,8 +3340,7 @@ def test_transform(data, func):
 @pytest.mark.parametrize("func", agg_func_except_values, ids=agg_func_except_keys)
 def test_transform_except(data, func):
     eval_general(
-        *create_test_series(data),
-        lambda df: df.transform(func),
+        *create_test_series(data), lambda df: df.transform(func),
     )
@@ -3458,8 +3423,7 @@ def test_tz_localize():
         pandas_series.tz_localize("America/Los_Angeles"),
     )
     df_equals(
-        modin_series.tz_localize("UTC"),
-        pandas_series.tz_localize("UTC"),
+        modin_series.tz_localize("UTC"), pandas_series.tz_localize("UTC"),
     )
@@ -3565,7 +3529,7 @@ def sort_sensitive_comparator(df1, df2):
     )

     # from issue #2365
-    arr = np.random.rand(2**6)
+    arr = np.random.rand(2 ** 6)
     arr[::10] = np.nan
     eval_general(
         *create_test_series(arr),
@@ -3591,8 +3555,7 @@ def test_value_counts_categorical():
     random_state.shuffle(data)

     eval_general(
-        *create_test_series(data, dtype="category"),
-        lambda df: df.value_counts(),
+        *create_test_series(data, dtype="category"), lambda df: df.value_counts(),
     )
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
index 21565286309..1bc1d926f3c 100644
--- a/modin/pandas/test/utils.py
+++ b/modin/pandas/test/utils.py
@@ -42,9 +42,9 @@
 random_state = np.random.RandomState(seed=42)

 DATASET_SIZE_DICT = {
-    "Small": (2**2, 2**3),
-    "Normal": (2**6, 2**8),
-    "Big": (2**7, 2**12),
+    "Small": (2 ** 2, 2 ** 3),
+    "Normal": (2 ** 6, 2 ** 8),
+    "Big": (2 ** 7, 2 ** 12),
 }

 # Size of test dataframes
@@ -852,9 +852,7 @@ def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs):
             f.write(csv_str)

         eval_io(
-            filepath_or_buffer=unique_filename,
-            fn_name="read_csv",
-            **kwargs,
+            filepath_or_buffer=unique_filename, fn_name="read_csv", **kwargs,
         )

     finally:
diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py
index 7564732c9bd..5d5de728041 100644
--- a/modin/pandas/utils.py
+++ b/modin/pandas/utils.py
@@ -255,11 +255,7 @@ def check_both_not_none(option1, option2):


 def broadcast_item(
-    obj,
-    row_lookup,
-    col_lookup,
-    item,
-    need_columns_reindex=True,
+    obj, row_lookup, col_lookup, item, need_columns_reindex=True,
 ):
     """
     Use NumPy to broadcast or reshape item with reindexing.
diff --git a/modin/pandas/window.py b/modin/pandas/window.py
index fa7922df5c8..f14ca384935 100644
--- a/modin/pandas/window.py
+++ b/modin/pandas/window.py
@@ -192,20 +192,13 @@ def apply(
         )

     def aggregate(
-        self,
-        func,
-        *args,
-        **kwargs,
+        self, func, *args, **kwargs,
     ):
         from .dataframe import DataFrame

         dataframe = DataFrame(
             query_compiler=self._query_compiler.rolling_aggregate(
-                self.axis,
-                self.rolling_args,
-                func,
-                *args,
-                **kwargs,
+                self.axis, self.rolling_args, func, *args, **kwargs,
             )
         )
         if isinstance(self._dataframe, DataFrame):
diff --git a/modin/test/interchange/dataframe_protocol/base/test_utils.py b/modin/test/interchange/dataframe_protocol/base/test_utils.py
index 5fdc803e331..a5de4f5edd3 100644
--- a/modin/test/interchange/dataframe_protocol/base/test_utils.py
+++ b/modin/test/interchange/dataframe_protocol/base/test_utils.py
@@ -42,10 +42,7 @@
         (np.dtype("float32"), "f"),
         (np.dtype("float64"), "g"),
         (pandas.Series(["a"]).dtype, "u"),
-        (
-            pandas.Series([0]).astype("datetime64[ns]").dtype,
-            "tsn:",
-        ),
+        (pandas.Series([0]).astype("datetime64[ns]").dtype, "tsn:",),
     ],
 )
 def test_dtype_to_arrow_c(pandas_dtype, c_string):  # noqa PR01
diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py
index 92c91027cdd..43718943efe 100644
--- a/modin/test/storage_formats/pandas/test_internals.py
+++ b/modin/test/storage_formats/pandas/test_internals.py
@@ -166,16 +166,10 @@ def func_to_apply(partition, row_internal_indices, col_internal_indices, item):
 )
 @pytest.mark.parametrize(
     "test_type",
-    [
-        "many_small_dfs",
-        "concatted_df_with_small_dfs",
-        "large_df_plus_small_dfs",
-    ],
+    ["many_small_dfs", "concatted_df_with_small_dfs", "large_df_plus_small_dfs",],
 )
 @pytest.mark.parametrize(
-    "set_num_partitions",
-    [1, 4],
-    indirect=True,
+    "set_num_partitions", [1, 4], indirect=True,
 )
 def test_rebalance_partitions(test_type, set_num_partitions):
     num_partitions = NPartitions.get()
@@ -300,9 +294,7 @@ class TestDrainVirtualPartitionCallQueue:
     """

     def test_from_virtual_partitions_with_call_queues(
-        self,
-        axis,
-        virtual_partition_class,
+        self, axis, virtual_partition_class,
     ):
         # reverse the dataframe along the virtual partition axis.
def reverse(df): @@ -333,8 +325,7 @@ def reverse(df): else: expected_df = pandas.DataFrame([[1, 0, 3, 2]], columns=[0, 0, 0, 0]) df_equals( - level_two_virtual.to_pandas(), - expected_df, + level_two_virtual.to_pandas(), expected_df, ) def test_from_block_and_virtual_partition_with_call_queues( diff --git a/modin/test/test_envvar_npartitions.py b/modin/test/test_envvar_npartitions.py index fa23bbef957..e2d0db8570a 100644 --- a/modin/test/test_envvar_npartitions.py +++ b/modin/test/test_envvar_npartitions.py @@ -21,7 +21,7 @@ @pytest.mark.parametrize("num_partitions", [2, 4, 6, 8, 10]) def test_set_npartitions(num_partitions): NPartitions.put(num_partitions) - data = np.random.randint(0, 100, size=(2**16, 2**8)) + data = np.random.randint(0, 100, size=(2 ** 16, 2 ** 8)) df = pd.DataFrame(data) part_shape = df._query_compiler._modin_frame._partitions.shape assert part_shape[0] == num_partitions and part_shape[1] == min(num_partitions, 8) @@ -31,7 +31,7 @@ def test_set_npartitions(num_partitions): @pytest.mark.parametrize("right_num_partitions", [2, 4, 6, 8, 10]) def test_runtime_change_npartitions(left_num_partitions, right_num_partitions): NPartitions.put(left_num_partitions) - data = np.random.randint(0, 100, size=(2**16, 2**8)) + data = np.random.randint(0, 100, size=(2 ** 16, 2 ** 8)) left_df = pd.DataFrame(data) part_shape = left_df._query_compiler._modin_frame._partitions.shape assert part_shape[0] == left_num_partitions and part_shape[1] == min( diff --git a/modin/test/test_partition_api.py b/modin/test/test_partition_api.py index 5f4d8a44bf6..b85d5309c8c 100644 --- a/modin/test/test_partition_api.py +++ b/modin/test/test_partition_api.py @@ -84,10 +84,8 @@ def get_df(lib, data): get_func(actual_partitions[row_idx][col_idx]), ) else: - expected_axis_partitions = ( - expected_df._query_compiler._modin_frame._partition_mgr_cls.axis_partition( - expected_partitions, axis ^ 1 - ) + expected_axis_partitions = expected_df._query_compiler._modin_frame._partition_mgr_cls.axis_partition( + expected_partitions, axis ^ 1 ) expected_axis_partitions = [ axis_partition.force_materialization().unwrap(squeeze=True) diff --git a/modin/test/test_utils.py b/modin/test/test_utils.py index 98c2439e208..01eb6673163 100644 --- a/modin/test/test_utils.py +++ b/modin/test/test_utils.py @@ -138,11 +138,7 @@ class Child(Parent): @pytest.mark.parametrize( "source_doc,to_append,expected", [ - ( - "One-line doc.", - "One-line message.", - "One-line doc.One-line message.", - ), + ("One-line doc.", "One-line message.", "One-line doc.One-line message.",), ( """ Regular doc-string diff --git a/scripts/doc_checker.py b/scripts/doc_checker.py index 5a525637136..66f2955a11a 100644 --- a/scripts/doc_checker.py +++ b/scripts/doc_checker.py @@ -128,8 +128,7 @@ def check_optional_args(doc: Docstring) -> list: ( "MD01", MODIN_ERROR_CODES["MD01"].format( - parameter=parameter, - found=type_line, + parameter=parameter, found=type_line, ), ) )