
Commit b22b93d

jeffreykennethli and devin-petersohn authored
FIX-#4464: Refactor Ray utils and quick fix groupby.count failing on virtual partitions (#4490)
Co-authored-by: Devin Petersohn <[email protected]>
Signed-off-by: jeffreykennethli <[email protected]>
1 parent dcee13d commit b22b93d

7 files changed: +148 additions, −60 deletions


docs/release_notes/release_notes-0.15.0.rst

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ Key Features and Updates
   * FIX-#4481: Allow clipping with a Modin Series of bounds (#4486)
   * FIX-#4504: Support na_action in applymap (#4505)
   * FIX-#4503: Stop the memory logging thread after session exit (#4515)
+  * FIX-#4464: Refactor Ray utils and quick fix groupby.count failing on virtual partitions (#4490)
 * Performance enhancements
   * FEAT-#4320: Add connectorx as an alternative engine for read_sql (#4346)
   * PERF-#4493: Use partition size caches more in Modin dataframe (#4495)

modin/core/dataframe/pandas/partitioning/axis_partition.py

Lines changed: 24 additions & 14 deletions
@@ -83,7 +83,6 @@ def apply(
                     num_splits,
                     len(self.list_of_blocks),
                     other_shape,
-                    kwargs,
                     *tuple(
                         self.list_of_blocks
                         + [
@@ -92,11 +91,12 @@ def apply(
                             for part in axis_partition.list_of_blocks
                         ]
                     ),
+                    **kwargs,
                 )
             )
-        args = [self.axis, func, num_splits, kwargs, maintain_partitioning]
+        args = [self.axis, func, num_splits, maintain_partitioning]
         args.extend(self.list_of_blocks)
-        return self._wrap_partitions(self.deploy_axis_func(*args))
+        return self._wrap_partitions(self.deploy_axis_func(*args, **kwargs))

     def shuffle(self, func, lengths, **kwargs):
         """
@@ -120,13 +120,13 @@ def shuffle(self, func, lengths, **kwargs):
         # We add these to kwargs and will pop them off before performing the operation.
         kwargs["manual_partition"] = True
         kwargs["_lengths"] = lengths
-        args = [self.axis, func, num_splits, kwargs, False]
+        args = [self.axis, func, num_splits, False]
         args.extend(self.list_of_blocks)
-        return self._wrap_partitions(self.deploy_axis_func(*args))
+        return self._wrap_partitions(self.deploy_axis_func(*args, **kwargs))

     @classmethod
     def deploy_axis_func(
-        cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions
+        cls, axis, func, num_splits, maintain_partitioning, *partitions, **kwargs
     ):
         """
         Deploy a function along a full axis.
@@ -139,13 +139,13 @@ def deploy_axis_func(
             The function to perform.
         num_splits : int
             The number of splits to return (see `split_result_of_axis_func_pandas`).
-        kwargs : dict
-            Additional keywords arguments to be passed in `func`.
         maintain_partitioning : bool
             If True, keep the old partitioning if possible.
             If False, create a new partition layout.
         *partitions : iterable
             All partitions that make up the full axis (row or column).
+        **kwargs : dict
+            Additional keywords arguments to be passed in `func`.

         Returns
         -------
@@ -157,7 +157,9 @@ def deploy_axis_func(
         lengths = kwargs.pop("_lengths", None)

         dataframe = pandas.concat(list(partitions), axis=axis, copy=False)
-        result = func(dataframe, **kwargs)
+        # To not mix the args for deploy_axis_func and args for func, we fold
+        # args into kwargs. This is a bit of a hack, but it works.
+        result = func(dataframe, *kwargs.pop("args", ()), **kwargs)

         if manual_partition:
             # The split function is expecting a list
@@ -180,7 +182,14 @@ def deploy_axis_func(

     @classmethod
     def deploy_func_between_two_axis_partitions(
-        cls, axis, func, num_splits, len_of_left, other_shape, kwargs, *partitions
+        cls,
+        axis,
+        func,
+        num_splits,
+        len_of_left,
+        other_shape,
+        *partitions,
+        **kwargs,
     ):
         """
         Deploy a function along a full axis between two data sets.
@@ -198,10 +207,10 @@ def deploy_func_between_two_axis_partitions(
         other_shape : np.ndarray
             The shape of right frame in terms of partitions, i.e.
             (other_shape[i-1], other_shape[i]) will indicate slice to restore i-1 axis partition.
-        kwargs : dict
-            Additional keywords arguments to be passed in `func`.
         *partitions : iterable
             All partitions that make up the full axis (row or column) for both data sets.
+        **kwargs : dict
+            Additional keywords arguments to be passed in `func`.

         Returns
         -------
@@ -222,6 +231,7 @@ def deploy_func_between_two_axis_partitions(
             for i in range(1, len(other_shape))
         ]
         rt_frame = pandas.concat(combined_axis, axis=axis ^ 1, copy=False)
-
-        result = func(lt_frame, rt_frame, **kwargs)
+        # To not mix the args for deploy_func_between_two_axis_partitions and args
+        # for func, we fold args into kwargs. This is a bit of a hack, but it works.
+        result = func(lt_frame, rt_frame, *kwargs.pop("args", ()), **kwargs)
         return split_result_of_axis_func_pandas(axis, num_splits, result)
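
The signature changes above thread `func`'s keyword arguments through `deploy_axis_func` and `deploy_func_between_two_axis_partitions` as real `**kwargs`, and fold any extra positional arguments for `func` under the reserved "args" key so they cannot collide with the deploy functions' own parameters. A minimal, self-contained sketch of that folding pattern (illustrative names, not the actual Modin classes):

import pandas

def deploy_axis_func_sketch(axis, func, *partitions, **kwargs):
    # Concatenate the block partitions into one frame along the given axis.
    frame = pandas.concat(list(partitions), axis=axis, copy=False)
    # Positional arguments meant for `func` travel under kwargs["args"], so
    # they are never confused with this function's own positional parameters.
    return func(frame, *kwargs.pop("args", ()), **kwargs)

p1 = pandas.DataFrame({"a": [1, None], "b": [3, 4]})
p2 = pandas.DataFrame({"a": [5, 6], "b": [None, 8]})
# Keyword arguments and folded positional arguments both reach `func`.
print(deploy_axis_func_sketch(0, lambda df: df.count(), p1, p2))
print(deploy_axis_func_sketch(0, lambda df, col: df[col].sum(), p1, p2, args=("b",)))

In the real methods the same kwargs dictionary also carries operation flags such as `manual_partition` and `_lengths` (set in `shuffle`), which are popped off before `func` is called.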

modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py

Lines changed: 18 additions & 10 deletions
@@ -55,7 +55,13 @@ def __init__(self, list_of_blocks, get_ip=False, full_axis=True):

     @classmethod
     def deploy_axis_func(
-        cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions
+        cls,
+        axis,
+        func,
+        num_splits,
+        maintain_partitioning,
+        *partitions,
+        **kwargs,
     ):
         """
         Deploy a function along a full axis.
@@ -68,13 +74,13 @@ def deploy_axis_func(
             The function to perform.
         num_splits : int
             The number of splits to return (see `split_result_of_axis_func_pandas`).
-        kwargs : dict
-            Additional keywords arguments to be passed in `func`.
         maintain_partitioning : bool
             If True, keep the old partitioning if possible.
             If False, create a new partition layout.
         *partitions : iterable
             All partitions that make up the full axis (row or column).
+        **kwargs : dict
+            Additional keywords arguments to be passed in `func`.

         Returns
         -------
@@ -89,16 +95,16 @@ def deploy_axis_func(
             axis,
             func,
             num_splits,
-            kwargs,
             maintain_partitioning,
             *partitions,
             num_returns=result_num_splits * 4,
             pure=False,
+            **kwargs,
         )

     @classmethod
     def deploy_func_between_two_axis_partitions(
-        cls, axis, func, num_splits, len_of_left, other_shape, kwargs, *partitions
+        cls, axis, func, num_splits, len_of_left, other_shape, *partitions, **kwargs
     ):
         """
         Deploy a function along a full axis between two data sets.
@@ -116,10 +122,10 @@ def deploy_func_between_two_axis_partitions(
         other_shape : np.ndarray
             The shape of right frame in terms of partitions, i.e.
             (other_shape[i-1], other_shape[i]) will indicate slice to restore i-1 axis partition.
-        kwargs : dict
-            Additional keywords arguments to be passed in `func`.
         *partitions : iterable
             All partitions that make up the full axis (row or column) for both data sets.
+        **kwargs : dict
+            Additional keywords arguments to be passed in `func`.

         Returns
         -------
@@ -134,10 +140,10 @@ def deploy_func_between_two_axis_partitions(
             num_splits,
             len_of_left,
             other_shape,
-            kwargs,
             *partitions,
             num_returns=num_splits * 4,
             pure=False,
+            **kwargs,
         )

     def _wrap_partitions(self, partitions):
@@ -200,7 +206,7 @@ class PandasOnDaskDataframeRowPartition(PandasOnDaskDataframeAxisPartition):
     axis = 1


-def deploy_dask_func(func, *args):
+def deploy_dask_func(func, *args, **kwargs):
     """
     Execute a function on an axis partition in a worker process.

@@ -210,13 +216,15 @@ def deploy_dask_func(func, *args):
         Function to be executed on an axis partition.
     *args : iterable
         Additional arguments that need to passed in ``func``.
+    **kwargs : dict
+        Additional keyword arguments to be passed in `func`.

     Returns
     -------
     list
         The result of the function ``func`` and metadata for it.
     """
-    result = func(*args)
+    result = func(*args, **kwargs)
     ip = get_ip()
     if isinstance(result, pandas.DataFrame):
         return result, len(result), len(result.columns), ip
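
The Dask implementation follows the same convention: `func`'s keyword arguments ride along as keyword arguments of the deploy call (next to Dask-specific ones such as `num_returns` and `pure`), and `deploy_dask_func` finally forwards them into `func` inside the worker. A hedged, pure-Python sketch of that last step, with no Dask cluster involved and illustrative data:

import pandas

def deploy_dask_func_sketch(func, *args, **kwargs):
    # Mirrors the new signature: keyword arguments now reach `func` instead of
    # being dropped on the way to the worker.
    result = func(*args, **kwargs)
    if isinstance(result, pandas.DataFrame):
        return result, len(result), len(result.columns)
    return result

df = pandas.DataFrame({"a": [1.0, None, 3.0]})
# A keyword argument such as skipna is forwarded to the applied function.
print(deploy_dask_func_sketch(pandas.DataFrame.sum, df, skipna=False))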

modin/core/execution/ray/common/utils.py

Lines changed: 35 additions & 0 deletions
@@ -16,6 +16,7 @@
 import os
 import sys
 import psutil
+from packaging import version
 import warnings

 import ray
@@ -32,6 +33,12 @@
     ValueSource,
 )

+ObjectIDType = ray.ObjectRef
+if version.parse(ray.__version__) >= version.parse("1.2.0"):
+    from ray.util.client.common import ClientObjectRef
+
+    ObjectIDType = (ray.ObjectRef, ClientObjectRef)
+

 def _move_stdlib_ahead_of_site_packages(*args):
     """
@@ -223,3 +230,31 @@ def initialize_ray(
             NPartitions._put(num_gpus)
         else:
             NPartitions._put(num_cpus)
+
+
+def deserialize(obj):
+    """
+    Deserialize a Ray object.
+
+    Parameters
+    ----------
+    obj : ObjectIDType, iterable of ObjectIDType, or mapping of keys to ObjectIDTypes
+        Object(s) to deserialize.
+
+    Returns
+    -------
+    obj
+        The deserialized object.
+    """
+    if isinstance(obj, ObjectIDType):
+        return ray.get(obj)
+    elif isinstance(obj, (tuple, list)) and any(
+        isinstance(o, ObjectIDType) for o in obj
+    ):
+        return ray.get(list(obj))
+    elif isinstance(obj, dict) and any(
+        isinstance(val, ObjectIDType) for val in obj.values()
+    ):
+        return dict(zip(obj.keys(), ray.get(list(obj.values()))))
+    else:
+        return obj
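
A hedged usage sketch of the new `deserialize` helper and the `ObjectIDType` alias it relies on (assumes a local Ray session; the commented outputs are illustrative). Because the list and dict branches call `ray.get` on the whole collection, the example keeps every entry an object reference:

import ray
from modin.core.execution.ray.common.utils import deserialize, ObjectIDType

ray.init(ignore_reinit_error=True)

ref_a, ref_b = ray.put(1), ray.put(2)
assert isinstance(ref_a, ObjectIDType)  # also covers ClientObjectRef on Ray >= 1.2.0

print(deserialize(ref_a))                     # 1
print(deserialize([ref_a, ref_b]))            # [1, 2]
print(deserialize({"a": ref_a, "b": ref_b}))  # {'a': 1, 'b': 2}
print(deserialize("plain value"))             # non-Ray objects come back unchanged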

modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py

Lines changed: 1 addition & 22 deletions
@@ -15,19 +15,13 @@

 import ray
 from ray.util import get_node_ip_address
-from packaging import version
 import uuid
+from modin.core.execution.ray.common.utils import deserialize, ObjectIDType

 from modin.core.dataframe.pandas.partitioning.partition import PandasDataframePartition
 from modin.pandas.indexing import compute_sliced_len
 from modin.logging import get_logger

-ObjectIDType = ray.ObjectRef
-if version.parse(ray.__version__) >= version.parse("1.2.0"):
-    from ray.util.client.common import ClientObjectRef
-
-    ObjectIDType = (ray.ObjectRef, ClientObjectRef)
-
 compute_sliced_len = ray.remote(compute_sliced_len)


@@ -419,21 +413,6 @@ def _apply_list_of_funcs(funcs, partition):  # pragma: no cover
     str
         The node IP address of the worker process.
     """
-
-    def deserialize(obj):
-        if isinstance(obj, ObjectIDType):
-            return ray.get(obj)
-        elif isinstance(obj, (tuple, list)) and any(
-            isinstance(o, ObjectIDType) for o in obj
-        ):
-            return ray.get(list(obj))
-        elif isinstance(obj, dict) and any(
-            isinstance(val, ObjectIDType) for val in obj.values()
-        ):
-            return dict(zip(obj.keys(), ray.get(list(obj.values()))))
-        else:
-            return obj
-
     for func, args, kwargs in funcs:
         func = deserialize(func)
         args = deserialize(args)
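
With the helper promoted to `modin.core.execution.ray.common.utils`, worker-side code such as `_apply_list_of_funcs` imports it instead of redefining it locally. A simplified sketch of that deserialization loop (illustrative calls, assumes a local Ray session), where any piece of a queued (func, args, kwargs) triple may arrive as an object reference:

import pandas
import ray
from modin.core.execution.ray.common.utils import deserialize

ray.init(ignore_reinit_error=True)

partition = pandas.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
queued_calls = [
    (lambda df, n: df.head(n), ray.put((2,)), {}),
    (lambda df, **kw: df.sum(**kw), (), ray.put({"skipna": True})),
]
for func, args, kwargs in queued_calls:
    # Materialize whichever pieces are Ray object references.
    func = deserialize(func)
    args = deserialize(args)
    kwargs = deserialize(kwargs)
    partition = func(partition, *args, **kwargs)
print(partition)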
