Commit 77373af

Optimize GroupBy's aggregation algorithm (#2696)
1 parent df1492c commit 77373af

3 files changed: +50, -14 lines

mars/dataframe/groupby/aggregation.py

Lines changed: 23 additions & 10 deletions
@@ -20,6 +20,7 @@

 import numpy as np
 import pandas as pd
+from scipy.stats import variation

 from ... import opcodes as OperandDef
 from ...config import options

@@ -59,12 +60,12 @@

 class SizeRecorder:
     def __init__(self):
-        self._raw_records = 0
-        self._agg_records = 0
+        self._raw_records = []
+        self._agg_records = []

-    def record(self, raw_records: int, agg_records: int):
-        self._raw_records += raw_records
-        self._agg_records += agg_records
+    def record(self, raw_record: int, agg_record: int):
+        self._raw_records.append(raw_record)
+        self._agg_records.append(agg_record)

     def get(self):
         return self._raw_records, self._agg_records
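
Note: the recorder now keeps one entry per map chunk instead of two running totals,
because the tiling heuristic below needs the distribution of sizes across chunks,
not just their sums. A minimal standalone sketch (hypothetical numbers, not the Mars
class itself) of what the lists preserve and totals lose:

    import numpy as np

    raw_sizes = [1000, 1000, 1000, 1000]  # hypothetical pre-aggregation chunk sizes
    agg_sizes = [100, 120, 90, 110]       # hypothetical post-aggregation chunk sizes

    # per-chunk shrink ratios are only available with the list-based records
    print([a / r for a, r in zip(agg_sizes, raw_sizes)])  # [0.1, 0.12, 0.09, 0.11]
    # dispersion of aggregated sizes across chunks (what variation() measures)
    print(np.std(agg_sizes) / np.mean(agg_sizes))         # ~0.11
    # a pair of running totals could only ever yield this single global ratio
    print(sum(agg_sizes) / sum(raw_sizes))                # 0.105
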
@@ -659,15 +660,27 @@ def _tile_auto(
         # yield to trigger execution
         yield chunks

-        raw_size, agg_size = size_recorder.get()
+        raw_sizes, agg_sizes = size_recorder.get()
         # destroy size recorder
         ctx.destroy_remote_object(size_recorder_name)

         left_chunks = in_df.chunks[combine_size:]
         left_chunks = cls._gen_map_chunks(op, left_chunks, out_df, func_infos)
-        if raw_size >= agg_size * len(chunks):
-            # aggregated size is less than 1 chunk
-            # use tree aggregation
+        # calculate the coefficient of variation of aggregation sizes,
+        # if the CV is less than 0.2 and the mean of agg_size/raw_size
+        # is less than 0.8, we suppose the single chunk's aggregation size
+        # almost equals to the tileable's, then use tree method
+        # as combine aggregation results won't lead to a rapid expansion.
+        ratios = [
+            agg_size / raw_size for agg_size, raw_size in zip(agg_sizes, raw_sizes)
+        ]
+        cv = variation(agg_sizes)
+        mean_ratio = np.mean(ratios)
+        if mean_ratio <= 1 / len(chunks):
+            # if mean of ratio is less than 0.25, use tree
+            return cls._combine_tree(op, chunks + left_chunks, out_df, func_infos)
+        elif cv <= 0.2 and mean_ratio <= 2 / 3:
+            # check CV and mean of ratio
             return cls._combine_tree(op, chunks + left_chunks, out_df, func_infos)
         else:
             # otherwise, use shuffle

@@ -685,7 +698,7 @@ def tile(cls, op: "DataFrameGroupByAgg"):
         func_infos = cls._compile_funcs(op, in_df)

         if op.method == "auto":
-            if len(in_df.chunks) < op.combine_size:
+            if len(in_df.chunks) <= op.combine_size:
                 return cls._tile_with_tree(op, in_df, out_df, func_infos)
             else:
                 return (yield from cls._tile_auto(op, in_df, out_df, func_infos))
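
Note on the new heuristic: instead of the single raw_size >= agg_size * len(chunks)
test, _tile_auto now inspects the per-chunk agg/raw ratios. Tree combining is chosen
when the mean ratio is at most 1 / len(chunks), or when the aggregated chunk sizes
are uniform (coefficient of variation <= 0.2) and the mean ratio is at most 2/3;
otherwise the shuffle path is used. A self-contained sketch of that decision follows;
the choose_method name and the sample numbers are illustrative, only the thresholds
come from the diff above.

    import numpy as np
    from scipy.stats import variation

    def choose_method(raw_sizes, agg_sizes, n_chunks):
        ratios = [a / r for a, r in zip(agg_sizes, raw_sizes)]
        mean_ratio = np.mean(ratios)
        cv = variation(agg_sizes)  # std / mean of aggregated chunk sizes
        if mean_ratio <= 1 / n_chunks:
            # aggregation shrinks the data to roughly one chunk or less
            return "tree"
        if cv <= 0.2 and mean_ratio <= 2 / 3:
            # chunk results are similar in size and clearly smaller than the
            # raw data, so combining them will not expand memory rapidly
            return "tree"
        return "shuffle"

    # low-cardinality keys: tiny, uniform aggregated chunks -> tree
    print(choose_method([1000] * 4, [10, 12, 9, 11], n_chunks=4))        # tree
    # high-cardinality keys: aggregation barely shrinks the data -> shuffle
    print(choose_method([1000] * 4, [950, 960, 940, 955], n_chunks=4))   # shuffle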

mars/dataframe/groupby/tests/test_groupby_execution.py

Lines changed: 19 additions & 2 deletions
@@ -581,14 +581,31 @@ def _disallow_reduce(ctx, op):
     pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c2").agg("sum"))

     def _disallow_combine_and_agg(ctx, op):
-        assert op.stage not in (OperandStage.combine, OperandStage.agg)
+        assert op.stage != OperandStage.combine
         op.execute(ctx, op)

-    r = mdf.groupby("c1").agg("sum")
+    r = mdf.groupby("c3").agg("sum")
     operand_executors = {DataFrameGroupByAgg: _disallow_combine_and_agg}
     result = r.execute(
         extra_config={"operand_executors": operand_executors, "check_all": False}
     ).fetch()
+    pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c3").agg("sum"))
+
+    rs = np.random.RandomState(0)
+    raw = pd.DataFrame(
+        {
+            "c1": list(range(4)) * 12,
+            "c2": rs.choice(["a", "b", "c"], (48,)),
+            "c3": rs.rand(48),
+        }
+    )
+
+    mdf = md.DataFrame(raw, chunk_size=8)
+    r = mdf.groupby("c1").agg("sum")
+    operand_executors = {DataFrameGroupByAgg: _disallow_reduce}
+    result = r.execute(
+        extra_config={"operand_executors": operand_executors, "check_all": False}
+    ).fetch()
     pd.testing.assert_frame_equal(result.sort_index(), raw.groupby("c1").agg("sum"))

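Note on the test hook: these tests drive the heuristic end to end through the
operand_executors entry of extra_config. Each executor receives (ctx, op), asserts
which OperandStage values may appear for DataFrameGroupByAgg, and then delegates to
op.execute. The new case builds a 48-row frame with only four distinct "c1" keys, so
each chunk aggregates to a small, uniformly sized result and tiling is expected to
stay on the tree path; _disallow_reduce (whose body lies outside this hunk)
presumably fails the test if a shuffle reduce stage is ever executed. A plausible
shape, shown only as an illustration (the import paths and the assertion are
assumptions, not taken from this diff):

    # hedged sketch of the stage-asserting executor pattern used above
    from mars.core.operand import OperandStage  # assumed import path
    from mars.dataframe.groupby.aggregation import DataFrameGroupByAgg

    def _disallow_reduce(ctx, op):
        # fail if the tiled graph falls back to the shuffle (reduce) path
        assert op.stage != OperandStage.reduce
        op.execute(ctx, op)

    operand_executors = {DataFrameGroupByAgg: _disallow_reduce}
    # r.execute(extra_config={"operand_executors": operand_executors,
    #                         "check_all": False}).fetch()
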
mars/services/subtask/worker/tests/subtask_processor.py

Lines changed: 8 additions & 2 deletions
@@ -24,7 +24,10 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

         check_options = dict()
-        kwargs = self.subtask.extra_config or dict()
+        if self.subtask.extra_config:
+            kwargs = self.subtask.extra_config.copy()
+        else:
+            kwargs = dict()
         self._operand_executors = operand_executors = kwargs.pop(
             "operand_executors", dict()
         )

@@ -50,4 +53,7 @@ def _execute_operand(self, ctx: Dict[str, Any], op: OperandType):
     async def done(self):
         await super().done()
         for op in self._operand_executors:
-            op.unregister_executor()
+            try:
+                op.unregister_executor()
+            except KeyError:
+                pass
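
Note on the processor changes: extra_config is now copied before kwargs.pop(...) so
the subtask's shared config dict is never mutated, and unregister_executor() now
tolerates a KeyError for executors that were already removed. A tiny illustration of
why the copy matters (plain dicts with hypothetical keys, not the Mars objects
themselves):

    extra_config = {"operand_executors": {"agg": object()}, "check_all": False}

    kwargs = extra_config          # old behaviour: both names share one dict
    kwargs.pop("operand_executors")
    print(extra_config)            # {'check_all': False} -- shared config mutated

    extra_config = {"operand_executors": {"agg": object()}, "check_all": False}
    kwargs = extra_config.copy()   # new behaviour: shallow copy
    kwargs.pop("operand_executors")
    print(extra_config)            # original keys preserved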
