Assign reducer ops in task assigner to make them more balanced across cluster (#3048)

Xuye (Chris) Qin · web-flow · commit 08d7cfe99af6 · 2022-05-18T20:43:26.000+08:00
diff --git a/benchmarks/tpch/run_queries.py b/benchmarks/tpch/run_queries.py
@@ -617,7 +617,7 @@ def g2(x):
 def q13(customer, orders):
     customer_filtered = customer.loc[:, ["C_CUSTKEY"]]
     orders_filtered = orders[
-        ~orders["O_COMMENT"].str.contains("special(\S|\s)*requests")
+        ~orders["O_COMMENT"].str.contains("special[\S|\s]*requests")
     ]
     orders_filtered = orders_filtered.loc[:, ["O_ORDERKEY", "O_CUSTKEY"]]
     c_o_merged = customer_filtered.merge(
diff --git a/mars/core/operand/shuffle.py b/mars/core/operand/shuffle.py
@@ -27,8 +27,10 @@ class ShuffleProxy(VirtualOperand):
 
 
 class MapReduceOperand(Operand):
-    reducer_index = TupleField("reducer_index", FieldTypes.uint64)
+    # for mapper
     mapper_id = Int32Field("mapper_id", default=0)
+    # for reducer
+    reducer_index = TupleField("reducer_index", FieldTypes.uint64)
     reducer_phase = StringField("reducer_phase", default=None)
 
     def _new_chunks(self, inputs, kws=None, **kw):
diff --git a/mars/dataframe/groupby/aggregation.py b/mars/dataframe/groupby/aggregation.py
@@ -806,9 +806,12 @@ def _choose_tree_method(
             len(ctx.get_worker_addresses()) > 1
             and estimate_size > chunk_store_limit
             and np.mean(agg_sizes) > 1024**2
+            and total_count <= 256
         ):
             # for distributed, if estimate size could be potentially large,
             # and each chunk size is large enough(>1M, small chunk means large error),
+            # total count is relatively small(<=256, large number of chunks
+            # is not quite efficient for shuffle)
             # we choose to use shuffle
             return False
         # calculate the coefficient of variation of aggregation sizes,
diff --git a/mars/services/task/analyzer/analyzer.py b/mars/services/task/analyzer/analyzer.py
@@ -19,9 +19,15 @@
 
 from ....config import Config
 from ....core import ChunkGraph, ChunkType, enter_mode
-from ....core.operand import Fetch, VirtualOperand, LogicKeyGenerator
+from ....core.operand import (
+    Fetch,
+    VirtualOperand,
+    LogicKeyGenerator,
+    MapReduceOperand,
+    OperandStage,
+)
 from ....resource import Resource
-from ....typing import BandType
+from ....typing import BandType, OperandType
 from ....utils import build_fetch, tokenize
 from ...subtask import SubtaskGraph, Subtask
 from ..core import Task, new_task_id
@@ -31,6 +37,18 @@
 logger = logging.getLogger(__name__)
 
 
+def need_reassign_worker(op: OperandType) -> bool:
+    # NOTE(qinxuye): special process for reducer
+    # We'd better set reducer op's stage to reduce, however,
+    # in many case, we copy a reducer op from tileable op,
+    # then set stage as reducer one,
+    # it would be quite nasty to take over the __setattr__ and
+    # make reassign_worker True etc.
+    return op.reassign_worker or (
+        isinstance(op, MapReduceOperand) and op.stage == OperandStage.reduce
+    )
+
+
 class GraphAnalyzer:
     def __init__(
         self,
@@ -294,8 +312,10 @@ def gen_subtask_graph(
         subtask_graph: SubtaskGraph
             Subtask graph.
         """
+        # reassign worker when specified reassign_worker = True
+        # or it's a reducer operands
         reassign_worker_ops = [
-            chunk.op for chunk in self._chunk_graph if chunk.op.reassign_worker
+            chunk.op for chunk in self._chunk_graph if need_reassign_worker(chunk.op)
         ]
         start_ops = (
             list(self._iter_start_ops(self._chunk_graph))
diff --git a/mars/services/task/analyzer/assigner.py b/mars/services/task/analyzer/assigner.py
@@ -15,12 +15,13 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from operator import itemgetter
-from typing import List, Dict, Set, Union
+from typing import List, Dict, Union
 
 import numpy as np
 
 from ....core import ChunkGraph, ChunkData
 from ....core.operand import Operand
+from ....lib.ordered_set import OrderedSet
 from ....resource import Resource
 from ....typing import BandType
 from ....utils import implements
@@ -77,8 +78,9 @@ def __init__(
         band_resource: Dict[BandType, Resource],
     ):
         super().__init__(chunk_graph, start_ops, band_resource)
-        self._undirected_chunk_graph = None
-        self._op_keys: Set[str] = {start_op.key for start_op in start_ops}
+        self._op_keys: OrderedSet[str] = OrderedSet(
+            [start_op.key for start_op in start_ops]
+        )
 
     def _calc_band_assign_limits(
         self, initial_count: int, occupied: Dict[BandType, int]
@@ -124,13 +126,15 @@ def _calc_band_assign_limits(
             pos = (pos + 1) % len(counts)
         return dict(zip(bands, counts))
 
+    @classmethod
     def _assign_by_bfs(
-        self,
+        cls,
+        undirected_chunk_graph: ChunkGraph,
         start: ChunkData,
         band: BandType,
         initial_sizes: Dict[BandType, int],
         spread_limits: Dict[BandType, float],
-        key_to_assign: Set[str],
+        key_to_assign: OrderedSet[str],
         assigned_record: Dict[str, Union[str, BandType]],
     ):
         """
@@ -140,19 +144,15 @@ def _assign_by_bfs(
         if initial_sizes[band] <= 0:
             return
 
-        graph = self._chunk_graph
-        if self._undirected_chunk_graph is None:
-            self._undirected_chunk_graph = graph.build_undirected()
-        undirected_chunk_graph = self._undirected_chunk_graph
-
         assigned = 0
         spread_range = 0
         for chunk in undirected_chunk_graph.bfs(start=start, visit_predicate="all"):
             op_key = chunk.op.key
             if op_key in assigned_record:
                 continue
             spread_range += 1
-            # `op_key` may not be in `key_to_assign`, but we need to record it to avoid iterate the node repeatedly.
+            # `op_key` may not be in `key_to_assign`,
+            # but we need to record it to avoid iterate the node repeatedly.
             assigned_record[op_key] = band
             if op_key not in key_to_assign:
                 continue
@@ -161,8 +161,22 @@ def _assign_by_bfs(
                 break
         initial_sizes[band] -= assigned
 
+    def _build_undirected_chunk_graph(
+        self, chunk_to_assign: List[ChunkData]
+    ) -> ChunkGraph:
+        chunk_graph = self._chunk_graph.copy()
+        # remove edges for all chunk_to_assign which may contain chunks
+        # that need be reassigned
+        for chunk in chunk_to_assign:
+            if chunk_graph.count_predecessors(chunk) > 0:
+                for pred in list(chunk_graph.predecessors(chunk)):
+                    chunk_graph.remove_edge(pred, chunk)
+        return chunk_graph.build_undirected()
+
     @implements(AbstractGraphAssigner.assign)
-    def assign(self, cur_assigns: Dict[str, str] = None) -> Dict[ChunkData, BandType]:
+    def assign(
+        self, cur_assigns: Dict[str, BandType] = None
+    ) -> Dict[ChunkData, BandType]:
         graph = self._chunk_graph
         assign_result = dict()
         cur_assigns = cur_assigns or dict()
@@ -173,7 +187,7 @@ def assign(self, cur_assigns: Dict[str, str] = None) -> Dict[ChunkData, BandType
         for chunk in graph:
             op_key_to_chunks[chunk.op.key].append(chunk)
 
-        op_keys = set(self._op_keys)
+        op_keys = OrderedSet(self._op_keys)
         chunk_to_assign = [
             op_key_to_chunks[op_key][0]
             for op_key in op_keys
@@ -183,6 +197,9 @@ def assign(self, cur_assigns: Dict[str, str] = None) -> Dict[ChunkData, BandType
         for band in cur_assigns.values():
             assigned_counts[band] += 1
 
+        # build undirected graph
+        undirected_chunk_graph = self._build_undirected_chunk_graph(chunk_to_assign)
+
         # calculate the number of chunks to be assigned to each band
         # given number of bands and existing assignments
         band_quotas = self._calc_band_assign_limits(
@@ -195,14 +212,20 @@ def assign(self, cur_assigns: Dict[str, str] = None) -> Dict[ChunkData, BandType
         spread_ranges = defaultdict(lambda: average_spread_range)
         # assign from other chunks to be assigned
         # TODO: sort by what?
-        sorted_candidates = [v for v in chunk_to_assign]
+        sorted_candidates = chunk_to_assign.copy()
         while max(band_quotas.values()):
             band = max(band_quotas, key=lambda k: band_quotas[k])
             cur = sorted_candidates.pop()
             while cur.op.key in cur_assigns:
                 cur = sorted_candidates.pop()
             self._assign_by_bfs(
-                cur, band, band_quotas, spread_ranges, op_keys, cur_assigns
+                undirected_chunk_graph,
+                cur,
+                band,
+                band_quotas,
+                spread_ranges,
+                op_keys,
+                cur_assigns,
             )
 
         key_to_assign = {n.op.key for n in chunk_to_assign} | initial_assigned_op_keys
diff --git a/mars/services/task/analyzer/tests/test_assigner.py b/mars/services/task/analyzer/tests/test_assigner.py
@@ -13,15 +13,19 @@
 # limitations under the License.
 
 import numpy as np
+import pandas as pd
 
+from ..... import dataframe as md
 from .....config import Config
 from .....core import ChunkGraph
+from .....core.graph.builder.utils import build_graph
+from .....core.operand import OperandStage
 from .....tensor.random import TensorRand
 from .....tensor.arithmetic import TensorAdd
 from .....tensor.fetch import TensorFetch
 from .....resource import Resource
 from ...core import Task
-from ..analyzer import GraphAnalyzer
+from ..analyzer import GraphAnalyzer, need_reassign_worker
 from ..assigner import GraphAssigner
 
 
@@ -71,3 +75,34 @@ def test_assigner_with_fetch_inputs():
             for inp in input_chunks:
                 if not isinstance(inp.op, TensorFetch):
                     assert subtask.expect_band == key_to_assign[inp.key]
+
+
+def test_shuffle_assign():
+    band_num = 8
+    all_bands = [(f"address_{i}", "numa-0") for i in range(band_num)]
+
+    pdf = pd.DataFrame(np.random.rand(32, 4))
+    df = md.DataFrame(pdf, chunk_size=4)
+    r = df.groupby(0).sum(method="shuffle")
+    chunk_graph = build_graph([r], tile=True)
+
+    band_resource = dict((band, Resource(num_cpus=1)) for band in all_bands)
+
+    reassign_worker_ops = [
+        chunk.op for chunk in chunk_graph if need_reassign_worker(chunk.op)
+    ]
+    start_ops = list(GraphAnalyzer._iter_start_ops(chunk_graph))
+    to_assign_ops = start_ops + reassign_worker_ops
+
+    assigner = GraphAssigner(chunk_graph, to_assign_ops, band_resource)
+    assigns = assigner.assign()
+    assert len(assigns) == 16
+    init_assigns = set()
+    reducer_assigns = set()
+    for chunk, assign in assigns.items():
+        if chunk.op.stage == OperandStage.reduce:
+            reducer_assigns.add(assign)
+        else:
+            init_assigns.add(assign)
+    # init and reducers are assigned on all bands
+    assert len(init_assigns) == len(reducer_assigns) == 8

Original file line number	Diff line number	Diff line change
`@@ -617,7 +617,7 @@ def g2(x):`
`617`	`617`	`def q13(customer, orders):`
`618`	`618`	`customer_filtered = customer.loc[:, ["C_CUSTKEY"]]`
`619`	`619`	`orders_filtered = orders[`
`620`		`- ~orders["O_COMMENT"].str.contains("special(\S\|\s)*requests")`
	`620`	`+ ~orders["O_COMMENT"].str.contains("special[\S\|\s]*requests")`
`621`	`621`	`]`
`622`	`622`	`orders_filtered = orders_filtered.loc[:, ["O_ORDERKEY", "O_CUSTKEY"]]`
`623`	`623`	`c_o_merged = customer_filtered.merge(`