Skip to content

Commit c8363c7

Browse files
fyrestone and 刘宝 authored
[Ray] Add metrics for Ray executor (#3295)
* Add metrics for Ray executor * Pin faiss-cpu<1.7.3 * Fix * Fix * Fix asv * Pin cryptography<38.0.3 * Debug CI * Debug CI * Debug CI * Install ray for asv benchmark * Fallback to dump dot file if dot executable is not found * Add MARS_DUMP_SUBTASK_GRAPH_DIR * Create dump dir if not exist * Fix asv benchmark * Fix * Force pin setuptools before any installations * Revert "Force pin setuptools before any installations" This reverts commit 472b7c9. * Force pin setuptools<64 * Refine metrics name * Change merge logs from debug to info * Change fetch meta log of ray executor from info to debug * Refine merge logs * Fix merge log Co-authored-by: 刘宝 <[email protected]>
1 parent 5b6cce9 commit c8363c7

File tree

11 files changed

+108
-53
lines changed

11 files changed

+108
-53
lines changed

.github/workflows/benchmark-ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ jobs:
3535
shell: bash
3636
run: |
3737
source ./ci/install-conda.sh
38-
python -m pip install --upgrade pip setuptools wheel coverage;
38+
python -m pip install --upgrade pip "setuptools<64" wheel coverage;
3939
4040
- name: Install dependencies
4141
id: build
@@ -57,7 +57,7 @@ jobs:
5757
git fetch upstream
5858
git merge upstream/master
5959
asv machine --yes
60-
asv continuous -f 1.1 --strict upstream/master HEAD
60+
asv continuous -e -f 1.1 --strict upstream/master HEAD
6161
if: ${{ steps.build.outcome == 'success' }}
6262

6363
- name: Publish benchmarks artifact

.github/workflows/core-ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ jobs:
3535
shell: bash
3636
run: |
3737
source ./ci/install-conda.sh
38-
python -m pip install --upgrade pip setuptools wheel coverage;
38+
python -m pip install --upgrade pip "setuptools<64" wheel coverage;
3939
4040
- name: Install dependencies
4141
env:

.github/workflows/os-compat-ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jobs:
3232
shell: bash
3333
run: |
3434
source ./ci/install-conda.sh
35-
python -m pip install --upgrade pip setuptools wheel coverage;
35+
python -m pip install --upgrade pip "setuptools<64" wheel coverage;
3636
3737
- name: Install dependencies
3838
env:

.github/workflows/platform-ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ jobs:
4747
shell: bash
4848
run: |
4949
source ./ci/install-conda.sh
50-
python -m pip install --upgrade pip setuptools wheel coverage;
50+
python -m pip install --upgrade pip "setuptools<64" wheel coverage;
5151
5252
- name: Start minikube
5353
if: ${{ matrix.with-kubernetes }}

benchmarks/asv_bench/benchmarks/execution.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import dataclasses
16+
import unittest.mock as mock
1617

1718
import mars.tensor as mt
1819
from mars import new_session
@@ -24,7 +25,7 @@
2425
)
2526
from mars.serialization import serialize
2627
from mars.services.task import new_task_id
27-
from mars.services.task.execution.ray.executor import execute_subtask
28+
from mars.services.task.execution.ray import executor as ray_executor
2829

2930

3031
def _gen_subtask_chunk_graph(t):
@@ -83,10 +84,11 @@ def time_numexpr_execution(self):
8384
c.execute(show_progress=False)
8485

8586
def time_numexpr_subtask_execution(self):
86-
for asv_subtask_info in self.asv_subtasks:
87-
execute_subtask(
88-
asv_subtask_info.subtask_id,
89-
asv_subtask_info.serialized_subtask_chunk_graph,
90-
set(),
91-
False,
92-
)
87+
with mock.patch.object(ray_executor, "ray"):
88+
for asv_subtask_info in self.asv_subtasks:
89+
ray_executor.execute_subtask(
90+
asv_subtask_info.subtask_id,
91+
asv_subtask_info.serialized_subtask_chunk_graph,
92+
0,
93+
False,
94+
)

mars/dataframe/merge/merge.py

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -740,22 +740,29 @@ def tile(cls, op: "DataFrameMerge"):
740740
and len(left.chunks) + len(right.chunks) > auto_merge_threshold
741741
):
742742
yield TileStatus([left, right] + left.chunks + right.chunks, progress=0.2)
743+
left_chunk_size = len(left.chunks)
744+
right_chunk_size = len(right.chunks)
743745
left = auto_merge_chunks(ctx, left)
744746
right = auto_merge_chunks(ctx, right)
745-
logger.debug(
746-
"Before merge %s, left data count: %d, chunk size: %d, "
747-
"right data count: %d, chunk_size: %d",
747+
logger.info(
748+
"Auto merge before %s, left data shape: %s, chunk count: %s -> %s, "
749+
"right data shape: %s, chunk count: %s -> %s.",
748750
op,
749-
left.shape[0],
751+
left.shape,
752+
left_chunk_size,
750753
len(left.chunks),
751-
right.shape[0],
754+
right.shape,
755+
right_chunk_size,
752756
len(right.chunks),
753757
)
754758
else:
755-
logger.debug(
756-
"Skip auto merge before %s, left chunk size: %d, right chunk size: %d",
759+
logger.info(
760+
"Skip auto merge before %s, left data shape: %s, chunk count: %d, "
761+
"right data shape: %s, chunk count: %d.",
757762
op,
763+
left.shape,
758764
len(left.chunks),
765+
right.shape,
759766
len(right.chunks),
760767
)
761768

@@ -766,7 +773,7 @@ def tile(cls, op: "DataFrameMerge"):
766773
left_on = _prepare_shuffle_on(op.left_index, op.left_on, op.on)
767774
right_on = _prepare_shuffle_on(op.right_index, op.right_on, op.on)
768775
small_one = right if len(left.chunks) > len(right.chunks) else left
769-
logger.debug(
776+
logger.info(
770777
"Apply bloom filter for operand %s, use DataFrame %s to build bloom filter.",
771778
op,
772779
small_one,
@@ -782,7 +789,7 @@ def tile(cls, op: "DataFrameMerge"):
782789
if op.method == "auto":
783790
# if method is auto, select new method after auto merge
784791
method = cls._choose_merge_method(op, left, right)
785-
logger.info("Choose %s method for merge operand %s", method, op)
792+
logger.info("Choose %s method for merge operand %s.", method, op)
786793
if method == MergeMethod.one_chunk:
787794
ret = cls._tile_one_chunk(op, left, right)
788795
elif method == MergeMethod.broadcast:
@@ -802,16 +809,20 @@ def tile(cls, op: "DataFrameMerge"):
802809
ret[0].chunks, progress=0.8
803810
) # trigger execution for chunks
804811
merged = auto_merge_chunks(get_context(), ret[0])
805-
logger.debug(
806-
"After merge %s, data size: %d, chunk size: %d",
812+
logger.info(
813+
"Auto merge after %s, data shape: %s, chunk count: %s -> %s.",
807814
op,
808-
merged.shape[0],
815+
merged.shape,
816+
len(ret[0].chunks),
809817
len(merged.chunks),
810818
)
811819
return [merged]
812820
else:
813-
logger.debug(
814-
"Skip auto merge after %s, chunk size: %d", op, len(ret[0].chunks)
821+
logger.info(
822+
"Skip auto merge after %s, data shape: %s, chunk count: %d.",
823+
op,
824+
ret[0].shape,
825+
len(ret[0].chunks),
815826
)
816827
return ret
817828

mars/dataframe/utils.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,6 @@ def decide_dataframe_chunk_sizes(shape, chunk_size, memory_usage):
165165
:return: the calculated chunk size for each dimension
166166
:rtype: tuple
167167
"""
168-
from ..config import options
169-
170168
chunk_size = dictify_chunk_size(shape, chunk_size)
171169
average_memory_usage = memory_usage / shape[0]
172170

@@ -238,8 +236,6 @@ def decide_dataframe_chunk_sizes(shape, chunk_size, memory_usage):
238236

239237

240238
def decide_series_chunk_size(shape, chunk_size, memory_usage):
241-
from ..config import options
242-
243239
chunk_size = dictify_chunk_size(shape, chunk_size)
244240
average_memory_usage = memory_usage / shape[0] if shape[0] != 0 else memory_usage
245241

mars/metrics/api.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def __init__(
8989
self,
9090
name: str,
9191
description: str = "",
92-
tag_keys: Optional[Tuple[str]] = None,
92+
tag_keys: Optional[Tuple[str, ...]] = None,
9393
metric_type: str = "Counter",
9494
):
9595
self._name = name
@@ -125,7 +125,9 @@ def record(self, value=1, tags: Optional[Dict[str, str]] = None):
125125

126126

127127
def gen_metric(func):
128-
def wrapper(name, descriptions: str = "", tag_keys: Optional[Tuple[str]] = None):
128+
def wrapper(
129+
name, descriptions: str = "", tag_keys: Optional[Tuple[str, ...]] = None
130+
):
129131
if _init is True:
130132
return func(name, descriptions, tag_keys)
131133
else:
@@ -168,7 +170,9 @@ class Metrics:
168170

169171
@staticmethod
170172
@gen_metric
171-
def counter(name, description: str = "", tag_keys: Optional[Tuple[str]] = None):
173+
def counter(
174+
name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None
175+
):
172176
logger.info(
173177
"Initializing a counter with name: %s, tag keys: %s, backend: %s",
174178
name,
@@ -179,7 +183,7 @@ def counter(name, description: str = "", tag_keys: Optional[Tuple[str]] = None):
179183

180184
@staticmethod
181185
@gen_metric
182-
def gauge(name, description: str = "", tag_keys: Optional[Tuple[str]] = None):
186+
def gauge(name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None):
183187
logger.info(
184188
"Initializing a gauge whose name: %s, tag keys: %s, backend: %s",
185189
name,
@@ -190,7 +194,7 @@ def gauge(name, description: str = "", tag_keys: Optional[Tuple[str]] = None):
190194

191195
@staticmethod
192196
@gen_metric
193-
def meter(name, description: str = "", tag_keys: Optional[Tuple[str]] = None):
197+
def meter(name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None):
194198
logger.info(
195199
"Initializing a meter whose name: %s, tag keys: %s, backend: %s",
196200
name,
@@ -201,7 +205,9 @@ def meter(name, description: str = "", tag_keys: Optional[Tuple[str]] = None):
201205

202206
@staticmethod
203207
@gen_metric
204-
def histogram(name, description: str = "", tag_keys: Optional[Tuple[str]] = None):
208+
def histogram(
209+
name, description: str = "", tag_keys: Optional[Tuple[str, ...]] = None
210+
):
205211
logger.info(
206212
"Initializing a histogram whose name: %s, tag keys: %s, backend: %s",
207213
name,

mars/services/task/execution/ray/executor.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from .....core.operand.fetch import FetchShuffle
3535
from .....lib.aio import alru_cache
3636
from .....lib.ordered_set import OrderedSet
37-
from .....metrics.api import init_metrics
37+
from .....metrics.api import init_metrics, Metrics
3838
from .....resource import Resource
3939
from .....serialization import serialize, deserialize
4040
from .....typing import BandType
@@ -67,6 +67,24 @@
6767
logger = logging.getLogger(__name__)
6868

6969

70+
# Metrics
71+
submitted_subtask_number = Metrics.counter(
72+
"mars.ray_dag.submitted_subtask_number",
73+
"The number of submitted subtask.",
74+
("session_id", "task_id", "stage_id"),
75+
)
76+
started_subtask_number = Metrics.counter(
77+
"mars.ray_dag.started_subtask_number",
78+
"The number of started subtask.",
79+
("subtask_id",),
80+
)
81+
completed_subtask_number = Metrics.counter(
82+
"mars.ray_dag.completed_subtask_number",
83+
"The number of completed subtask.",
84+
("subtask_id",),
85+
)
86+
87+
7088
@dataclass
7189
class _RayChunkMeta:
7290
memory_size: int
@@ -165,8 +183,11 @@ def execute_subtask(
165183
subtask outputs and meta for outputs if `output_meta_keys` is provided.
166184
"""
167185
init_metrics("ray")
186+
metrics_tags = {"subtask_id": subtask_id}
187+
started_subtask_number.record(1, metrics_tags)
188+
ray_task_id = ray.get_runtime_context().task_id
168189
subtask_chunk_graph = deserialize(*subtask_chunk_graph)
169-
logger.info("Start subtask: %s.", subtask_id)
190+
logger.info("Start subtask: %s, ray task id: %s.", subtask_id, ray_task_id)
170191
# Optimize chunk graph.
171192
subtask_chunk_graph = _optimize_subtask_graph(subtask_chunk_graph)
172193
fetch_chunks, shuffle_fetch_chunk = _get_fetch_chunks(subtask_chunk_graph)
@@ -255,7 +276,8 @@ def execute_subtask(
255276
output_values.append(output_meta)
256277
output_values.extend(normal_output.values())
257278
output_values.extend(mapper_output.values())
258-
logger.info("Complete subtask: %s.", subtask_id)
279+
logger.info("Complete subtask: %s, ray task id: %s.", subtask_id, ray_task_id)
280+
completed_subtask_number.record(1, metrics_tags)
259281
return output_values[0] if len(output_values) == 1 else output_values
260282

261283

@@ -554,6 +576,11 @@ async def _execute_subtask_graph(
554576
)
555577
subtask_max_retries = self._config.get_subtask_max_retries()
556578
subtask_num_cpus = self._config.get_subtask_num_cpus()
579+
metrics_tags = {
580+
"session_id": self._task.session_id,
581+
"task_id": self._task.task_id,
582+
"stage_id": stage_id,
583+
}
557584
for subtask in subtask_graph.topological_iter():
558585
if subtask.virtual:
559586
continue
@@ -592,6 +619,7 @@ async def _execute_subtask_graph(
592619
await asyncio.sleep(0)
593620
if output_count == 1:
594621
output_object_refs = [output_object_refs]
622+
submitted_subtask_number.record(1, metrics_tags)
595623
monitor_context.submitted_subtasks.add(subtask)
596624
monitor_context.object_ref_to_subtask[output_object_refs[0]] = subtask
597625
if subtask.stage_n_outputs:
@@ -750,7 +778,7 @@ async def _load_subtask_inputs(
750778
shuffle_object_refs = list(shuffle_manager.get_reducer_input_refs(subtask))
751779

752780
if key_to_get_meta:
753-
logger.info(
781+
logger.debug(
754782
"Fetch %s metas and update context of stage %s.",
755783
len(key_to_get_meta),
756784
stage_id,
@@ -867,11 +895,13 @@ def gc():
867895
stage_id,
868896
),
869897
_RayExecutionStage.WAITING: lambda: logger.info(
870-
"Completed [%s/%s] subtasks of stage %s, one of waiting object refs: %s",
898+
"Completed [%s/%s] subtasks of stage %s, one of waiting ray tasks: %s",
871899
len(completed_subtasks),
872900
total,
873901
stage_id,
874-
next(iter(object_ref_to_subtask)) if object_ref_to_subtask else None,
902+
next(iter(object_ref_to_subtask)).task_id()
903+
if object_ref_to_subtask
904+
else None,
875905
),
876906
}
877907

mars/services/task/execution/ray/tests/test_ray_execution_backend.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,8 @@ async def test_ray_executor_destroy():
176176

177177

178178
@require_ray
179-
def test_ray_execute_subtask_basic():
179+
@mock.patch("ray.get_runtime_context")
180+
def test_ray_execute_subtask_basic(_):
180181
raw = np.ones((10, 10))
181182
raw_expect = raw + 1
182183
a = mt.ones((10, 10), chunk_size=10)
@@ -610,7 +611,7 @@ async def _wait_gc_execute_subtask_graph(
610611

611612
with mock.patch.object(
612613
executor, "_execute_subtask_graph", _wait_gc_execute_subtask_graph
613-
):
614+
), mock.patch("ray.get_runtime_context"):
614615
async with executor:
615616
await executor.execute_subtask_graph(
616617
"mock_stage", subtask_graph, chunk_graph, tile_context

0 commit comments

Comments (0)