|
34 | 34 | from .....core.operand.fetch import FetchShuffle |
35 | 35 | from .....lib.aio import alru_cache |
36 | 36 | from .....lib.ordered_set import OrderedSet |
37 | | -from .....metrics.api import init_metrics |
| 37 | +from .....metrics.api import init_metrics, Metrics |
38 | 38 | from .....resource import Resource |
39 | 39 | from .....serialization import serialize, deserialize |
40 | 40 | from .....typing import BandType |
|
67 | 67 | logger = logging.getLogger(__name__) |
68 | 68 |
|
69 | 69 |
|
| 70 | +# Metrics |
| 71 | +submitted_subtask_number = Metrics.counter( |
| 72 | + "mars.ray_dag.submitted_subtask_number", |
| 73 | + "The number of submitted subtask.", |
| 74 | + ("session_id", "task_id", "stage_id"), |
| 75 | +) |
| 76 | +started_subtask_number = Metrics.counter( |
| 77 | + "mars.ray_dag.started_subtask_number", |
| 78 | + "The number of started subtask.", |
| 79 | + ("subtask_id",), |
| 80 | +) |
| 81 | +completed_subtask_number = Metrics.counter( |
| 82 | + "mars.ray_dag.completed_subtask_number", |
| 83 | + "The number of completed subtask.", |
| 84 | + ("subtask_id",), |
| 85 | +) |
| 86 | + |
| 87 | + |
70 | 88 | @dataclass |
71 | 89 | class _RayChunkMeta: |
72 | 90 | memory_size: int |
@@ -165,8 +183,11 @@ def execute_subtask( |
165 | 183 | subtask outputs and meta for outputs if `output_meta_keys` is provided. |
166 | 184 | """ |
167 | 185 | init_metrics("ray") |
| 186 | + metrics_tags = {"subtask_id": subtask_id} |
| 187 | + started_subtask_number.record(1, metrics_tags) |
| 188 | + ray_task_id = ray.get_runtime_context().task_id |
168 | 189 | subtask_chunk_graph = deserialize(*subtask_chunk_graph) |
169 | | - logger.info("Start subtask: %s.", subtask_id) |
| 190 | + logger.info("Start subtask: %s, ray task id: %s.", subtask_id, ray_task_id) |
170 | 191 | # Optimize chunk graph. |
171 | 192 | subtask_chunk_graph = _optimize_subtask_graph(subtask_chunk_graph) |
172 | 193 | fetch_chunks, shuffle_fetch_chunk = _get_fetch_chunks(subtask_chunk_graph) |
@@ -255,7 +276,8 @@ def execute_subtask( |
255 | 276 | output_values.append(output_meta) |
256 | 277 | output_values.extend(normal_output.values()) |
257 | 278 | output_values.extend(mapper_output.values()) |
258 | | - logger.info("Complete subtask: %s.", subtask_id) |
| 279 | + logger.info("Complete subtask: %s, ray task id: %s.", subtask_id, ray_task_id) |
| 280 | + completed_subtask_number.record(1, metrics_tags) |
259 | 281 | return output_values[0] if len(output_values) == 1 else output_values |
260 | 282 |
|
261 | 283 |
|
@@ -554,6 +576,11 @@ async def _execute_subtask_graph( |
554 | 576 | ) |
555 | 577 | subtask_max_retries = self._config.get_subtask_max_retries() |
556 | 578 | subtask_num_cpus = self._config.get_subtask_num_cpus() |
| 579 | + metrics_tags = { |
| 580 | + "session_id": self._task.session_id, |
| 581 | + "task_id": self._task.task_id, |
| 582 | + "stage_id": stage_id, |
| 583 | + } |
557 | 584 | for subtask in subtask_graph.topological_iter(): |
558 | 585 | if subtask.virtual: |
559 | 586 | continue |
@@ -592,6 +619,7 @@ async def _execute_subtask_graph( |
592 | 619 | await asyncio.sleep(0) |
593 | 620 | if output_count == 1: |
594 | 621 | output_object_refs = [output_object_refs] |
| 622 | + submitted_subtask_number.record(1, metrics_tags) |
595 | 623 | monitor_context.submitted_subtasks.add(subtask) |
596 | 624 | monitor_context.object_ref_to_subtask[output_object_refs[0]] = subtask |
597 | 625 | if subtask.stage_n_outputs: |
@@ -750,7 +778,7 @@ async def _load_subtask_inputs( |
750 | 778 | shuffle_object_refs = list(shuffle_manager.get_reducer_input_refs(subtask)) |
751 | 779 |
|
752 | 780 | if key_to_get_meta: |
753 | | - logger.info( |
| 781 | + logger.debug( |
754 | 782 | "Fetch %s metas and update context of stage %s.", |
755 | 783 | len(key_to_get_meta), |
756 | 784 | stage_id, |
@@ -867,11 +895,13 @@ def gc(): |
867 | 895 | stage_id, |
868 | 896 | ), |
869 | 897 | _RayExecutionStage.WAITING: lambda: logger.info( |
870 | | - "Completed [%s/%s] subtasks of stage %s, one of waiting object refs: %s", |
| 898 | + "Completed [%s/%s] subtasks of stage %s, one of waiting ray tasks: %s", |
871 | 899 | len(completed_subtasks), |
872 | 900 | total, |
873 | 901 | stage_id, |
874 | | - next(iter(object_ref_to_subtask)) if object_ref_to_subtask else None, |
| 902 | + next(iter(object_ref_to_subtask)).task_id() |
| 903 | + if object_ref_to_subtask |
| 904 | + else None, |
875 | 905 | ), |
876 | 906 | } |
877 | 907 |
|
|
0 commit comments