|
33 | 33 | ) |
34 | 34 | from .....core.operand.fetch import FetchShuffle |
35 | 35 | from .....lib.aio import alru_cache |
36 | | -from .....lib.ordered_set import OrderedSet |
37 | 36 | from .....metrics.api import init_metrics, Metrics |
38 | 37 | from .....resource import Resource |
39 | 38 | from .....serialization import serialize, deserialize |
|
76 | 75 | started_subtask_number = Metrics.counter( |
77 | 76 | "mars.ray_dag.started_subtask_number", |
78 | 77 | "The number of started subtask.", |
79 | | - ("subtask_id",), |
80 | 78 | ) |
81 | 79 | completed_subtask_number = Metrics.counter( |
82 | 80 | "mars.ray_dag.completed_subtask_number", |
83 | 81 | "The number of completed subtask.", |
84 | | - ("subtask_id",), |
85 | 82 | ) |
86 | 83 |
|
87 | 84 |
|
@@ -183,8 +180,7 @@ def execute_subtask( |
183 | 180 | subtask outputs and meta for outputs if `output_meta_keys` is provided. |
184 | 181 | """ |
185 | 182 | init_metrics("ray") |
186 | | - metrics_tags = {"subtask_id": subtask_id} |
187 | | - started_subtask_number.record(1, metrics_tags) |
| 183 | + started_subtask_number.record(1) |
188 | 184 | ray_task_id = ray.get_runtime_context().task_id |
189 | 185 | subtask_chunk_graph = deserialize(*subtask_chunk_graph) |
190 | 186 | logger.info("Start subtask: %s, ray task id: %s.", subtask_id, ray_task_id) |
@@ -277,7 +273,7 @@ def execute_subtask( |
277 | 273 | output_values.extend(normal_output.values()) |
278 | 274 | output_values.extend(mapper_output.values()) |
279 | 275 | logger.info("Complete subtask: %s, ray task id: %s.", subtask_id, ray_task_id) |
280 | | - completed_subtask_number.record(1, metrics_tags) |
| 276 | + completed_subtask_number.record(1) |
281 | 277 | return output_values[0] if len(output_values) == 1 else output_values |
282 | 278 |
|
283 | 279 |
|
@@ -331,6 +327,32 @@ class _RayExecutionStage(enum.Enum): |
331 | 327 | WAITING = 2 |
332 | 328 |
|
333 | 329 |
|
class OrderedSet:
    """A minimal insertion-ordered set: O(1) membership via a set, plus
    index access in insertion order via a parallel list.

    Duplicate insertions are considered a programming error and are caught
    by the ``len(set) == len(list)`` invariant assertions (callers are
    expected to never add the same item twice).
    """

    def __init__(self):
        self._d = set()   # fast membership testing
        self._l = list()  # preserves insertion order, supports indexing

    def add(self, item):
        """Add a single item. ``item`` must be hashable and not present yet."""
        self._d.add(item)
        self._l.append(item)
        # Invariant: no duplicates were ever inserted.
        assert len(self._d) == len(self._l)

    def update(self, items):
        """Add every item from ``items`` (any iterable, including one-shot
        iterators/generators)."""
        # Materialize once so a one-shot iterator can feed both containers.
        # NOTE: the previous check `isinstance(items, collections.Iterator)`
        # relied on the `collections.Iterator` alias that was removed in
        # Python 3.10 and raised AttributeError there.
        tmp = list(items)
        self._l.extend(tmp)
        self._d.update(tmp)
        # Invariant: no duplicates were ever inserted.
        assert len(self._d) == len(self._l)

    def __contains__(self, item):
        return item in self._d

    def __getitem__(self, item):
        # Index (or slice) in insertion order.
        return self._l[item]

    def __len__(self):
        return len(self._d)
| 355 | + |
334 | 356 | @dataclass |
335 | 357 | class _RayMonitorContext: |
336 | 358 | stage: _RayExecutionStage = _RayExecutionStage.INIT |
@@ -576,6 +598,7 @@ async def _execute_subtask_graph( |
576 | 598 | ) |
577 | 599 | subtask_max_retries = self._config.get_subtask_max_retries() |
578 | 600 | subtask_num_cpus = self._config.get_subtask_num_cpus() |
| 601 | + subtask_memory = self._config.get_subtask_memory() |
579 | 602 | metrics_tags = { |
580 | 603 | "session_id": self._task.session_id, |
581 | 604 | "task_id": self._task.task_id, |
@@ -608,6 +631,7 @@ async def _execute_subtask_graph( |
608 | 631 | num_cpus=subtask_num_cpus, |
609 | 632 | num_returns=output_count, |
610 | 633 | max_retries=subtask_max_retries, |
| 634 | + memory=subtask_memory, |
611 | 635 | scheduling_strategy="DEFAULT" if len(input_object_refs) else "SPREAD", |
612 | 636 | ).remote( |
613 | 637 | subtask.subtask_id, |
@@ -840,11 +864,9 @@ def gc(): |
840 | 864 | for pred in subtask_graph.iter_predecessors(subtask): |
841 | 865 | if pred in gc_subtasks: |
842 | 866 | continue |
843 | | - while not all( |
844 | | - succ in gc_targets |
845 | | - for succ in subtask_graph.iter_successors(pred) |
846 | | - ): |
847 | | - yield |
| 867 | + for succ in subtask_graph.iter_successors(pred): |
| 868 | + while succ not in gc_targets: |
| 869 | + yield |
848 | 870 | if pred.virtual: |
849 | 871 | # For virtual subtask, remove all the predecessors if it is |
850 | 872 | # completed. |
|
0 commit comments