|
21 | 21 |
|
22 | 22 | from .... import oscar as mo |
23 | 23 | from ....lib.aio import alru_cache |
| 24 | +from ....metrics import Metrics |
24 | 25 | from ....oscar.backends.context import ProfilingContext |
25 | 26 | from ....oscar.errors import MarsError |
26 | 27 | from ....oscar.profiling import ProfilingData, MARS_ENABLE_PROFILING |
@@ -82,6 +83,21 @@ def __init__( |
82 | 83 | self._speculation_config = speculation_config or {} |
83 | 84 | self._queueing_ref = None |
84 | 85 | self._global_resource_ref = None |
| 86 | + self._submitted_subtask_count = Metrics.counter( |
| 87 | + "mars.scheduling.submitted_subtask_count", |
| 88 | + "The count of submitted subtasks to all bands.", |
| 89 | + ("session_id", "task_id", "stage_id"), |
| 90 | + ) |
| 91 | + self._finished_subtask_count = Metrics.counter( |
| 92 | + "mars.scheduling.finished_subtask_count", |
| 93 | + "The count of finished subtasks of all bands.", |
| 94 | + ("session_id", "task_id", "stage_id"), |
| 95 | + ) |
| 96 | + self._canceled_subtask_count = Metrics.counter( |
| 97 | + "mars.scheduling.canceled_subtask_count", |
| 98 | + "The count of canceled subtasks of all bands.", |
| 99 | + ("session_id", "task_id", "stage_id"), |
| 100 | + ) |
85 | 101 | logger.info( |
86 | 102 | "Created SubtaskManager with subtask_max_reschedules %s, " |
87 | 103 | "speculation_config %s", |
@@ -167,6 +183,14 @@ async def finish_subtasks( |
167 | 183 | for subtask_id, subtask_band in zip(subtask_ids, bands): |
168 | 184 | subtask_info = self._subtask_infos.get(subtask_id, None) |
169 | 185 | if subtask_info is not None: |
| 186 | + self._finished_subtask_count.record( |
| 187 | + 1, |
| 188 | + { |
| 189 | + "session_id": self._session_id, |
| 190 | + "task_id": subtask_info.subtask.task_id, |
| 191 | + "stage_id": subtask_info.subtask.stage_id, |
| 192 | + }, |
| 193 | + ) |
170 | 194 | self._subtask_summaries[subtask_id] = subtask_info.to_summary( |
171 | 195 | is_finished=True |
172 | 196 | ) |
@@ -236,6 +260,14 @@ async def submit_subtask_to_band(self, subtask_id: str, band: BandType): |
236 | 260 | if enable_profiling |
237 | 261 | else None |
238 | 262 | ) |
| 263 | + self._submitted_subtask_count.record( |
| 264 | + 1, |
| 265 | + { |
| 266 | + "session_id": self._session_id, |
| 267 | + "task_id": subtask_info.subtask.task_id, |
| 268 | + "stage_id": subtask_info.subtask.stage_id, |
| 269 | + }, |
| 270 | + ) |
239 | 271 | logger.debug("Start run subtask %s in band %s.", subtask_id, band) |
240 | 272 | with Timer() as timer: |
241 | 273 | task = asyncio.create_task( |
@@ -388,6 +420,14 @@ async def cancel_single_task(subtask, raw_tasks, cancel_tasks): |
388 | 420 | self._subtask_summaries[subtask_id] = subtask_info.to_summary( |
389 | 421 | is_finished=True, is_cancelled=True |
390 | 422 | ) |
| 423 | + self._canceled_subtask_count.record( |
| 424 | + 1, |
| 425 | + { |
| 426 | + "session_id": self._session_id, |
| 427 | + "task_id": subtask_info.subtask.task_id, |
| 428 | + "stage_id": subtask_info.subtask.stage_id, |
| 429 | + }, |
| 430 | + ) |
391 | 431 | await self._queueing_ref.submit_subtasks.tell() |
392 | 432 | logger.info("Subtasks %s canceled.", subtask_ids) |
393 | 433 |
|
|
0 commit comments