Commit c4a6124

✨🐛 Computational Backend: Introduce Dask plugins for tasks lifecycle (#7686)
1 parent 01eeeaf commit c4a6124

File tree: 23 files changed, +617 -240 lines changed
packages/dask-task-models-library/src/dask_task_models_library/models.py

Lines changed: 69 additions & 1 deletion

```diff
@@ -1,4 +1,72 @@
-from typing import TypeAlias
+from typing import Final, Literal, TypeAlias
+
+from dask.typing import Key
+from distributed.scheduler import TaskStateState as SchedulerTaskState
+from distributed.worker_state_machine import TaskStateState as WorkerTaskState
+from models_library.projects_state import RunningState
+from pydantic import BaseModel
 
 DaskJobID: TypeAlias = str
 DaskResources: TypeAlias = dict[str, int | float]
+
+TASK_LIFE_CYCLE_EVENT: Final[str] = "task-lifecycle-{key}"
+TASK_RUNNING_PROGRESS_EVENT: Final[str] = "task-progress-{key}"
+_SCHEDULER_TASK_STATE_TO_RUNNING_STATE: Final[
+    dict[SchedulerTaskState, RunningState]
+] = {
+    "released": RunningState.NOT_STARTED,  # Known but not actively computing or in memory
+    "waiting": RunningState.PENDING,  # On track to be computed, waiting on dependencies to arrive in memory
+    "no-worker": RunningState.WAITING_FOR_RESOURCES,  # Ready to be computed, but no appropriate worker exists (for example because of resource restrictions, or because no worker is connected at all)
+    "queued": RunningState.WAITING_FOR_RESOURCES,  # Ready to be computed, but all workers are already full
+    "processing": RunningState.PENDING,  # All dependencies are available and the task is assigned to a worker for compute (the scheduler doesn't know whether it's in a worker queue or actively being computed)
+    "memory": RunningState.SUCCESS,  # In memory on one or more workers
+    "erred": RunningState.FAILED,  # Task computation, or one of its dependencies, has encountered an error
+    "forgotten": RunningState.UNKNOWN,  # Task is no longer needed by any client or dependent task, so it disappears from the scheduler as well. As soon as a task reaches this state, it is immediately dereferenced from the scheduler
+}
+
+_WORKER_TASK_STATE_TO_RUNNING_STATE: Final[dict[WorkerTaskState, RunningState]] = {
+    "cancelled": RunningState.ABORTED,  # The scheduler asked to forget about this task, but it's technically impossible at the moment. See Task cancellation. The task can be found in whatever collections it was in its previous state
+    "constrained": RunningState.PENDING,  # Like ready, but the user specified resource constraints for this task. The task can be found in the WorkerState.constrained queue
+    "error": RunningState.FAILED,  # Task execution failed
+    "executing": RunningState.STARTED,  # The task is currently being computed on a thread. It can be found in the WorkerState.executing set and in the distributed.worker.Worker.active_threads dict
+    "fetch": RunningState.PENDING,  # This task is in memory on one or more peer workers, but not on this worker. Its data is queued to be transferred over the network, either because it's a dependency of a task in waiting state, or because the Active Memory Manager requested it to be replicated here. The task can be found in the WorkerState.data_needed heap
+    "flight": RunningState.PENDING,  # The task data is currently being transferred over the network from another worker. The task can be found in the WorkerState.in_flight_tasks and WorkerState.in_flight_workers collections
+    "forgotten": RunningState.UNKNOWN,  # The scheduler asked this worker to forget about the task, and there are neither dependents nor dependencies on the same worker
+    "long-running": RunningState.STARTED,  # Like executing, but the user code called distributed.secede() so the task no longer counts towards the maximum number of concurrent tasks. It can be found in the WorkerState.long_running set and in the distributed.worker.Worker.active_threads dict
+    "memory": RunningState.SUCCESS,  # Task execution completed, or the task was successfully transferred from another worker, and is now held in either WorkerState.data or WorkerState.actors
+    "missing": RunningState.PENDING,  # Like fetch, but all peer workers that were listed by the scheduler are either unreachable or have responded they don't actually have the task data. The worker will periodically ask the scheduler if it knows of additional replicas; when it does, the task will transition again to fetch. The task can be found in the WorkerState.missing_dep_flight set
+    "ready": RunningState.PENDING,  # The task is ready to be computed; all of its dependencies are in memory on the current worker and it's waiting for an available thread. The task can be found in the WorkerState.ready heap
+    "released": RunningState.PENDING,  # Known but not actively computing or in memory. A task can stay in this state when the scheduler asked to forget it, but it has dependent tasks on the same worker
+    "rescheduled": RunningState.PENDING,  # The task just raised the Reschedule exception. This is a transitory state, which is not stored permanently
+    "resumed": RunningState.PENDING,  # The task was recovered from cancelled state. See Task cancellation. The task can be found in whatever collections it was in its previous state
+    "waiting": RunningState.PENDING,  # The scheduler has added the task to the worker queue. All of its dependencies are in memory somewhere on the cluster, but not all of them are in memory on the current worker, so they need to be fetched
+}
+
+
+class TaskLifeCycleState(BaseModel):
+    key: str
+    source: Literal["scheduler", "worker"]
+    worker: str | None
+    state: RunningState
+
+    @classmethod
+    def from_scheduler_task_state(
+        cls, key: Key, worker: str | None, task_state: SchedulerTaskState
+    ) -> "TaskLifeCycleState":
+        return cls(
+            key=f"{key!r}",
+            source="scheduler",
+            worker=worker,
+            state=_SCHEDULER_TASK_STATE_TO_RUNNING_STATE[task_state],
+        )
+
+    @classmethod
+    def from_worker_task_state(
+        cls, key: Key, worker: str | None, task_state: WorkerTaskState
+    ) -> "TaskLifeCycleState":
+        return cls(
+            key=f"{key!r}",
+            source="worker",
+            worker=worker,
+            state=_WORKER_TASK_STATE_TO_RUNNING_STATE[task_state],
+        )
```
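For readers of this diff: the two mapping tables normalize Dask's scheduler-side and worker-side task states into the platform's single RunningState enum, and TASK_LIFE_CYCLE_EVENT yields one event topic per task key. A minimal sketch of how these pieces compose (the task key and worker address are made up for illustration):

```python
# Illustrative only -- not part of the commit.
from models_library.projects_state import RunningState

from dask_task_models_library.models import TASK_LIFE_CYCLE_EVENT, TaskLifeCycleState

# Each task key gets its own event topic on the Dask event system.
topic = TASK_LIFE_CYCLE_EVENT.format(key="compute_a1b2")
assert topic == "task-lifecycle-compute_a1b2"

# A scheduler-side "memory" transition normalizes to RunningState.SUCCESS,
# per _SCHEDULER_TASK_STATE_TO_RUNNING_STATE above.
state = TaskLifeCycleState.from_scheduler_task_state(
    key="compute_a1b2", worker="tcp://10.0.0.5:8788", task_state="memory"
)
assert state.source == "scheduler"
assert state.state is RunningState.SUCCESS
```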

packages/dask-task-models-library/src/dask_task_models_library/plugins/__init__.py

Whitespace-only changes.
packages/dask-task-models-library/src/dask_task_models_library/plugins/task_life_cycle_scheduler_plugin.py

Lines changed: 54 additions & 0 deletions

```diff
@@ -0,0 +1,54 @@
+# pylint: disable=unused-argument
+import logging
+from typing import Any
+
+import click
+from dask.typing import Key
+from distributed import Scheduler, SchedulerPlugin
+from distributed.scheduler import TaskStateState
+
+from ..models import TASK_LIFE_CYCLE_EVENT, TaskLifeCycleState
+
+_logger = logging.getLogger(__name__)
+
+
+class TaskLifecycleSchedulerPlugin(SchedulerPlugin):
+    def __init__(self) -> None:
+        self.scheduler = None
+        _logger.info("initialized TaskLifecycleSchedulerPlugin")
+
+    async def start(self, scheduler: Scheduler) -> None:
+        self.scheduler = scheduler  # type: ignore[assignment]
+        _logger.info("started TaskLifecycleSchedulerPlugin")
+
+    def transition(
+        self,
+        key: Key,
+        start: TaskStateState,
+        finish: TaskStateState,
+        *args: Any,  # noqa: ARG002
+        stimulus_id: str,
+        **kwargs: Any,
+    ):
+        _logger.debug(
+            "Task %s transition from %s to %s due to %s",
+            key,
+            start,
+            finish,
+            stimulus_id,
+        )
+
+        assert self.scheduler  # nosec
+
+        self.scheduler.log_event(
+            TASK_LIFE_CYCLE_EVENT.format(key=key),
+            TaskLifeCycleState.from_scheduler_task_state(
+                key, kwargs.get("worker"), finish
+            ).model_dump(mode="json"),
+        )
+
+
+@click.command()
+def dask_setup(scheduler):
+    plugin = TaskLifecycleSchedulerPlugin()
+    scheduler.add_plugin(plugin)
```
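Since the plugin publishes through Scheduler.log_event, any distributed client can listen on the per-task topic. A hedged sketch of a consumer (the scheduler address and task key are placeholders; distributed delivers events as (timestamp, payload) tuples):

```python
# Illustrative only -- not part of the commit.
import distributed

from dask_task_models_library.models import TASK_LIFE_CYCLE_EVENT, TaskLifeCycleState


def on_lifecycle_event(event: tuple) -> None:
    _timestamp, payload = event  # distributed wraps each payload with a timestamp
    state = TaskLifeCycleState.model_validate(payload)
    print(f"task {state.key} is now {state.state} (reported by {state.source})")


client = distributed.Client("tcp://scheduler:8786")  # placeholder address
client.subscribe_topic(
    TASK_LIFE_CYCLE_EVENT.format(key="compute_a1b2"), on_lifecycle_event
)
```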
packages/dask-task-models-library/src/dask_task_models_library/plugins/task_life_cycle_worker_plugin.py

Lines changed: 48 additions & 0 deletions

```diff
@@ -0,0 +1,48 @@
+import logging
+from collections.abc import Awaitable
+from typing import Any
+
+import click
+from dask.typing import Key
+from distributed import WorkerPlugin
+from distributed.worker import Worker
+from distributed.worker_state_machine import TaskStateState
+
+from ..models import TASK_LIFE_CYCLE_EVENT, TaskLifeCycleState
+
+_logger = logging.getLogger(__name__)
+
+
+class TaskLifecycleWorkerPlugin(WorkerPlugin):
+    def __init__(self) -> None:
+        self._worker = None
+        _logger.info("TaskLifecycleWorkerPlugin initialized")
+
+    def setup(self, worker: Worker) -> Awaitable[None]:
+        async def _() -> None:
+            self._worker = worker  # type: ignore[assignment]
+            _logger.info("TaskLifecycleWorkerPlugin setup completed")
+
+        return _()
+
+    def transition(
+        self,
+        key: Key,
+        start: TaskStateState,
+        finish: TaskStateState,
+        **kwargs: Any,
+    ):
+        _logger.info("Task '%s' transition from %s to %s", key, start, finish)
+        assert self._worker  # nosec
+        self._worker.log_event(
+            TASK_LIFE_CYCLE_EVENT.format(key=key),
+            TaskLifeCycleState.from_worker_task_state(
+                key, kwargs.get("worker"), finish
+            ).model_dump(mode="json"),
+        )
+
+
+@click.command()
+async def dask_setup(worker: Worker) -> None:
+    plugin = TaskLifecycleWorkerPlugin()
+    await worker.plugin_add(plugin)
```
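Besides the preload wiring used in the fixtures below, the same plugin could be attached to all workers at runtime from a client. A sketch, assuming a recent distributed version where Client.register_plugin accepts worker plugins (older versions use register_worker_plugin):

```python
# Illustrative only -- not part of the commit.
import distributed

from dask_task_models_library.plugins.task_life_cycle_worker_plugin import (
    TaskLifecycleWorkerPlugin,
)

client = distributed.Client("tcp://scheduler:8786")  # placeholder address
# Ships the plugin to every current and future worker in the cluster.
client.register_plugin(TaskLifecycleWorkerPlugin())
```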

packages/pytest-simcore/src/pytest_simcore/dask_scheduler.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -19,6 +19,9 @@ def dask_workers_config() -> dict[str, Any]:
             "options": {
                 "nthreads": 2,
                 "resources": {"CPU": 2, "RAM": 48e9},
+                "preload": (
+                    "dask_task_models_library.plugins.task_life_cycle_worker_plugin",
+                ),
             },
         },
         "gpu-worker": {
@@ -30,6 +33,9 @@
                     "GPU": 1,
                     "RAM": 48e9,
                 },
+                "preload": (
+                    "dask_task_models_library.plugins.task_life_cycle_worker_plugin",
+                ),
             },
         },
         "large-ram-worker": {
@@ -40,6 +46,9 @@
                     "CPU": 8,
                     "RAM": 768e9,
                 },
+                "preload": (
+                    "dask_task_models_library.plugins.task_life_cycle_worker_plugin",
+                ),
             },
         },
     }
@@ -54,6 +63,9 @@ def dask_scheduler_config(
         "options": {
             "port": unused_tcp_port_factory(),
             "dashboard_address": f":{unused_tcp_port_factory()}",
+            "preload": (
+                "dask_task_models_library.plugins.task_life_cycle_scheduler_plugin",
+            ),
         },
     }
```
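The "preload" options above mirror what can be done when starting a cluster programmatically. A sketch, with nthreads and resource numbers copied from the fixture but otherwise illustrative:

```python
# Illustrative only -- not part of the commit.
import asyncio

from distributed import Scheduler, Worker


async def main() -> None:
    # Scheduler and Worker both accept a "preload" list of module paths.
    async with Scheduler(
        preload=["dask_task_models_library.plugins.task_life_cycle_scheduler_plugin"],
    ) as scheduler:
        async with Worker(
            scheduler.address,
            nthreads=2,
            resources={"CPU": 2, "RAM": 48e9},
            preload=["dask_task_models_library.plugins.task_life_cycle_worker_plugin"],
        ):
            await asyncio.sleep(1)  # task-lifecycle events are now being emitted


asyncio.run(main())
```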

scripts/maintenance/computational-clusters/autoscaled_monitor/ssh.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -265,7 +265,11 @@ def _needs_manual_intervention(
         user_id=containers[0].user_id,
         project_id=containers[0].project_id,
         created_at=containers[0].created_at,
-        needs_manual_intervention=_needs_manual_intervention(containers),
+        needs_manual_intervention=_needs_manual_intervention(containers)
+        and (
+            (arrow.utcnow().datetime - containers[0].created_at)
+            > datetime.timedelta(minutes=2)
+        ),
         containers=[c.name for c in containers],
         service_name=containers[0].service_name,
         service_version=containers[0].service_version,
```
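The intent of this change: a cluster is only flagged for manual intervention once its first container has passed a two-minute grace period, which filters out clusters that are merely still starting up. The predicate in isolation (function name is illustrative):

```python
# Illustrative only -- not part of the commit.
import datetime

import arrow


def needs_intervention(flagged: bool, created_at: datetime.datetime) -> bool:
    """Suppress the manual-intervention flag during a 2-minute startup grace period."""
    age = arrow.utcnow().datetime - created_at
    return flagged and age > datetime.timedelta(minutes=2)
```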

services/clusters-keeper/requirements/ci.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -14,6 +14,7 @@
 # installs this repo's packages
 simcore-aws-library @ ../../packages/aws-library
 simcore-common-library @ ../../packages/common-library
+simcore-dask-task-models-library @ ../../packages/dask-task-models-library
 simcore-models-library @ ../../packages/models-library
 pytest-simcore @ ../../packages/pytest-simcore
 simcore-service-library[fastapi] @ ../../packages/service-library
```

services/clusters-keeper/requirements/dev.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -18,6 +18,7 @@
 --editable ../../packages/pytest-simcore
 --editable ../../packages/service-library[fastapi]
 --editable ../../packages/settings-library
+--editable ../../packages/dask-task-models-library
 
 # installs current package
 --editable .
```

services/dask-sidecar/src/simcore_service_dask_sidecar/rabbitmq_plugin.py renamed to services/dask-sidecar/src/simcore_service_dask_sidecar/rabbitmq_worker_plugin.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -24,7 +24,7 @@
 class RabbitMQPlugin(distributed.WorkerPlugin):
     """Dask Worker Plugin for RabbitMQ integration"""
 
-    name = "rabbitmq_plugin"
+    name = "rabbitmq_worker_plugin"
     _main_thread_loop: AbstractEventLoop | None = None
     _client: RabbitMQClient | None = None
     _settings: RabbitSettings | None = None
@@ -60,7 +60,7 @@ async def _() -> None:
 
         if threading.current_thread() is not threading.main_thread():
             _logger.warning(
-                "RabbitMQ client plugin setup is not in the main thread! Beware! if in pytest it's ok."
+                "RabbitMQ client plugin setup is not in the main thread! TIP: if in pytest it's ok."
             )
 
         with log_context(
@@ -98,7 +98,7 @@ async def _() -> None:
             )
         else:
             _logger.warning(
-                "RabbitMQ client plugin setup is not the main thread!"
+                "RabbitMQ client plugin setup is not the main thread! TIP: if in pytest it's ok."
             )
 
         # Cancel the message processor task
```

services/dask-sidecar/src/simcore_service_dask_sidecar/scheduler.py

Lines changed: 9 additions & 2 deletions

```diff
@@ -1,6 +1,9 @@
 import logging
 
 import distributed
+from dask_task_models_library.plugins.task_life_cycle_scheduler_plugin import (
+    TaskLifecycleSchedulerPlugin,
+)
 from servicelib.logging_utils import log_context
 
 from ._meta import print_dask_scheduler_banner
@@ -19,9 +22,13 @@ async def dask_setup(scheduler: distributed.Scheduler) -> None:
 
     with log_context(_logger, logging.INFO, "Launch dask scheduler"):
         _logger.info("app settings: %s", settings.model_dump_json(indent=1))
+
+        scheduler.add_plugin(TaskLifecycleSchedulerPlugin())
         print_dask_scheduler_banner()
 
 
-async def dask_teardown(_worker: distributed.Worker) -> None:
-    with log_context(_logger, logging.INFO, "Tear down dask scheduler"):
+async def dask_teardown(scheduler: distributed.Scheduler) -> None:
+    with log_context(
+        _logger, logging.INFO, f"Tear down dask scheduler at {scheduler.address}"
+    ):
         ...
```
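Note that this hunk also fixes a bug: dask_teardown previously took a Worker argument even though this module is a scheduler preload. For reference, a minimal sketch of the preload contract dask-distributed expects from such a module (log lines illustrative):

```python
# Illustrative only -- not part of the commit.
import distributed


async def dask_setup(scheduler: distributed.Scheduler) -> None:
    # Called once when the preloading scheduler starts.
    print(f"scheduler listening at {scheduler.address}")


async def dask_teardown(scheduler: distributed.Scheduler) -> None:
    # Called once when the scheduler shuts down.
    print(f"scheduler at {scheduler.address} shutting down")
```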
