From 7bbc90ba334629d51898ff83d41bd26d2b80e3bb Mon Sep 17 00:00:00 2001
From: sanderegg <35365065+sanderegg@users.noreply.github.com>
Date: Tue, 19 Nov 2024 11:59:53 +0100
Subject: [PATCH 01/27] ConfigurationError

---
 .../src/simcore_service_director_v2/core/errors.py   | 11 +++--------
 .../modules/comp_scheduler/_scheduler_factory.py     |  2 +-
 .../modules/dask_clients_pool.py                     |  2 +-
 .../modules/db/repositories/comp_tasks/_utils.py     | 10 ++++++----
 .../simcore_service_director_v2/modules/rabbitmq.py  |  4 ++--
 .../utils/dask_client_utils.py                       | 10 +++++-----
 6 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/services/director-v2/src/simcore_service_director_v2/core/errors.py b/services/director-v2/src/simcore_service_director_v2/core/errors.py
index 65af83fa28f1..07fc958b92ec 100644
--- a/services/director-v2/src/simcore_service_director_v2/core/errors.py
+++ b/services/director-v2/src/simcore_service_director_v2/core/errors.py
@@ -25,17 +25,12 @@ from models_library.projects_nodes_io import NodeID
 
 
-class DirectorError(Exception):
-    """Basic exception"""
+class DirectorError(OsparcErrorMixin, RuntimeError):
+    msg_template: str = "Director-v2 unexpected error"
 
 
 class ConfigurationError(DirectorError):
-    """An error in the director-v2 configuration"""
-
-    def __init__(self, msg: str | None = None):
-        super().__init__(
-            msg or "Invalid configuration of the director-v2 application. Please check."
-        )
+    msg_template: str = "Application misconfiguration: {msg}"
 
 
 class GenericDockerError(DirectorError):
diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py
index 39b432b94925..524dfc7e8ad5 100644
--- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py
+++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py
@@ -18,7 +18,7 @@ async def create_from_db(app: FastAPI) -> BaseCompScheduler:
     if not hasattr(app.state, "engine"):
         msg = "Database connection is missing. Please check application configuration."
-        raise ConfigurationError(msg)
+        raise ConfigurationError(msg=msg)
     db_engine = app.state.engine
 
     with log_context(
diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dask_clients_pool.py b/services/director-v2/src/simcore_service_director_v2/modules/dask_clients_pool.py
index d246bb35f428..31177b5a6162 100644
--- a/services/director-v2/src/simcore_service_director_v2/modules/dask_clients_pool.py
+++ b/services/director-v2/src/simcore_service_director_v2/modules/dask_clients_pool.py
@@ -50,7 +50,7 @@ async def create(
     def instance(app: FastAPI) -> "DaskClientsPool":
         if not hasattr(app.state, "dask_clients_pool"):
             msg = "Dask clients pool is not available. Please check the configuration."
-            raise ConfigurationError(msg)
+            raise ConfigurationError(msg=msg)
         dask_clients_pool: DaskClientsPool = app.state.dask_clients_pool
         return dask_clients_pool
 
diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py
index bdb64cbbf993..637e0c7faf6f 100644
--- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py
+++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py
@@ -322,7 +322,7 @@ def _by_type_name(ec2: EC2InstanceTypeGet) -> bool:
             f"invalid EC2 type name selected {set(hardware_info.aws_ec2_instances)}."
             " TIP: adjust product configuration"
         )
-        raise ConfigurationError(msg) from exc
+        raise ConfigurationError(msg=msg) from exc
     except (
        RemoteMethodNotRegisteredError,
        RPCServerError,
@@ -450,9 +450,11 @@ async def generate_tasks_list_from_project(
             last_heartbeat=None,
             created=arrow.utcnow().datetime,
             modified=arrow.utcnow().datetime,
-            pricing_info=pricing_info.model_dump(exclude={"pricing_unit_cost"})
-            if pricing_info
-            else None,
+            pricing_info=(
+                pricing_info.model_dump(exclude={"pricing_unit_cost"})
+                if pricing_info
+                else None
+            ),
             hardware_info=hardware_info,
         )
 
diff --git a/services/director-v2/src/simcore_service_director_v2/modules/rabbitmq.py b/services/director-v2/src/simcore_service_director_v2/modules/rabbitmq.py
index dcda51ad0e51..a7cb4e1ba27a 100644
--- a/services/director-v2/src/simcore_service_director_v2/modules/rabbitmq.py
+++ b/services/director-v2/src/simcore_service_director_v2/modules/rabbitmq.py
@@ -81,7 +81,7 @@ async def on_shutdown() -> None:
 
 
 def get_rabbitmq_client(app: FastAPI) -> RabbitMQClient:
     if not hasattr(app.state, "rabbitmq_client"):
         msg = "RabbitMQ client is not available. Please check the configuration."
-        raise ConfigurationError(msg)
+        raise ConfigurationError(msg=msg)
     return cast(RabbitMQClient, app.state.rabbitmq_client)
@@ -90,5 +90,5 @@ def get_rabbitmq_rpc_client(app: FastAPI) -> RabbitMQRPCClient:
         msg = (
             "RabbitMQ client for RPC is not available. Please check the configuration."
         )
-        raise ConfigurationError(msg)
+        raise ConfigurationError(msg=msg)
     return cast(RabbitMQRPCClient, app.state.rabbitmq_rpc_client)
diff --git a/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py b/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py
index 15e6e98dfce2..fca890c6128e 100644
--- a/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py
+++ b/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py
@@ -101,7 +101,7 @@ async def _connect_to_dask_scheduler(
         )
     except TypeError as exc:
         msg = f"Scheduler has invalid configuration: {endpoint=}"
-        raise ConfigurationError(msg) from exc
+        raise ConfigurationError(msg=msg) from exc
 
 
 async def _connect_with_gateway_and_create_cluster(
@@ -155,7 +155,7 @@ async def _connect_with_gateway_and_create_cluster(
 
     except TypeError as exc:
         msg = f"Cluster has invalid configuration: {endpoint=}, {auth_params=}"
-        raise ConfigurationError(msg) from exc
+        raise ConfigurationError(msg=msg) from exc
     except ValueError as exc:
         # this is when a 404=NotFound,422=MalformedData comes up
         raise DaskClientRequestError(endpoint=endpoint, error=exc) from exc
@@ -196,10 +196,10 @@ async def get_gateway_auth_from_params(
             return dask_gateway.JupyterHubAuth(auth_params.api_token)
     except (TypeError, ValueError) as exc:
         msg = f"Cluster has invalid configuration: {auth_params}"
-        raise ConfigurationError(msg) from exc
+        raise ConfigurationError(msg=msg) from exc
 
     msg = f"Cluster has invalid configuration: {auth_params=}"
-    raise ConfigurationError(msg)
+    raise ConfigurationError(msg=msg)
 
 
 _PING_TIMEOUT_S: Final[int] = 5
@@ -251,4 +251,4 @@ async def test_scheduler_endpoint(
         ) as exc:
             logger.debug("Pinging %s, failed: %s", f"{endpoint=}", f"{exc=!r}")
         msg = f"Could not connect to cluster in {endpoint}: error: {exc}"
-        raise ConfigurationError(msg) from exc
+        raise ConfigurationError(msg=msg) from exc

From 43a2a8071642da3db1b9a02e14bc4d7a14a31c5d Mon Sep 17 00:00:00 2001
From: sanderegg <35365065+sanderegg@users.noreply.github.com>
Date: Tue, 19 Nov 2024 12:18:22 +0100
Subject: [PATCH 02/27] pruning

---
 .../core/errors.py                             | 65 ++-----------------
 .../modules/db/repositories/comp_pipelines.py  |  6 +-
 .../modules/db/repositories/projects.py        |  2 +-
 .../db/repositories/projects_networks.py       |  2 +-
 .../modules/resource_usage_tracker_client.py   |  4 +-
 5 files changed, 12 insertions(+), 67 deletions(-)

diff --git a/services/director-v2/src/simcore_service_director_v2/core/errors.py b/services/director-v2/src/simcore_service_director_v2/core/errors.py
index 07fc958b92ec..285e4d13a3c7 100644
--- a/services/director-v2/src/simcore_service_director_v2/core/errors.py
+++ b/services/director-v2/src/simcore_service_director_v2/core/errors.py
@@ -33,77 +33,20 @@ class ConfigurationError(DirectorError):
     msg_template: str = "Application misconfiguration: {msg}"
 
 
-class GenericDockerError(DirectorError):
-    """Generic docker library error"""
-
-    def __init__(self, msg: str, original_exception: Exception):
-        super().__init__(msg + f": {original_exception}")
-        self.original_exception = original_exception
-
-
-class ServiceNotAvailableError(DirectorError):
-    """Service not found"""
-
-    def __init__(self, service_name: str, service_tag: str | None = None):
-        service_tag = service_tag or "UNDEFINED"
-        super().__init__(f"The service {service_name}:{service_tag} does not exist")
-        self.service_name = service_name
-        self.service_tag = service_tag
-
-
-class ServiceUUIDNotFoundError(DirectorError):
-    """Service not found"""
-
-    def __init__(self, service_uuid: str):
-        super().__init__(f"The service with uuid {service_uuid} was not found")
-        self.service_uuid = service_uuid
-
-
-class ServiceUUIDInUseError(DirectorError):
-    """Service UUID is already in use"""
-
-    def __init__(self, service_uuid: str):
-        super().__init__(f"The service uuid {service_uuid} is already in use")
-        self.service_uuid = service_uuid
-
-
-class ServiceStartTimeoutError(DirectorError):
-    """The service was created but never run (time-out)"""
-
-    def __init__(self, service_name: str, service_uuid: str):
-        super().__init__(f"Service {service_name}:{service_uuid} failed to start ")
-        self.service_name = service_name
-        self.service_uuid = service_uuid
-
-
 class ProjectNotFoundError(DirectorError):
-    """Project not found error"""
-
-    def __init__(self, project_id: ProjectID):
-        super().__init__(f"project {project_id} not found")
-        self.project_id = project_id
+    msg_template: str = "project {project_id} not found"
 
 
 class ProjectNetworkNotFoundError(DirectorError):
-    """Project not found error"""
-
-    def __init__(self, project_id: ProjectID):
-        super().__init__(f"no networks forund for project {project_id}")
-        self.project_id = project_id
+    msg_template: str = "no networks found for project {project_id}"
 
 
 class PricingPlanUnitNotFoundError(DirectorError):
-    """Pricing plan unit not found error"""
-
-    def __init__(self, msg: str):
-        super().__init__(msg)
+    msg_template: str = "pricing plan not found {msg}"
 
 
 class PipelineNotFoundError(DirectorError):
-    """Pipeline not found error"""
-
-    def __init__(self, pipeline_id: str):
-        super().__init__(f"pipeline {pipeline_id} not found")
+    msg_template: str = "pipeline {pipeline_id} not found"
 
 
 class ComputationalRunNotFoundError(OsparcErrorMixin, DirectorError):
diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_pipelines.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_pipelines.py
index 37129141f6d0..38981b5fa7d1 100644
--- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_pipelines.py
+++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_pipelines.py
@@ -25,7 +25,7 @@ async def get_pipeline(self, project_id: ProjectID) -> CompPipelineAtDB:
         )
         row: RowProxy | None = await result.fetchone()
         if not row:
-            raise PipelineNotFoundError(str(project_id))
+            raise PipelineNotFoundError(pipeline_id=project_id)
         return CompPipelineAtDB.model_validate(row)
 
     async def upsert_pipeline(
@@ -39,7 +39,9 @@ async def upsert_pipeline(
             dag_adjacency_list=nx.to_dict_of_lists(dag_graph),
             state=RunningState.PUBLISHED if publish else RunningState.NOT_STARTED,
         )
-        insert_stmt = insert(comp_pipeline).values(**pipeline_at_db.model_dump(by_alias=True))
+        insert_stmt = insert(comp_pipeline).values(
+            **pipeline_at_db.model_dump(by_alias=True)
+        )
         # FIXME: This is not a nice thing. this part of the information should be kept in comp_runs.
         update_exclusion_policy = set()
         if not dag_graph.nodes():
diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/projects.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/projects.py
index 5f5fe5263fff..902f99775744 100644
--- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/projects.py
+++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/projects.py
@@ -22,7 +22,7 @@ async def get_project(self, project_id: ProjectID) -> ProjectAtDB:
             )
         ).first()
         if not row:
-            raise ProjectNotFoundError(project_id)
+            raise ProjectNotFoundError(project_id=project_id)
         return ProjectAtDB.model_validate(row)
 
     async def is_node_present_in_workbench(
diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/projects_networks.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/projects_networks.py
index 59334aa0a060..12fc7fe29322 100644
--- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/projects_networks.py
+++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/projects_networks.py
@@ -22,7 +22,7 @@ async def get_projects_networks(self, project_id: ProjectID) -> ProjectsNetworks
             )
         ).first()
         if not row:
-            raise ProjectNetworkNotFoundError(project_id)
+            raise ProjectNetworkNotFoundError(project_id=project_id)
         return ProjectsNetworks.model_validate(row)
 
     async def upsert_projects_networks(
diff --git a/services/director-v2/src/simcore_service_director_v2/modules/resource_usage_tracker_client.py b/services/director-v2/src/simcore_service_director_v2/modules/resource_usage_tracker_client.py
index 58d02975fd70..3b75607989d1 100644
--- a/services/director-v2/src/simcore_service_director_v2/modules/resource_usage_tracker_client.py
+++ b/services/director-v2/src/simcore_service_director_v2/modules/resource_usage_tracker_client.py
@@ -91,7 +91,7 @@ async def get_default_service_pricing_plan(
         )
         if response.status_code == status.HTTP_404_NOT_FOUND:
             msg = "No pricing plan defined"
-            raise PricingPlanUnitNotFoundError(msg)
+            raise PricingPlanUnitNotFoundError(msg=msg)
         response.raise_for_status()
         return PricingPlanGet.model_validate(response.json())
 
@@ -117,7 +117,7 @@ async def get_default_pricing_and_hardware_info(
                         unit.specific_info.aws_ec2_instances,
                     )
         msg = "Default pricing plan and unit does not exist"
-        raise PricingPlanUnitNotFoundError(msg)
+        raise PricingPlanUnitNotFoundError(msg=msg)
 
     async def get_pricing_unit(
         self,

From 53b7cde3c38acc8d2657f69cf886521936e5e6d3 Mon Sep 17 00:00:00 2001
From: sanderegg <35365065+sanderegg@users.noreply.github.com>
Date: Tue, 19 Nov 2024 12:43:38 +0100
Subject: [PATCH 03/27] pruning

---
 .../api/routes/computations.py                 |   10 +-
 .../core/errors.py                             |   90 +-
 .../modules/comp_scheduler/_base_scheduler.py  |   15 +-
 .../modules/comp_scheduler/_dask_scheduler.py  |    3 +-
 .../modules/comp_scheduler/_scheduler_base.py  |  792 ++++++++
 .../simcore_service_director_v2/utils/dask.py  |   19 +-
 .../utils/dask_client_utils.py                 |    6 +-
 ...t_modules_comp_scheduler_dask_scheduler.py  | 1630 +++++++++++++++++
 ...t_modules_comp_scheduler_dask_scheduler.py  |    4 +-
 9 files changed, 2488 insertions(+), 81 deletions(-)
 create mode 100644 services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py
 create mode 100644 services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py

diff --git
a/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py b/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py index e624ed0785cb..4dd160ed6f40 100644 --- a/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py +++ b/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py @@ -53,10 +53,10 @@ ClusterNotFoundError, ClustersKeeperNotAvailableError, ComputationalRunNotFoundError, + ComputationalSchedulerError, ConfigurationError, PricingPlanUnitNotFoundError, ProjectNotFoundError, - SchedulerError, WalletNotEnoughCreditsError, ) from ...models.comp_pipelines import CompPipelineAtDB @@ -510,7 +510,9 @@ async def get_computation( pipeline_details=pipeline_details, url=TypeAdapter(AnyHttpUrl).validate_python(f"{request.url}"), stop_url=( - TypeAdapter(AnyHttpUrl).validate_python(f"{self_url}:stop?user_id={user_id}") + TypeAdapter(AnyHttpUrl).validate_python( + f"{self_url}:stop?user_id={user_id}" + ) if pipeline_state.is_running() else None ), @@ -598,7 +600,7 @@ async def stop_computation( except ProjectNotFoundError as e: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"{e}") from e - except SchedulerError as e: + except ComputationalSchedulerError as e: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"{e}") from e @@ -639,7 +641,7 @@ async def delete_computation( # abort the pipeline first try: await scheduler.stop_pipeline(computation_stop.user_id, project_id) - except SchedulerError as e: + except ComputationalSchedulerError as e: _logger.warning( "Project %s could not be stopped properly.\n reason: %s", project_id, diff --git a/services/director-v2/src/simcore_service_director_v2/core/errors.py b/services/director-v2/src/simcore_service_director_v2/core/errors.py index 285e4d13a3c7..eb605c307eab 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/errors.py +++ b/services/director-v2/src/simcore_service_director_v2/core/errors.py @@ -49,71 +49,47 @@ class PipelineNotFoundError(DirectorError): msg_template: str = "pipeline {pipeline_id} not found" -class ComputationalRunNotFoundError(OsparcErrorMixin, DirectorError): +class ComputationalRunNotFoundError(DirectorError): msg_template = "Computational run not found" -class ComputationalTaskNotFoundError(OsparcErrorMixin, DirectorError): +class ComputationalTaskNotFoundError(DirectorError): msg_template = "Computational task {node_id} not found" -class WalletNotEnoughCreditsError(OsparcErrorMixin, DirectorError): +class WalletNotEnoughCreditsError(DirectorError): msg_template = "Wallet '{wallet_name}' has {wallet_credit_amount} credits." 
# # SCHEDULER ERRORS # +class ComputationalSchedulerError(DirectorError): + msg_template = "Computational scheduler unexpected error" -class SchedulerError(DirectorError): - def __init__(self, msg: str | None = None): - super().__init__(msg or "Unexpected error in the scheduler") +class InvalidPipelineError(ComputationalSchedulerError): + msg_template = "Computational scheduler: Invalid configuration of pipeline {pipeline_id}: {msg}" -class InvalidPipelineError(SchedulerError): - """A pipeline is misconfigured""" - - def __init__(self, pipeline_id: str, msg: str | None = None): - super().__init__(msg or f"Invalid configuration of pipeline {pipeline_id}") - - -class TaskSchedulingError(SchedulerError): - """A task cannot be scheduled""" - - code: str = "task scheduler error" - - def __init__(self, project_id: ProjectID, node_id: NodeID, msg: str | None = None): - super().__init__(msg=msg) - self.project_id = project_id - self.node_id = node_id - - def get_errors(self) -> list[ErrorDict]: - # default implementation - return [ - { - "loc": ( - f"{self.project_id}", - f"{self.node_id}", - ), - "msg": f"{self.args[0]}", - "type": self.code, - }, - ] +class TaskSchedulingError(ComputationalSchedulerError): + msg_template = "Computational scheduler: Task {node_id} in project {project_id} could not be scheduled {msg}" class MissingComputationalResourcesError(TaskSchedulingError): - """A task cannot be scheduled because the cluster does not have the required resources""" - - def __init__(self, project_id: ProjectID, node_id: NodeID, msg: str | None = None): - super().__init__(project_id, node_id, msg=msg) + msg_template = ( + "Service {service_name}:{service_version} cannot be scheduled " + "on cluster {cluster_id}: task needs '{task_resources}', " + "cluster has {cluster_resources}", + ) class InsuficientComputationalResourcesError(TaskSchedulingError): - """A task cannot be scheduled because the cluster does not have *enough* of the required resources""" - - def __init__(self, project_id: ProjectID, node_id: NodeID, msg: str | None = None): - super().__init__(project_id, node_id, msg=msg) + msg_template: str = ( + "Insufficient computational resources to run {service_name}:{service_version} with {service_requested_resources} on cluster {cluster_id}." 
+ "Cluster available workers: {cluster_available_resources}" + "TIP: Reduce service required resources or contact oSparc support" + ) class PortsValidationError(TaskSchedulingError): @@ -158,33 +134,33 @@ def get_errors(self) -> list[ErrorDict]: return value_errors -class ComputationalSchedulerChangedError(OsparcErrorMixin, SchedulerError): +class ComputationalSchedulerChangedError(ComputationalSchedulerError): msg_template = "The dask scheduler ID changed from '{original_scheduler_id}' to '{current_scheduler_id}'" -class ComputationalBackendNotConnectedError(OsparcErrorMixin, SchedulerError): +class ComputationalBackendNotConnectedError(ComputationalSchedulerError): msg_template = "The dask computational backend is not connected" -class ComputationalBackendNoS3AccessError(OsparcErrorMixin, SchedulerError): +class ComputationalBackendNoS3AccessError(ComputationalSchedulerError): msg_template = "The S3 backend is not ready, please try again later" -class ComputationalBackendTaskNotFoundError(OsparcErrorMixin, SchedulerError): +class ComputationalBackendTaskNotFoundError(ComputationalSchedulerError): msg_template = ( "The dask computational backend does not know about the task '{job_id}'" ) -class ComputationalBackendTaskResultsNotReadyError(OsparcErrorMixin, SchedulerError): +class ComputationalBackendTaskResultsNotReadyError(ComputationalSchedulerError): msg_template = "The task result is not ready yet for job '{job_id}'" -class ClustersKeeperNotAvailableError(OsparcErrorMixin, SchedulerError): +class ClustersKeeperNotAvailableError(ComputationalSchedulerError): msg_template = "clusters-keeper service is not available!" -class ComputationalBackendOnDemandNotReadyError(OsparcErrorMixin, SchedulerError): +class ComputationalBackendOnDemandNotReadyError(ComputationalSchedulerError): msg_template = ( "The on demand computational cluster is not ready 'est. 
remaining time: {eta}'" ) @@ -193,15 +169,15 @@ class ComputationalBackendOnDemandNotReadyError(OsparcErrorMixin, SchedulerError # # SCHEDULER/CLUSTER ERRORS # -class ClusterNotFoundError(OsparcErrorMixin, SchedulerError): +class ClusterNotFoundError(ComputationalSchedulerError): msg_template = "The cluster '{cluster_id}' not found" -class ClusterAccessForbiddenError(OsparcErrorMixin, SchedulerError): +class ClusterAccessForbiddenError(ComputationalSchedulerError): msg_template = "Insufficient rights to access cluster '{cluster_id}'" -class ClusterInvalidOperationError(OsparcErrorMixin, SchedulerError): +class ClusterInvalidOperationError(ComputationalSchedulerError): msg_template = "Invalid operation on cluster '{cluster_id}'" @@ -210,21 +186,21 @@ class ClusterInvalidOperationError(OsparcErrorMixin, SchedulerError): # -class DaskClientRequestError(OsparcErrorMixin, SchedulerError): +class DaskClientRequestError(ComputationalSchedulerError): msg_template = ( "The dask client to cluster on '{endpoint}' did an invalid request '{error}'" ) -class DaskClusterError(OsparcErrorMixin, SchedulerError): +class DaskClusterError(ComputationalSchedulerError): msg_template = "The dask cluster on '{endpoint}' encountered an error: '{error}'" -class DaskGatewayServerError(OsparcErrorMixin, SchedulerError): +class DaskGatewayServerError(ComputationalSchedulerError): msg_template = "The dask gateway on '{endpoint}' encountered an error: '{error}'" -class DaskClientAcquisisitonError(OsparcErrorMixin, SchedulerError): +class DaskClientAcquisisitonError(ComputationalSchedulerError): msg_template = ( "The dask client to cluster '{cluster}' encountered an error '{error}'" ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py index 5fd39dda4bc2..9dfae4bc6660 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py @@ -45,10 +45,10 @@ ComputationalBackendNotConnectedError, ComputationalBackendOnDemandNotReadyError, ComputationalSchedulerChangedError, + ComputationalSchedulerError, DaskClientAcquisisitonError, InvalidPipelineError, PipelineNotFoundError, - SchedulerError, TaskSchedulingError, ) from ...core.settings import ComputationalBackendSettings @@ -242,7 +242,7 @@ async def stop_pipeline( } if not possible_iterations: msg = f"There are no pipeline scheduled for {user_id}:{project_id}" - raise SchedulerError(msg) + raise ComputationalSchedulerError(msg) current_max_iteration = max(possible_iterations) selected_iteration = current_max_iteration else: @@ -281,7 +281,7 @@ def _get_last_iteration(self, user_id: UserID, project_id: ProjectID) -> Iterati } if not possible_iterations: msg = f"There are no pipeline scheduled for {user_id}:{project_id}" - raise SchedulerError(msg) + raise ComputationalSchedulerError(msg) return max(possible_iterations) def _start_scheduling( @@ -342,10 +342,10 @@ async def _get_pipeline_tasks( } if len(pipeline_comp_tasks) != len(pipeline_dag.nodes()): # type: ignore[arg-type] msg = ( - f"{project_id}The tasks defined for {project_id} do not contain all" + f"The tasks defined for {project_id} do not contain all" f" the tasks defined in the pipeline [{list(pipeline_dag.nodes)}]! Please check." 
) - raise InvalidPipelineError(msg) + raise InvalidPipelineError(pipeline_id=project_id, msg=msg) return pipeline_comp_tasks async def _update_run_result_from_tasks( @@ -929,10 +929,11 @@ async def _schedule_tasks_to_start( # noqa: C901 comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED raise except TaskSchedulingError as exc: + exc.error_context()["project_id"] _logger.exception( "Project '%s''s task '%s' could not be scheduled", - exc.project_id, - exc.node_id, + exc.error_context()["project_id"], + exc.error_context()["node_id"], ) await CompTasksRepository.instance( self.db_engine diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py index 2fdf7acd2e9e..e578715eb799 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py @@ -345,11 +345,10 @@ async def _process_task_result( except TaskSchedulingError as err: task_final_state = RunningState.FAILED simcore_platform_status = SimcorePlatformStatus.BAD - errors = err.get_errors() _logger.debug( "Unexpected failure while processing results of %s: %s", f"{task=}", - f"{errors=}", + f"{err=}", ) # resource tracking diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py new file mode 100644 index 000000000000..ce15311f9949 --- /dev/null +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -0,0 +1,792 @@ +"""The scheduler shall be run as a background task. +Based on oSparc pipelines, it monitors when to start the next worker task(s), either one at a time or as a group of tasks. + +In principle the Scheduler maintains the comp_runs table in the database. +It contains how the pipeline was run and by whom. +It also contains the final result of the pipeline run. + +When a pipeline is scheduled first all the tasks contained in the DAG are set to PUBLISHED state. +Once the scheduler determines a task shall run, its state is set to PENDING, so that the sidecar can pick up the task. +The sidecar will then change the state to STARTED, then to SUCCESS or FAILED. 
+ +""" + +import asyncio +import datetime +import logging +from abc import ABC, abstractmethod +from collections.abc import Callable +from dataclasses import dataclass +from typing import Final + +import arrow +import networkx as nx +from aiopg.sa.engine import Engine +from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID, NodeIDStr +from models_library.projects_state import RunningState +from models_library.services import ServiceKey, ServiceType, ServiceVersion +from models_library.users import UserID +from networkx.classes.reportviews import InDegreeView +from servicelib.common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE +from servicelib.logging_utils import log_context +from servicelib.rabbitmq import RabbitMQClient, RabbitMQRPCClient +from servicelib.redis import RedisClientSDK + +from ...constants import UNDEFINED_STR_METADATA +from ...core.errors import ( + ClustersKeeperNotAvailableError, + ComputationalBackendNotConnectedError, + ComputationalBackendOnDemandNotReadyError, + ComputationalSchedulerChangedError, + DaskClientAcquisisitonError, + InvalidPipelineError, + PipelineNotFoundError, + TaskSchedulingError, +) +from ...core.settings import ComputationalBackendSettings +from ...models.comp_pipelines import CompPipelineAtDB +from ...models.comp_runs import CompRunsAtDB, Iteration, RunMetadataDict +from ...models.comp_tasks import CompTaskAtDB +from ...utils.computations import get_pipeline_state_from_task_states +from ...utils.rabbitmq import ( + publish_project_log, + publish_service_resource_tracking_heartbeat, + publish_service_resource_tracking_started, + publish_service_started_metrics, +) +from ..db.repositories.comp_pipelines import CompPipelinesRepository +from ..db.repositories.comp_runs import CompRunsRepository +from ..db.repositories.comp_tasks import CompTasksRepository +from ._utils import ( + COMPLETED_STATES, + PROCESSING_STATES, + RUNNING_STATES, + TASK_TO_START_STATES, + WAITING_FOR_START_STATES, + create_service_resources_from_task, + get_resource_tracking_run_id, +) + +_logger = logging.getLogger(__name__) + + +_Previous = CompTaskAtDB +_Current = CompTaskAtDB +_MAX_WAITING_FOR_CLUSTER_TIMEOUT_IN_MIN: Final[int] = 10 + + +@dataclass(frozen=True, slots=True) +class SortedTasks: + started: list[CompTaskAtDB] + completed: list[CompTaskAtDB] + waiting: list[CompTaskAtDB] + potentially_lost: list[CompTaskAtDB] + + +async def _triage_changed_tasks( + changed_tasks: list[tuple[_Previous, _Current]] +) -> SortedTasks: + started_tasks = [ + current + for previous, current in changed_tasks + if current.state in RUNNING_STATES + or ( + previous.state in WAITING_FOR_START_STATES + and current.state in COMPLETED_STATES + ) + ] + + # NOTE: some tasks can be both started and completed since we might have the time they were running + completed_tasks = [ + current for _, current in changed_tasks if current.state in COMPLETED_STATES + ] + + waiting_for_resources_tasks = [ + current + for previous, current in changed_tasks + if current.state in WAITING_FOR_START_STATES + ] + + lost_or_momentarily_lost_tasks = [ + current for _, current in changed_tasks if current.state is RunningState.UNKNOWN + ] + if lost_or_momentarily_lost_tasks: + _logger.warning( + "%s are currently in unknown state. TIP: If they are running in an external cluster and it is not yet ready, that might explain it. 
But inform @sanderegg nevertheless!", + [t.node_id for t in lost_or_momentarily_lost_tasks], + ) + + return SortedTasks( + started_tasks, + completed_tasks, + waiting_for_resources_tasks, + lost_or_momentarily_lost_tasks, + ) + + +@dataclass +class BaseCompScheduler(ABC): + db_engine: Engine + rabbitmq_client: RabbitMQClient + rabbitmq_rpc_client: RabbitMQRPCClient + settings: ComputationalBackendSettings + service_runtime_heartbeat_interval: datetime.timedelta + redis_client: RedisClientSDK + + async def _get_pipeline_dag(self, project_id: ProjectID) -> nx.DiGraph: + comp_pipeline_repo = CompPipelinesRepository.instance(self.db_engine) + pipeline_at_db: CompPipelineAtDB = await comp_pipeline_repo.get_pipeline( + project_id + ) + dag = pipeline_at_db.get_graph() + _logger.debug("%s: current %s", f"{project_id=}", f"{dag=}") + return dag + + async def _get_pipeline_tasks( + self, project_id: ProjectID, pipeline_dag: nx.DiGraph + ) -> dict[NodeIDStr, CompTaskAtDB]: + comp_tasks_repo = CompTasksRepository.instance(self.db_engine) + pipeline_comp_tasks: dict[NodeIDStr, CompTaskAtDB] = { + NodeIDStr(f"{t.node_id}"): t + for t in await comp_tasks_repo.list_computational_tasks(project_id) + if (f"{t.node_id}" in list(pipeline_dag.nodes())) + } + if len(pipeline_comp_tasks) != len(pipeline_dag.nodes()): # type: ignore[arg-type] + msg = ( + f"{project_id}The tasks defined for {project_id} do not contain all" + f" the tasks defined in the pipeline [{list(pipeline_dag.nodes)}]! Please check." + ) + raise InvalidPipelineError(msg) + return pipeline_comp_tasks + + async def _update_run_result_from_tasks( + self, + user_id: UserID, + project_id: ProjectID, + iteration: Iteration, + pipeline_tasks: dict[NodeIDStr, CompTaskAtDB], + ) -> RunningState: + pipeline_state_from_tasks: RunningState = get_pipeline_state_from_task_states( + list(pipeline_tasks.values()), + ) + _logger.debug( + "pipeline %s is currently in %s", + f"{user_id=}_{project_id=}_{iteration=}", + f"{pipeline_state_from_tasks}", + ) + await self._set_run_result( + user_id, project_id, iteration, pipeline_state_from_tasks + ) + return pipeline_state_from_tasks + + async def _set_run_result( + self, + user_id: UserID, + project_id: ProjectID, + iteration: Iteration, + run_result: RunningState, + ) -> None: + comp_runs_repo = CompRunsRepository.instance(self.db_engine) + await comp_runs_repo.set_run_result( + user_id=user_id, + project_id=project_id, + iteration=iteration, + result_state=run_result, + final_state=(run_result in COMPLETED_STATES), + ) + + async def _set_states_following_failed_to_aborted( + self, project_id: ProjectID, dag: nx.DiGraph + ) -> dict[NodeIDStr, CompTaskAtDB]: + tasks: dict[NodeIDStr, CompTaskAtDB] = await self._get_pipeline_tasks( + project_id, dag + ) + node_ids_to_set_as_aborted: set[NodeIDStr] = set() + for task in tasks.values(): + if task.state == RunningState.FAILED: + node_ids_to_set_as_aborted.update(nx.bfs_tree(dag, f"{task.node_id}")) + node_ids_to_set_as_aborted.remove(NodeIDStr(f"{task.node_id}")) + for node_id in node_ids_to_set_as_aborted: + tasks[NodeIDStr(f"{node_id}")].state = RunningState.ABORTED + if node_ids_to_set_as_aborted: + # update the current states back in DB + comp_tasks_repo = CompTasksRepository.instance(self.db_engine) + await comp_tasks_repo.update_project_tasks_state( + project_id, + [NodeID(n) for n in node_ids_to_set_as_aborted], + RunningState.ABORTED, + optional_progress=1.0, + optional_stopped=arrow.utcnow().datetime, + ) + return tasks + + async def 
_send_running_tasks_heartbeat( + self, + user_id: UserID, + project_id: ProjectID, + iteration: Iteration, + dag: nx.DiGraph, + ) -> None: + utc_now = arrow.utcnow().datetime + + def _need_heartbeat(task: CompTaskAtDB) -> bool: + if task.state not in RUNNING_STATES: + return False + if task.last_heartbeat is None: + assert task.start # nosec + return bool( + (utc_now - task.start.replace(tzinfo=datetime.UTC)) + > self.service_runtime_heartbeat_interval + ) + return bool( + (utc_now - task.last_heartbeat) + > self.service_runtime_heartbeat_interval + ) + + tasks: dict[NodeIDStr, CompTaskAtDB] = await self._get_pipeline_tasks( + project_id, dag + ) + if running_tasks := [t for t in tasks.values() if _need_heartbeat(t)]: + await asyncio.gather( + *( + publish_service_resource_tracking_heartbeat( + self.rabbitmq_client, + get_resource_tracking_run_id( + user_id, t.project_id, t.node_id, iteration + ), + ) + for t in running_tasks + ) + ) + comp_tasks_repo = CompTasksRepository(self.db_engine) + await asyncio.gather( + *( + comp_tasks_repo.update_project_task_last_heartbeat( + t.project_id, t.node_id, utc_now + ) + for t in running_tasks + ) + ) + + async def _get_changed_tasks_from_backend( + self, + user_id: UserID, + processing_tasks: list[CompTaskAtDB], + comp_run: CompRunsAtDB, + ) -> list[tuple[_Previous, _Current]]: + tasks_backend_status = await self._get_tasks_status( + user_id, processing_tasks, comp_run + ) + + return [ + ( + task, + task.copy(update={"state": backend_state}), + ) + for task, backend_state in zip( + processing_tasks, tasks_backend_status, strict=True + ) + if task.state is not backend_state + ] + + async def _process_started_tasks( + self, + tasks: list[CompTaskAtDB], + *, + user_id: UserID, + iteration: Iteration, + run_metadata: RunMetadataDict, + ) -> None: + utc_now = arrow.utcnow().datetime + + # resource tracking + await asyncio.gather( + *( + publish_service_resource_tracking_started( + self.rabbitmq_client, + service_run_id=get_resource_tracking_run_id( + user_id, t.project_id, t.node_id, iteration + ), + wallet_id=run_metadata.get("wallet_id"), + wallet_name=run_metadata.get("wallet_name"), + pricing_plan_id=( + t.pricing_info.get("pricing_plan_id") + if t.pricing_info + else None + ), + pricing_unit_id=( + t.pricing_info.get("pricing_unit_id") + if t.pricing_info + else None + ), + pricing_unit_cost_id=( + t.pricing_info.get("pricing_unit_cost_id") + if t.pricing_info + else None + ), + product_name=run_metadata.get( + "product_name", UNDEFINED_STR_METADATA + ), + simcore_user_agent=run_metadata.get( + "simcore_user_agent", UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE + ), + user_id=user_id, + user_email=run_metadata.get("user_email", UNDEFINED_STR_METADATA), + project_id=t.project_id, + project_name=run_metadata.get("project_metadata", {}).get( # type: ignore[arg-type] + "project_name", UNDEFINED_STR_METADATA + ), + node_id=t.node_id, + node_name=run_metadata.get("node_id_names_map", {}).get( + t.node_id, UNDEFINED_STR_METADATA + ), + parent_project_id=run_metadata.get("project_metadata", {}).get( + "parent_project_id" + ), + parent_node_id=run_metadata.get("project_metadata", {}).get( + "parent_node_id" + ), + root_parent_project_id=run_metadata.get("project_metadata", {}).get( + "root_parent_project_id" + ), + root_parent_project_name=run_metadata.get( + "project_metadata", {} + ).get("root_parent_project_name"), + root_parent_node_id=run_metadata.get("project_metadata", {}).get( + "root_parent_node_id" + ), + service_key=ServiceKey(t.image.name), + 
service_version=ServiceVersion(t.image.tag), + service_type=ServiceType.COMPUTATIONAL, + service_resources=create_service_resources_from_task(t), + service_additional_metadata={}, + ) + for t in tasks + ) + ) + # instrumentation + await asyncio.gather( + *( + publish_service_started_metrics( + self.rabbitmq_client, + user_id=user_id, + simcore_user_agent=run_metadata.get( + "simcore_user_agent", UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE + ), + task=t, + ) + for t in tasks + ) + ) + + # update DB + comp_tasks_repo = CompTasksRepository(self.db_engine) + await asyncio.gather( + *( + comp_tasks_repo.update_project_tasks_state( + t.project_id, + [t.node_id], + t.state, + optional_started=utc_now, + optional_progress=t.progress, + ) + for t in tasks + ) + ) + + async def _process_waiting_tasks(self, tasks: list[CompTaskAtDB]) -> None: + comp_tasks_repo = CompTasksRepository(self.db_engine) + await asyncio.gather( + *( + comp_tasks_repo.update_project_tasks_state( + t.project_id, + [t.node_id], + t.state, + ) + for t in tasks + ) + ) + + async def _update_states_from_comp_backend( + self, + user_id: UserID, + project_id: ProjectID, + iteration: Iteration, + pipeline_dag: nx.DiGraph, + comp_run: CompRunsAtDB, + ) -> None: + tasks = await self._get_pipeline_tasks(project_id, pipeline_dag) + tasks_inprocess = [t for t in tasks.values() if t.state in PROCESSING_STATES] + if not tasks_inprocess: + return + + # get the tasks which state actually changed since last check + tasks_with_changed_states = await self._get_changed_tasks_from_backend( + user_id, tasks_inprocess, comp_run + ) + # NOTE: typical states a task goes through + # NOT_STARTED (initial state) -> PUBLISHED (user press run/API call) -> PENDING -> WAITING_FOR_CLUSTER (cluster creation) -> + # PENDING -> WAITING_FOR_RESOURCES (workers creation or missing) -> PENDING -> STARTED (worker started processing the task) -> SUCCESS/FAILED + # or ABORTED (user cancelled) or UNKNOWN (lost task - it might be transient, be careful with this one) + sorted_tasks = await _triage_changed_tasks(tasks_with_changed_states) + + # now process the tasks + if sorted_tasks.started: + # NOTE: the dask-scheduler cannot differentiate between tasks that are effectively computing and + # tasks that are only queued and accepted by a dask-worker. + # tasks_started should therefore be mostly empty but for cases where + # - dask Pub/Sub mechanism failed, the tasks goes from PENDING -> SUCCESS/FAILED/ABORTED without STARTED + # - the task finished so fast that the STARTED state was skipped between 2 runs of the dv-2 comp scheduler + await self._process_started_tasks( + sorted_tasks.started, + user_id=user_id, + iteration=iteration, + run_metadata=comp_run.metadata, + ) + + if sorted_tasks.completed or sorted_tasks.potentially_lost: + await self._process_completed_tasks( + user_id, + sorted_tasks.completed + sorted_tasks.potentially_lost, + iteration, + comp_run=comp_run, + ) + + if sorted_tasks.waiting: + await self._process_waiting_tasks(sorted_tasks.waiting) + + @abstractmethod + async def _start_tasks( + self, + *, + user_id: UserID, + project_id: ProjectID, + scheduled_tasks: dict[NodeID, CompTaskAtDB], + comp_run: CompRunsAtDB, + wake_up_callback: Callable[[], None], + ) -> None: + ... + + @abstractmethod + async def _get_tasks_status( + self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB + ) -> list[RunningState]: + ... 
+ + @abstractmethod + async def _stop_tasks( + self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB + ) -> None: + ... + + @abstractmethod + async def _process_completed_tasks( + self, + user_id: UserID, + tasks: list[CompTaskAtDB], + iteration: Iteration, + comp_run: CompRunsAtDB, + ) -> None: + ... + + async def schedule_pipeline( + self, + *, + user_id: UserID, + project_id: ProjectID, + iteration: Iteration, + wake_up_callback: Callable[[], None], + ) -> None: + with log_context( + _logger, + level=logging.INFO, + msg=f"scheduling pipeline {user_id=}:{project_id=}:{iteration=}", + ): + dag: nx.DiGraph = nx.DiGraph() + try: + comp_run = await CompRunsRepository.instance(self.db_engine).get( + user_id, project_id, iteration + ) + dag = await self._get_pipeline_dag(project_id) + # 1. Update our list of tasks with data from backend (state, results) + await self._update_states_from_comp_backend( + user_id, project_id, iteration, dag, comp_run + ) + # 2. Any task following a FAILED task shall be ABORTED + comp_tasks = await self._set_states_following_failed_to_aborted( + project_id, dag + ) + # 3. do we want to stop the pipeline now? + if comp_run.cancelled: + await self._schedule_tasks_to_stop( + user_id, project_id, comp_tasks, comp_run + ) + else: + # let's get the tasks to schedule then + comp_tasks = await self._schedule_tasks_to_start( + user_id=user_id, + project_id=project_id, + comp_tasks=comp_tasks, + dag=dag, + comp_run=comp_run, + wake_up_callback=wake_up_callback, + ) + # 4. timeout if waiting for cluster has been there for more than X minutes + comp_tasks = await self._timeout_if_waiting_for_cluster_too_long( + user_id, project_id, comp_tasks + ) + # 5. send a heartbeat + await self._send_running_tasks_heartbeat( + user_id, project_id, iteration, dag + ) + + # 6. Update the run result + pipeline_result = await self._update_run_result_from_tasks( + user_id, project_id, iteration, comp_tasks + ) + + # 7. Are we done scheduling that pipeline? + if not dag.nodes() or pipeline_result in COMPLETED_STATES: + # there is nothing left, the run is completed, we're done here + _logger.info( + "pipeline %s scheduling completed with result %s", + f"{project_id=}", + f"{pipeline_result=}", + ) + except PipelineNotFoundError: + _logger.warning( + "pipeline %s does not exist in comp_pipeline table, it will be removed from scheduler", + f"{project_id=}", + ) + await self._set_run_result( + user_id, project_id, iteration, RunningState.ABORTED + ) + except InvalidPipelineError as exc: + _logger.warning( + "pipeline %s appears to be misconfigured, it will be removed from scheduler. 
Please check pipeline:\n%s", + f"{project_id=}", + exc, + ) + await self._set_run_result( + user_id, project_id, iteration, RunningState.ABORTED + ) + except (DaskClientAcquisisitonError, ClustersKeeperNotAvailableError): + _logger.exception( + "Unexpected error while connecting with computational backend, aborting pipeline" + ) + tasks: dict[NodeIDStr, CompTaskAtDB] = await self._get_pipeline_tasks( + project_id, dag + ) + comp_tasks_repo = CompTasksRepository(self.db_engine) + await comp_tasks_repo.update_project_tasks_state( + project_id, + [t.node_id for t in tasks.values()], + RunningState.FAILED, + ) + await self._set_run_result( + user_id, project_id, iteration, RunningState.FAILED + ) + except ComputationalBackendNotConnectedError: + _logger.exception("Computational backend is not connected!") + + async def _schedule_tasks_to_stop( + self, + user_id: UserID, + project_id: ProjectID, + comp_tasks: dict[NodeIDStr, CompTaskAtDB], + comp_run: CompRunsAtDB, + ) -> None: + # get any running task and stop them + comp_tasks_repo = CompTasksRepository.instance(self.db_engine) + await comp_tasks_repo.mark_project_published_waiting_for_cluster_tasks_as_aborted( + project_id + ) + # stop any remaining running task, these are already submitted + if tasks_to_stop := [ + t for t in comp_tasks.values() if t.state in PROCESSING_STATES + ]: + await self._stop_tasks(user_id, tasks_to_stop, comp_run) + + async def _schedule_tasks_to_start( # noqa: C901 + self, + user_id: UserID, + project_id: ProjectID, + comp_tasks: dict[NodeIDStr, CompTaskAtDB], + dag: nx.DiGraph, + comp_run: CompRunsAtDB, + wake_up_callback: Callable[[], None], + ) -> dict[NodeIDStr, CompTaskAtDB]: + # filter out the successfully completed tasks + dag.remove_nodes_from( + { + node_id + for node_id, t in comp_tasks.items() + if t.state == RunningState.SUCCESS + } + ) + dag_in_degree = dag.in_degree() + assert isinstance(dag_in_degree, InDegreeView) # nosec + next_task_node_ids = [ + node_id for node_id, degree in dag_in_degree if degree == 0 + ] + + # get the tasks to start + tasks_ready_to_start: dict[NodeID, CompTaskAtDB] = { + node_id: comp_tasks[NodeIDStr(f"{node_id}")] + for node_id in next_task_node_ids + if comp_tasks[NodeIDStr(f"{node_id}")].state in TASK_TO_START_STATES + } + + if not tasks_ready_to_start: + # nothing to do + return comp_tasks + + try: + await self._start_tasks( + user_id=user_id, + project_id=project_id, + scheduled_tasks=tasks_ready_to_start, + comp_run=comp_run, + wake_up_callback=wake_up_callback, + ) + except ( + ComputationalBackendNotConnectedError, + ComputationalSchedulerChangedError, + ): + _logger.exception( + "Issue with computational backend. 
Tasks are set back " + "to WAITING_FOR_CLUSTER state until scheduler comes back!", + ) + await CompTasksRepository.instance( + self.db_engine + ).update_project_tasks_state( + project_id, + list(tasks_ready_to_start.keys()), + RunningState.WAITING_FOR_CLUSTER, + ) + for task in tasks_ready_to_start: + comp_tasks[ + NodeIDStr(f"{task}") + ].state = RunningState.WAITING_FOR_CLUSTER + + except ComputationalBackendOnDemandNotReadyError as exc: + _logger.info( + "The on demand computational backend is not ready yet: %s", exc + ) + await publish_project_log( + self.rabbitmq_client, + user_id, + project_id, + log=f"{exc}", + log_level=logging.INFO, + ) + + await CompTasksRepository.instance( + self.db_engine + ).update_project_tasks_state( + project_id, + list(tasks_ready_to_start.keys()), + RunningState.WAITING_FOR_CLUSTER, + ) + for task in tasks_ready_to_start: + comp_tasks[ + NodeIDStr(f"{task}") + ].state = RunningState.WAITING_FOR_CLUSTER + except ClustersKeeperNotAvailableError: + _logger.exception("Unexpected error while starting tasks:") + await publish_project_log( + self.rabbitmq_client, + user_id, + project_id, + log="Unexpected error while scheduling computational tasks! TIP: contact osparc support.", + log_level=logging.ERROR, + ) + + await CompTasksRepository.instance( + self.db_engine + ).update_project_tasks_state( + project_id, + list(tasks_ready_to_start.keys()), + RunningState.FAILED, + optional_progress=1.0, + optional_stopped=arrow.utcnow().datetime, + ) + for task in tasks_ready_to_start: + comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED + raise + except TaskSchedulingError as exc: + _logger.exception("Project '%s' task could not be scheduled", project_id) + node_id = NodeID(exc.error_context()["node_id"]) + await CompTasksRepository.instance( + self.db_engine + ).update_project_tasks_state( + project_id, + [node_id], + RunningState.FAILED, + [{exc.error_code(): f"{exc}"}], + optional_progress=1.0, + optional_stopped=arrow.utcnow().datetime, + ) + comp_tasks[NodeIDStr(f"{node_id}")].state = RunningState.FAILED + except Exception: + _logger.exception( + "Unexpected error for %s with %s on %s happened when scheduling %s:", + f"{user_id=}", + f"{project_id=}", + f"{comp_run.cluster_id=}", + f"{tasks_ready_to_start.keys()=}", + ) + await CompTasksRepository.instance( + self.db_engine + ).update_project_tasks_state( + project_id, + list(tasks_ready_to_start.keys()), + RunningState.FAILED, + optional_progress=1.0, + optional_stopped=arrow.utcnow().datetime, + ) + for task in tasks_ready_to_start: + comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED + raise + + return comp_tasks + + async def _timeout_if_waiting_for_cluster_too_long( + self, + user_id: UserID, + project_id: ProjectID, + comp_tasks: dict[NodeIDStr, CompTaskAtDB], + ) -> dict[NodeIDStr, CompTaskAtDB]: + if all( + c.state is RunningState.WAITING_FOR_CLUSTER for c in comp_tasks.values() + ): + # get latest modified task + latest_modified_of_all_tasks = max( + comp_tasks.values(), key=lambda task: task.modified + ).modified + + if ( + arrow.utcnow().datetime - latest_modified_of_all_tasks + ) > datetime.timedelta(minutes=_MAX_WAITING_FOR_CLUSTER_TIMEOUT_IN_MIN): + await CompTasksRepository.instance( + self.db_engine + ).update_project_tasks_state( + project_id, + [NodeID(idstr) for idstr in comp_tasks], + RunningState.FAILED, + optional_progress=1.0, + optional_stopped=arrow.utcnow().datetime, + ) + for task in comp_tasks.values(): + task.state = RunningState.FAILED + msg = "Timed-out waiting for 
computational cluster! Please try again and/or contact Osparc support." + _logger.error(msg) + await publish_project_log( + self.rabbitmq_client, + user_id, + project_id, + log=msg, + log_level=logging.ERROR, + ) + return comp_tasks diff --git a/services/director-v2/src/simcore_service_director_v2/utils/dask.py b/services/director-v2/src/simcore_service_director_v2/utils/dask.py index d76596b5bf19..a42a28e392c1 100644 --- a/services/director-v2/src/simcore_service_director_v2/utils/dask.py +++ b/services/director-v2/src/simcore_service_director_v2/utils/dask.py @@ -608,18 +608,25 @@ def check_if_cluster_is_able_to_run_pipeline( raise MissingComputationalResourcesError( project_id=project_id, node_id=node_id, - msg=f"Service {node_image.name}:{node_image.tag} cannot be scheduled " - f"on cluster {cluster_id}: task needs '{task_resources}', " - f"cluster has {cluster_resources}", + service_name=node_image.name, + service_version=node_image.tag, + cluster_id=cluster_id, + task_resources=task_resources, + cluster_resources=cluster_resources, ) # well then our workers are not powerful enough raise InsuficientComputationalResourcesError( project_id=project_id, node_id=node_id, - msg=f"Insufficient computational resources to run {node_image.name}:{node_image.tag} with {_to_human_readable_resource_values( task_resources)} on cluster {cluster_id}." - f"Cluster available workers: {[_to_human_readable_resource_values( worker.get('resources', None)) for worker in workers.values()]}" - "TIP: Reduce service required resources or contact oSparc support", + service_name=node_image.name, + service_version=node_image.tag, + service_requested_resources=_to_human_readable_resource_values(task_resources), + cluster_id=cluster_id, + cluster_available_resources=[ + _to_human_readable_resource_values(worker.get("resources", None)) + for worker in workers.values() + ], ) diff --git a/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py b/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py index fca890c6128e..e1367e71c842 100644 --- a/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py @@ -26,11 +26,11 @@ from pydantic import AnyUrl from ..core.errors import ( + ComputationalSchedulerError, ConfigurationError, DaskClientRequestError, DaskClusterError, DaskGatewayServerError, - SchedulerError, ) from .dask import check_maximize_workers, wrap_client_async_routine @@ -220,7 +220,7 @@ async def test_scheduler_endpoint( ) as dask_client: if dask_client.status != _DASK_SCHEDULER_RUNNING_STATE: msg = "internal scheduler is not running!" 
- raise SchedulerError(msg) + raise ComputationalSchedulerError(msg) else: gateway_auth = await get_gateway_auth_from_params(authentication) @@ -247,7 +247,7 @@ async def test_scheduler_endpoint( ClientConnectionError, ClientResponseError, httpx.HTTPError, - SchedulerError, + ComputationalSchedulerError, ) as exc: logger.debug("Pinging %s, failed: %s", f"{endpoint=}", f"{exc=!r}") msg = f"Could not connect to cluster in {endpoint}: error: {exc}" diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py new file mode 100644 index 000000000000..6984fcea12c4 --- /dev/null +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py @@ -0,0 +1,1630 @@ +# pylint:disable=unused-variable +# pylint:disable=unused-argument +# pylint:disable=redefined-outer-name +# pylint:disable=no-value-for-parameter +# pylint:disable=protected-access +# pylint:disable=too-many-arguments +# pylint:disable=no-name-in-module +# pylint: disable=too-many-statements + + +import asyncio +import datetime +from collections.abc import AsyncIterator, Awaitable, Callable +from copy import deepcopy +from dataclasses import dataclass +from typing import Any, cast +from unittest import mock + +import aiopg +import aiopg.sa +import httpx +import pytest +from _helpers import PublishedProject, RunningProject +from dask.distributed import SpecCluster +from dask_task_models_library.container_tasks.errors import TaskCancelledError +from dask_task_models_library.container_tasks.events import TaskProgressEvent +from dask_task_models_library.container_tasks.io import TaskOutputData +from dask_task_models_library.container_tasks.protocol import TaskOwner +from faker import Faker +from fastapi.applications import FastAPI +from models_library.clusters import DEFAULT_CLUSTER_ID +from models_library.projects import ProjectAtDB, ProjectID +from models_library.projects_nodes_io import NodeID +from models_library.projects_state import RunningState +from models_library.rabbitmq_messages import ( + InstrumentationRabbitMessage, + RabbitResourceTrackingBaseMessage, + RabbitResourceTrackingHeartbeatMessage, + RabbitResourceTrackingMessages, + RabbitResourceTrackingStartedMessage, + RabbitResourceTrackingStoppedMessage, +) +from models_library.users import UserID +from pydantic import parse_obj_as, parse_raw_as +from pytest_mock.plugin import MockerFixture +from pytest_simcore.helpers.typing_env import EnvVarsDict +from servicelib.rabbitmq import RabbitMQClient +from servicelib.redis import CouldNotAcquireLockError +from settings_library.rabbit import RabbitSettings +from settings_library.redis import RedisSettings +from simcore_postgres_database.models.comp_runs import comp_runs +from simcore_postgres_database.models.comp_tasks import NodeClass, comp_tasks +from simcore_service_director_v2.core.application import init_app +from simcore_service_director_v2.core.errors import ( + ClustersKeeperNotAvailableError, + ComputationalBackendNotConnectedError, + ComputationalBackendOnDemandNotReadyError, + ComputationalBackendTaskNotFoundError, + ComputationalBackendTaskResultsNotReadyError, + ComputationalSchedulerChangedError, + ComputationalSchedulerError, + ConfigurationError, + PipelineNotFoundError, +) +from simcore_service_director_v2.core.settings import AppSettings +from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB 
+from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict +from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB, Image +from simcore_service_director_v2.models.dask_subsystem import DaskClientTaskState +from simcore_service_director_v2.modules.comp_scheduler._scheduler_base import ( + BaseCompScheduler, +) +from simcore_service_director_v2.modules.comp_scheduler._scheduler_dask import ( + DaskScheduler, +) +from simcore_service_director_v2.modules.comp_scheduler._utils import COMPLETED_STATES +from simcore_service_director_v2.modules.dask_client import ( + DaskJobID, + PublishedComputationTask, +) +from simcore_service_director_v2.utils.dask_client_utils import TaskHandlers +from starlette.testclient import TestClient +from tenacity.asyncio import AsyncRetrying +from tenacity.retry import retry_if_exception_type +from tenacity.stop import stop_after_delay +from tenacity.wait import wait_fixed + +pytest_simcore_core_services_selection = ["postgres", "rabbit", "redis"] +pytest_simcore_ops_services_selection = [ + "adminer", +] + + +def _assert_dask_client_correctly_initialized( + mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler +) -> None: + mocked_dask_client.create.assert_called_once_with( + app=mock.ANY, + settings=mock.ANY, + endpoint=mock.ANY, + authentication=mock.ANY, + tasks_file_link_type=mock.ANY, + cluster_type=mock.ANY, + ) + mocked_dask_client.register_handlers.assert_called_once_with( + TaskHandlers( + cast( # noqa: SLF001 + DaskScheduler, scheduler + )._task_progress_change_handler, + cast(DaskScheduler, scheduler)._task_log_change_handler, # noqa: SLF001 + ) + ) + + +async def _assert_comp_run_db( + aiopg_engine: aiopg.sa.engine.Engine, + pub_project: PublishedProject, + expected_state: RunningState, +) -> None: + # check the database is correctly updated, the run is published + async with aiopg_engine.acquire() as conn: + result = await conn.execute( + comp_runs.select().where( + (comp_runs.c.user_id == pub_project.project.prj_owner) + & (comp_runs.c.project_uuid == f"{pub_project.project.uuid}") + ) # there is only one entry + ) + run_entry = CompRunsAtDB.parse_obj(await result.first()) + assert ( + run_entry.result == expected_state + ), f"comp_runs: expected state '{expected_state}, found '{run_entry.result}'" + + +async def _assert_comp_tasks_db( + aiopg_engine: aiopg.sa.engine.Engine, + project_uuid: ProjectID, + task_ids: list[NodeID], + *, + expected_state: RunningState, + expected_progress: float | None, +) -> None: + # check the database is correctly updated, the run is published + async with aiopg_engine.acquire() as conn: + result = await conn.execute( + comp_tasks.select().where( + (comp_tasks.c.project_id == f"{project_uuid}") + & (comp_tasks.c.node_id.in_([f"{n}" for n in task_ids])) + ) # there is only one entry + ) + tasks = parse_obj_as(list[CompTaskAtDB], await result.fetchall()) + assert all( + t.state == expected_state for t in tasks + ), f"expected state: {expected_state}, found: {[t.state for t in tasks]}" + assert all( + t.progress == expected_progress for t in tasks + ), f"{expected_progress=}, found: {[t.progress for t in tasks]}" + + +async def schedule_all_pipelines(scheduler: BaseCompScheduler) -> None: + # NOTE: we take a copy of the pipelines, as this could change quickly if there are + # misconfigured pipelines that would be removed from the scheduler + # NOTE: we simulate multiple dv-2 replicas by running several times + # the same pipeline scheduling + local_pipelines = 
deepcopy(scheduler._scheduled_pipelines) # noqa: SLF001 + results = await asyncio.gather( + *( + scheduler.schedule_pipeline( + user_id=user_id, + project_id=project_id, + iteration=iteration, + wake_up_callback=params.scheduler_waker.set, + ) + for _ in range(3) + for ( + user_id, + project_id, + iteration, + ), params in local_pipelines.items() + ), + return_exceptions=True, + ) + # we should have exceptions 2/3 of the time + could_not_acquire_lock_count = sum( + isinstance(r, CouldNotAcquireLockError) for r in results + ) + total_results_count = len(results) + + # Check if 2/3 of the results are CouldNotAcquireLockError + # checks that scheduling is done exclusively + assert could_not_acquire_lock_count == (2 / 3) * total_results_count + + +@pytest.fixture +def minimal_scheduler_dask_config( + mock_env: EnvVarsDict, + postgres_host_config: dict[str, str], + monkeypatch: pytest.MonkeyPatch, + rabbit_service: RabbitSettings, + redis_service: RedisSettings, + faker: Faker, +) -> None: + """set a minimal configuration for testing the dask connection only""" + monkeypatch.setenv("DIRECTOR_V2_DYNAMIC_SIDECAR_ENABLED", "false") + monkeypatch.setenv("DIRECTOR_V0_ENABLED", "0") + monkeypatch.setenv("COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED", "1") + monkeypatch.setenv("COMPUTATIONAL_BACKEND_ENABLED", "1") + monkeypatch.setenv("R_CLONE_PROVIDER", "MINIO") + monkeypatch.setenv("S3_ENDPOINT", faker.url()) + monkeypatch.setenv("S3_ACCESS_KEY", faker.pystr()) + monkeypatch.setenv("S3_REGION", faker.pystr()) + monkeypatch.setenv("S3_SECRET_KEY", faker.pystr()) + monkeypatch.setenv("S3_BUCKET_NAME", faker.pystr()) + + +@pytest.fixture +def scheduler( + minimal_scheduler_dask_config: None, + aiopg_engine: aiopg.sa.engine.Engine, + minimal_app: FastAPI, +) -> BaseCompScheduler: + scheduler = _get_scheduler_worker(minimal_app) + assert scheduler is not None + return scheduler + + +@pytest.fixture +def mocked_dask_client(mocker: MockerFixture) -> mock.MagicMock: + mocked_dask_client = mocker.patch( + "simcore_service_director_v2.modules.dask_clients_pool.DaskClient", + autospec=True, + ) + mocked_dask_client.create.return_value = mocked_dask_client + return mocked_dask_client + + +@pytest.fixture +def mocked_parse_output_data_fct(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.parse_output_data", + autospec=True, + ) + + +@pytest.fixture +def mocked_clean_task_output_fct(mocker: MockerFixture) -> mock.MagicMock: + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.clean_task_output_and_log_files_if_invalid", + return_value=None, + autospec=True, + ) + + +@pytest.fixture +def with_disabled_auto_scheduling(mocker: MockerFixture) -> mock.MagicMock: + """disables the scheduler task, note that it needs to be triggered manually then""" + + def _fake_starter( + self: BaseCompScheduler, + *args, + **kwargs, + ): + scheduler_task = mocker.MagicMock() + scheduler_task_wake_up_event = mocker.MagicMock() + return scheduler_task, scheduler_task_wake_up_event + + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._scheduler_base.BaseCompScheduler._start_scheduling", + autospec=True, + side_effect=_fake_starter, + ) + + +@pytest.fixture +async def minimal_app(async_client: httpx.AsyncClient) -> FastAPI: + # must use the minimal app from the `async_client` + # the `client` uses starlette's TestClient which spawns + # a new thread on which it creates a new loop + causing
issues downstream with coroutines not + # being created on the same loop + return async_client._transport.app # type: ignore # noqa: SLF001 + + +@pytest.fixture +def mocked_clean_task_output_and_log_files_if_invalid(mocker: MockerFixture) -> None: + mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.clean_task_output_and_log_files_if_invalid", + autospec=True, + ) + + +async def test_scheduler_gracefully_starts_and_stops( + minimal_scheduler_dask_config: None, + aiopg_engine: aiopg.sa.engine.Engine, + dask_spec_local_cluster: SpecCluster, + minimal_app: FastAPI, +): + # check it started correctly + assert _get_scheduler_worker(minimal_app) is not None + + +@pytest.mark.parametrize( + "missing_dependency", + [ + "COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED", + ], +) +def test_scheduler_raises_exception_for_missing_dependencies( + minimal_scheduler_dask_config: None, + aiopg_engine: aiopg.sa.engine.Engine, + dask_spec_local_cluster: SpecCluster, + monkeypatch: pytest.MonkeyPatch, + missing_dependency: str, +): + # disable the dependency + monkeypatch.setenv(missing_dependency, "0") + # create the client + settings = AppSettings.create_from_envs() + app = init_app(settings) + + with pytest.raises(ConfigurationError), TestClient( + app, raise_server_exceptions=True + ) as _: + pass + + +async def test_empty_pipeline_is_not_scheduled( + with_disabled_auto_scheduling: None, + scheduler: BaseCompScheduler, + registered_user: Callable[..., dict[str, Any]], + project: Callable[..., Awaitable[ProjectAtDB]], + pipeline: Callable[..., CompPipelineAtDB], + aiopg_engine: aiopg.sa.engine.Engine, + run_metadata: RunMetadataDict, +): + user = registered_user() + empty_project = await project(user) + + # the project is not in the comp_pipeline, therefore scheduling it should fail + with pytest.raises(PipelineNotFoundError): + await scheduler.run_new_pipeline( + user_id=user["id"], + project_id=empty_project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + # create the empty pipeline now + pipeline(project_id=f"{empty_project.uuid}") + + # creating a run with an empty pipeline is useless, check the scheduler is not kicking in + await scheduler.run_new_pipeline( + user_id=user["id"], + project_id=empty_project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + assert len(scheduler._scheduled_pipelines) == 0 # noqa: SLF001 + # check the database is empty + async with aiopg_engine.acquire() as conn: + result = await conn.scalar( + comp_runs.select().where( + (comp_runs.c.user_id == user["id"]) + & (comp_runs.c.project_uuid == f"{empty_project.uuid}") + ) # there is only one entry + ) + assert result is None + + +async def test_misconfigured_pipeline_is_not_scheduled( + with_disabled_auto_scheduling: None, + scheduler: BaseCompScheduler, + registered_user: Callable[..., dict[str, Any]], + project: Callable[..., Awaitable[ProjectAtDB]], + pipeline: Callable[..., CompPipelineAtDB], + fake_workbench_without_outputs: dict[str, Any], + fake_workbench_adjacency: dict[str, Any], + aiopg_engine: aiopg.sa.engine.Engine, + run_metadata: RunMetadataDict, +): + """A pipeline which comp_tasks are missing should not be scheduled. 
+ It shall be aborted and shown as such in the comp_runs db""" + user = registered_user() + sleepers_project = await project(user, workbench=fake_workbench_without_outputs) + pipeline( + project_id=f"{sleepers_project.uuid}", + dag_adjacency_list=fake_workbench_adjacency, + ) + # check the pipeline is correctly added to the scheduled pipelines + await scheduler.run_new_pipeline( + user_id=user["id"], + project_id=sleepers_project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + assert len(scheduler._scheduled_pipelines) == 1 # noqa: SLF001 + for ( + u_id, + p_id, + it, + ) in scheduler._scheduled_pipelines: # noqa: SLF001 + assert u_id == user["id"] + assert p_id == sleepers_project.uuid + assert it > 0 + # check the database was properly updated + async with aiopg_engine.acquire() as conn: + result = await conn.execute( + comp_runs.select().where( + (comp_runs.c.user_id == user["id"]) + & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}") + ) # there is only one entry + ) + run_entry = CompRunsAtDB.parse_obj(await result.first()) + assert run_entry.result == RunningState.PUBLISHED + # let the scheduler kick in + await schedule_all_pipelines(scheduler) + # check the scheduled pipelines is again empty since it's misconfigured + assert len(scheduler._scheduled_pipelines) == 0 # noqa: SLF001 + # check the database entry is correctly updated + async with aiopg_engine.acquire() as conn: + result = await conn.execute( + comp_runs.select().where( + (comp_runs.c.user_id == user["id"]) + & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}") + ) # there is only one entry + ) + run_entry = CompRunsAtDB.parse_obj(await result.first()) + assert run_entry.result == RunningState.ABORTED + assert run_entry.metadata == run_metadata + + +async def _assert_start_pipeline( + aiopg_engine, + published_project: PublishedProject, + scheduler: BaseCompScheduler, + run_metadata: RunMetadataDict, +) -> list[CompTaskAtDB]: + exp_published_tasks = deepcopy(published_project.tasks) + assert published_project.project.prj_owner + await scheduler.run_new_pipeline( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + assert ( + len(scheduler._scheduled_pipelines) == 1 # noqa: SLF001 + ), "the pipeline is not scheduled!" 
+ for ( + u_id, + p_id, + it, + ) in scheduler._scheduled_pipelines: # noqa: SLF001 + assert u_id == published_project.project.prj_owner + assert p_id == published_project.project.uuid + assert it > 0 + + # check the database is correctly updated, the run is published + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [p.node_id for p in exp_published_tasks], + expected_state=RunningState.PUBLISHED, + expected_progress=None, + ) + return exp_published_tasks + + +async def _assert_schedule_pipeline_PENDING( # noqa: N802 + aiopg_engine, + published_project: PublishedProject, + published_tasks: list[CompTaskAtDB], + mocked_dask_client: mock.MagicMock, + scheduler: BaseCompScheduler, +) -> list[CompTaskAtDB]: + expected_pending_tasks = [ + published_tasks[1], + published_tasks[3], + ] + for p in expected_pending_tasks: + published_tasks.remove(p) + + async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState]: + return [DaskClientTaskState.PENDING for job_id in job_ids] + + mocked_dask_client.get_tasks_status.side_effect = _return_tasks_pending + await schedule_all_pipelines(scheduler) + _assert_dask_client_correctly_initialized(mocked_dask_client, scheduler) + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [p.node_id for p in expected_pending_tasks], + expected_state=RunningState.PENDING, + expected_progress=None, + ) + # the other tasks are still waiting in published state + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [p.node_id for p in published_tasks], + expected_state=RunningState.PUBLISHED, + expected_progress=None, # since we bypass the API entrypoint this is correct + ) + # tasks were send to the backend + assert published_project.project.prj_owner is not None + mocked_dask_client.send_computation_tasks.assert_has_calls( + calls=[ + mock.call( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + tasks={f"{p.node_id}": p.image}, + callback=mock.ANY, + metadata=mock.ANY, + hardware_info=mock.ANY, + ) + for p in expected_pending_tasks + ], + any_order=True, + ) + mocked_dask_client.send_computation_tasks.reset_mock() + mocked_dask_client.get_tasks_status.assert_not_called() + mocked_dask_client.get_task_result.assert_not_called() + # there is a second run of the scheduler to move comp_runs to pending, the rest does not change + await schedule_all_pipelines(scheduler) + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PENDING) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [p.node_id for p in expected_pending_tasks], + expected_state=RunningState.PENDING, + expected_progress=None, + ) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [p.node_id for p in published_tasks], + expected_state=RunningState.PUBLISHED, + expected_progress=None, + ) + mocked_dask_client.send_computation_tasks.assert_not_called() + mocked_dask_client.get_tasks_status.assert_has_calls( + calls=[mock.call([p.job_id for p in expected_pending_tasks])], any_order=True + ) + mocked_dask_client.get_tasks_status.reset_mock() + mocked_dask_client.get_task_result.assert_not_called() + return expected_pending_tasks + + +@pytest.fixture +async def 
instrumentation_rabbit_client_parser( + create_rabbitmq_client: Callable[[str], RabbitMQClient], mocker: MockerFixture +) -> AsyncIterator[mock.AsyncMock]: + client = create_rabbitmq_client("instrumentation_pytest_consumer") + mock = mocker.AsyncMock(return_value=True) + queue_name = await client.subscribe( + InstrumentationRabbitMessage.get_channel_name(), mock + ) + yield mock + await client.unsubscribe(queue_name) + + +@pytest.fixture +async def resource_tracking_rabbit_client_parser( + create_rabbitmq_client: Callable[[str], RabbitMQClient], mocker: MockerFixture +) -> AsyncIterator[mock.AsyncMock]: + client = create_rabbitmq_client("resource_tracking_pytest_consumer") + mock = mocker.AsyncMock(return_value=True) + queue_name = await client.subscribe( + RabbitResourceTrackingBaseMessage.get_channel_name(), mock + ) + yield mock + await client.unsubscribe(queue_name) + + +async def _assert_message_received( + mocked_message_parser: mock.AsyncMock, + expected_call_count: int, + message_parser: Callable, +) -> list: + async for attempt in AsyncRetrying( + wait=wait_fixed(0.1), + stop=stop_after_delay(5), + retry=retry_if_exception_type(AssertionError), + reraise=True, + ): + with attempt: + print( + f"--> waiting for rabbitmq message [{attempt.retry_state.attempt_number}, {attempt.retry_state.idle_for}]" + ) + assert mocked_message_parser.call_count == expected_call_count + print( + f"<-- rabbitmq message received after [{attempt.retry_state.attempt_number}, {attempt.retry_state.idle_for}]" + ) + parsed_messages = [ + message_parser(mocked_message_parser.call_args_list[c].args[0]) + for c in range(expected_call_count) + ] + + mocked_message_parser.reset_mock() + return parsed_messages + + +def _mock_send_computation_tasks( + tasks: list[CompTaskAtDB], mocked_dask_client: mock.MagicMock +) -> None: + node_id_to_job_id_map = {task.node_id: task.job_id for task in tasks} + + async def _send_computation_tasks( + *args, tasks: dict[NodeID, Image], **kwargs + ) -> list[PublishedComputationTask]: + for node_id in tasks: + assert NodeID(f"{node_id}") in node_id_to_job_id_map + return [ + PublishedComputationTask( + node_id=NodeID(f"{node_id}"), + job_id=DaskJobID(node_id_to_job_id_map[NodeID(f"{node_id}")]), + ) + for node_id in tasks + ] # type: ignore + + mocked_dask_client.send_computation_tasks.side_effect = _send_computation_tasks + + +async def _trigger_progress_event( + scheduler: BaseCompScheduler, + *, + job_id: str, + user_id: UserID, + project_id: ProjectID, + node_id: NodeID, +) -> None: + event = TaskProgressEvent( + job_id=job_id, + progress=0, + task_owner=TaskOwner( + user_id=user_id, + project_id=project_id, + node_id=node_id, + parent_project_id=None, + parent_node_id=None, + ), + ) + await cast(DaskScheduler, scheduler)._task_progress_change_handler( # noqa: SLF001 + event.json() + ) + + +@pytest.mark.acceptance_test() +async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 + with_disabled_auto_scheduling: None, + mocked_dask_client: mock.MagicMock, + scheduler: BaseCompScheduler, + aiopg_engine: aiopg.sa.engine.Engine, + published_project: PublishedProject, + mocked_parse_output_data_fct: mock.Mock, + mocked_clean_task_output_and_log_files_if_invalid: None, + instrumentation_rabbit_client_parser: mock.AsyncMock, + resource_tracking_rabbit_client_parser: mock.AsyncMock, + run_metadata: RunMetadataDict, +): + _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) + + expected_published_tasks = await _assert_start_pipeline( + aiopg_engine, 
published_project, scheduler, run_metadata + ) + + # ------------------------------------------------------------------------------- + # 1. first run will move comp_tasks to PENDING so the worker can take them + expected_pending_tasks = await _assert_schedule_pipeline_PENDING( + aiopg_engine, + published_project, + expected_published_tasks, + mocked_dask_client, + scheduler, + ) + + # ------------------------------------------------------------------------------- + # 2.1. the worker might be taking the task, until we get a progress we do not know + # whether it effectively started or it is still queued in the worker process + exp_started_task = expected_pending_tasks[0] + expected_pending_tasks.remove(exp_started_task) + + async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskState]: + return [ + ( + DaskClientTaskState.PENDING_OR_STARTED + if job_id == exp_started_task.job_id + else DaskClientTaskState.PENDING + ) + for job_id in job_ids + ] + + mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running + + await schedule_all_pipelines(scheduler) + + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PENDING) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [exp_started_task.node_id], + expected_state=RunningState.PENDING, + expected_progress=None, + ) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [p.node_id for p in expected_pending_tasks], + expected_state=RunningState.PENDING, + expected_progress=None, + ) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [p.node_id for p in expected_published_tasks], + expected_state=RunningState.PUBLISHED, + expected_progress=None, # since we bypass the API entrypoint this is correct + ) + mocked_dask_client.send_computation_tasks.assert_not_called() + mocked_dask_client.get_tasks_status.assert_called_once_with( + [p.job_id for p in (exp_started_task, *expected_pending_tasks)], + ) + mocked_dask_client.get_tasks_status.reset_mock() + mocked_dask_client.get_task_result.assert_not_called() + + # ------------------------------------------------------------------------------- + # 3. 
the "worker" starts processing a task + # here we trigger a progress from the worker + assert exp_started_task.job_id + assert exp_started_task.project_id + assert exp_started_task.node_id + assert published_project.project.prj_owner + await _trigger_progress_event( + scheduler, + job_id=exp_started_task.job_id, + user_id=published_project.project.prj_owner, + project_id=exp_started_task.project_id, + node_id=exp_started_task.node_id, + ) + + await schedule_all_pipelines(scheduler) + # comp_run, the comp_task switch to STARTED + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [exp_started_task.node_id], + expected_state=RunningState.STARTED, + expected_progress=0, + ) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [p.node_id for p in expected_pending_tasks], + expected_state=RunningState.PENDING, + expected_progress=None, + ) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [p.node_id for p in expected_published_tasks], + expected_state=RunningState.PUBLISHED, + expected_progress=None, + ) + mocked_dask_client.send_computation_tasks.assert_not_called() + mocked_dask_client.get_tasks_status.assert_called_once_with( + [p.job_id for p in (exp_started_task, *expected_pending_tasks)], + ) + mocked_dask_client.get_tasks_status.reset_mock() + mocked_dask_client.get_task_result.assert_not_called() + messages = await _assert_message_received( + instrumentation_rabbit_client_parser, 1, InstrumentationRabbitMessage.parse_raw + ) + assert messages[0].metrics == "service_started" + assert messages[0].service_uuid == exp_started_task.node_id + + def _parser(x) -> RabbitResourceTrackingMessages: + return parse_raw_as(RabbitResourceTrackingMessages, x) + + messages = await _assert_message_received( + resource_tracking_rabbit_client_parser, + 1, + RabbitResourceTrackingStartedMessage.parse_raw, + ) + assert messages[0].node_id == exp_started_task.node_id + + # ------------------------------------------------------------------------------- + # 4. 
the "worker" completed the task successfully + async def _return_1st_task_success(job_ids: list[str]) -> list[DaskClientTaskState]: + return [ + ( + DaskClientTaskState.SUCCESS + if job_id == exp_started_task.job_id + else DaskClientTaskState.PENDING + ) + for job_id in job_ids + ] + + mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_success + + async def _return_random_task_result(job_id) -> TaskOutputData: + return TaskOutputData.parse_obj({"out_1": None, "out_2": 45}) + + mocked_dask_client.get_task_result.side_effect = _return_random_task_result + await schedule_all_pipelines(scheduler) + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [exp_started_task.node_id], + expected_state=RunningState.SUCCESS, + expected_progress=1, + ) + messages = await _assert_message_received( + instrumentation_rabbit_client_parser, 1, InstrumentationRabbitMessage.parse_raw + ) + assert messages[0].metrics == "service_stopped" + assert messages[0].service_uuid == exp_started_task.node_id + messages = await _assert_message_received( + resource_tracking_rabbit_client_parser, + 1, + RabbitResourceTrackingStoppedMessage.parse_raw, + ) + + completed_tasks = [exp_started_task] + next_pending_task = published_project.tasks[2] + expected_pending_tasks.append(next_pending_task) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [p.node_id for p in expected_pending_tasks], + expected_state=RunningState.PENDING, + expected_progress=None, + ) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [ + p.node_id + for p in published_project.tasks + if p not in expected_pending_tasks + completed_tasks + ], + expected_state=RunningState.PUBLISHED, + expected_progress=None, # since we bypass the API entrypoint this is correct + ) + mocked_dask_client.send_computation_tasks.assert_called_once_with( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + tasks={ + f"{next_pending_task.node_id}": next_pending_task.image, + }, + callback=mock.ANY, + metadata=mock.ANY, + hardware_info=mock.ANY, + ) + mocked_dask_client.send_computation_tasks.reset_mock() + mocked_dask_client.get_tasks_status.assert_has_calls( + calls=[ + mock.call([p.job_id for p in completed_tasks + expected_pending_tasks[:1]]) + ], + any_order=True, + ) + mocked_dask_client.get_tasks_status.reset_mock() + mocked_dask_client.get_task_result.assert_called_once_with( + completed_tasks[0].job_id + ) + mocked_dask_client.get_task_result.reset_mock() + mocked_parse_output_data_fct.assert_called_once_with( + mock.ANY, + completed_tasks[0].job_id, + await _return_random_task_result(completed_tasks[0].job_id), + ) + mocked_parse_output_data_fct.reset_mock() + + # ------------------------------------------------------------------------------- + # 6. 
the "worker" starts processing a task + exp_started_task = next_pending_task + + async def _return_2nd_task_running(job_ids: list[str]) -> list[DaskClientTaskState]: + return [ + ( + DaskClientTaskState.PENDING_OR_STARTED + if job_id == exp_started_task.job_id + else DaskClientTaskState.PENDING + ) + for job_id in job_ids + ] + + mocked_dask_client.get_tasks_status.side_effect = _return_2nd_task_running + # trigger the scheduler, run state should keep to STARTED, task should be as well + assert exp_started_task.job_id + await _trigger_progress_event( + scheduler, + job_id=exp_started_task.job_id, + user_id=published_project.project.prj_owner, + project_id=exp_started_task.project_id, + node_id=exp_started_task.node_id, + ) + await schedule_all_pipelines(scheduler) + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [exp_started_task.node_id], + expected_state=RunningState.STARTED, + expected_progress=0, + ) + mocked_dask_client.send_computation_tasks.assert_not_called() + expected_pending_tasks.reverse() + mocked_dask_client.get_tasks_status.assert_called_once_with( + [p.job_id for p in expected_pending_tasks] + ) + mocked_dask_client.get_tasks_status.reset_mock() + mocked_dask_client.get_task_result.assert_not_called() + messages = await _assert_message_received( + instrumentation_rabbit_client_parser, 1, InstrumentationRabbitMessage.parse_raw + ) + assert messages[0].metrics == "service_started" + assert messages[0].service_uuid == exp_started_task.node_id + messages = await _assert_message_received( + resource_tracking_rabbit_client_parser, + 1, + RabbitResourceTrackingStartedMessage.parse_raw, + ) + assert messages[0].node_id == exp_started_task.node_id + + # ------------------------------------------------------------------------------- + # 7. the task fails + async def _return_2nd_task_failed(job_ids: list[str]) -> list[DaskClientTaskState]: + return [ + ( + DaskClientTaskState.ERRED + if job_id == exp_started_task.job_id + else DaskClientTaskState.PENDING + ) + for job_id in job_ids + ] + + mocked_dask_client.get_tasks_status.side_effect = _return_2nd_task_failed + mocked_dask_client.get_task_result.side_effect = None + await schedule_all_pipelines(scheduler) + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [exp_started_task.node_id], + expected_state=RunningState.FAILED, + expected_progress=1, + ) + mocked_dask_client.send_computation_tasks.assert_not_called() + mocked_dask_client.get_tasks_status.assert_called_once_with( + [p.job_id for p in expected_pending_tasks] + ) + mocked_dask_client.get_tasks_status.reset_mock() + mocked_dask_client.get_task_result.assert_called_once_with(exp_started_task.job_id) + mocked_dask_client.get_task_result.reset_mock() + mocked_parse_output_data_fct.assert_not_called() + expected_pending_tasks.remove(exp_started_task) + messages = await _assert_message_received( + instrumentation_rabbit_client_parser, 1, InstrumentationRabbitMessage.parse_raw + ) + assert messages[0].metrics == "service_stopped" + assert messages[0].service_uuid == exp_started_task.node_id + messages = await _assert_message_received( + resource_tracking_rabbit_client_parser, + 1, + RabbitResourceTrackingStoppedMessage.parse_raw, + ) + + # ------------------------------------------------------------------------------- + # 8. 
the last task shall succeed + exp_started_task = expected_pending_tasks[0] + + async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskState]: + return [ + ( + DaskClientTaskState.SUCCESS + if job_id == exp_started_task.job_id + else DaskClientTaskState.PENDING + ) + for job_id in job_ids + ] + + mocked_dask_client.get_tasks_status.side_effect = _return_3rd_task_success + mocked_dask_client.get_task_result.side_effect = _return_random_task_result + + # trigger the scheduler, it should switch to FAILED, as we are done + await schedule_all_pipelines(scheduler) + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) + + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [exp_started_task.node_id], + expected_state=RunningState.SUCCESS, + expected_progress=1, + ) + mocked_dask_client.send_computation_tasks.assert_not_called() + mocked_dask_client.get_tasks_status.assert_called_once_with( + [p.job_id for p in expected_pending_tasks] + ) + mocked_dask_client.get_task_result.assert_called_once_with(exp_started_task.job_id) + messages = await _assert_message_received( + instrumentation_rabbit_client_parser, 2, InstrumentationRabbitMessage.parse_raw + ) + # NOTE: the service was fast and went directly to success + assert messages[0].metrics == "service_started" + assert messages[0].service_uuid == exp_started_task.node_id + assert messages[1].metrics == "service_stopped" + assert messages[1].service_uuid == exp_started_task.node_id + messages = await _assert_message_received( + resource_tracking_rabbit_client_parser, + 2, + _parser, + ) + assert isinstance(messages[0], RabbitResourceTrackingStartedMessage) + assert isinstance(messages[1], RabbitResourceTrackingStoppedMessage) + + # the scheduled pipeline shall be removed + assert scheduler._scheduled_pipelines == {} # noqa: SLF001 + + +async def test_task_progress_triggers( + with_disabled_auto_scheduling: None, + mocked_dask_client: mock.MagicMock, + scheduler: BaseCompScheduler, + aiopg_engine: aiopg.sa.engine.Engine, + published_project: PublishedProject, + mocked_parse_output_data_fct: None, + mocked_clean_task_output_and_log_files_if_invalid: None, + run_metadata: RunMetadataDict, +): + _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) + expected_published_tasks = await _assert_start_pipeline( + aiopg_engine, published_project, scheduler, run_metadata + ) + # ------------------------------------------------------------------------------- + # 1. first run will move comp_tasks to PENDING so the worker can take them + expected_pending_tasks = await _assert_schedule_pipeline_PENDING( + aiopg_engine, + published_project, + expected_published_tasks, + mocked_dask_client, + scheduler, + ) + + # send some progress + started_task = expected_pending_tasks[0] + assert started_task.job_id + assert published_project.project.prj_owner + for progress in [-1, 0, 0.3, 0.5, 1, 1.5, 0.7, 0, 20]: + progress_event = TaskProgressEvent( + job_id=started_task.job_id, + progress=progress, + task_owner=TaskOwner( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + node_id=started_task.node_id, + parent_node_id=None, + parent_project_id=None, + ), + ) + await cast( # noqa: SLF001 + DaskScheduler, scheduler + )._task_progress_change_handler(progress_event.json()) + # NOTE: not sure whether it should switch to STARTED.. 
it would make sense + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [started_task.node_id], + expected_state=RunningState.STARTED, + expected_progress=min(max(0, progress), 1), + ) + + +@pytest.mark.parametrize( + "backend_error", + [ + ComputationalBackendNotConnectedError(msg="faked disconnected backend"), + ComputationalSchedulerChangedError( + original_scheduler_id="some_old_scheduler_id", + current_scheduler_id="some_new_scheduler_id", + ), + ], +) +async def test_handling_of_disconnected_scheduler_dask( + with_disabled_auto_scheduling: None, + mocked_dask_client: mock.MagicMock, + scheduler: BaseCompScheduler, + aiopg_engine: aiopg.sa.engine.Engine, + mocker: MockerFixture, + published_project: PublishedProject, + backend_error: ComputationalSchedulerError, + run_metadata: RunMetadataDict, +): + # this will create a non connected backend issue that will trigger re-connection + mocked_dask_client_send_task = mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.DaskClient.send_computation_tasks", + side_effect=backend_error, + ) + assert mocked_dask_client_send_task + + # running the pipeline will now raise and the tasks are set back to PUBLISHED + assert published_project.project.prj_owner + await scheduler.run_new_pipeline( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + + # since there is no cluster, there is no dask-scheduler, + # the tasks shall all still be in PUBLISHED state now + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) + + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [t.node_id for t in published_project.tasks], + expected_state=RunningState.PUBLISHED, + expected_progress=None, + ) + # on the next iteration of the pipeline it will try to re-connect + # now try to abort the tasks since we are wondering what is happening, this should auto-trigger the scheduler + await scheduler.stop_pipeline( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + ) + # we ensure the scheduler was run + await schedule_all_pipelines(scheduler) + # after this step the tasks are marked as ABORTED + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [ + t.node_id + for t in published_project.tasks + if t.node_class == NodeClass.COMPUTATIONAL + ], + expected_state=RunningState.ABORTED, + expected_progress=1, + ) + # then we have another scheduler run + await schedule_all_pipelines(scheduler) + # now the run should be ABORTED + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.ABORTED) + + +@dataclass(frozen=True, kw_only=True) +class RebootState: + dask_task_status: DaskClientTaskState + task_result: Exception | TaskOutputData + expected_task_state_group1: RunningState + expected_task_progress_group1: float + expected_task_state_group2: RunningState + expected_task_progress_group2: float + expected_run_state: RunningState + + +@pytest.mark.parametrize( + "reboot_state", + [ + pytest.param( + RebootState( + dask_task_status=DaskClientTaskState.LOST, + task_result=ComputationalBackendTaskNotFoundError(job_id="fake_job_id"), + expected_task_state_group1=RunningState.FAILED, + expected_task_progress_group1=1, + expected_task_state_group2=RunningState.ABORTED, + expected_task_progress_group2=1, + 
expected_run_state=RunningState.FAILED, + ), + id="reboot with lost tasks", + ), + pytest.param( + RebootState( + dask_task_status=DaskClientTaskState.ABORTED, + task_result=TaskCancelledError(job_id="fake_job_id"), + expected_task_state_group1=RunningState.ABORTED, + expected_task_progress_group1=1, + expected_task_state_group2=RunningState.ABORTED, + expected_task_progress_group2=1, + expected_run_state=RunningState.ABORTED, + ), + id="reboot with aborted tasks", + ), + pytest.param( + RebootState( + dask_task_status=DaskClientTaskState.ERRED, + task_result=ValueError("some error during the call"), + expected_task_state_group1=RunningState.FAILED, + expected_task_progress_group1=1, + expected_task_state_group2=RunningState.ABORTED, + expected_task_progress_group2=1, + expected_run_state=RunningState.FAILED, + ), + id="reboot with failed tasks", + ), + pytest.param( + RebootState( + dask_task_status=DaskClientTaskState.PENDING_OR_STARTED, + task_result=ComputationalBackendTaskResultsNotReadyError( + job_id="fake_job_id" + ), + expected_task_state_group1=RunningState.STARTED, + expected_task_progress_group1=0, + expected_task_state_group2=RunningState.STARTED, + expected_task_progress_group2=0, + expected_run_state=RunningState.STARTED, + ), + id="reboot with running tasks", + ), + pytest.param( + RebootState( + dask_task_status=DaskClientTaskState.SUCCESS, + task_result=TaskOutputData.parse_obj({"whatever_output": 123}), + expected_task_state_group1=RunningState.SUCCESS, + expected_task_progress_group1=1, + expected_task_state_group2=RunningState.SUCCESS, + expected_task_progress_group2=1, + expected_run_state=RunningState.SUCCESS, + ), + id="reboot with completed tasks", + ), + ], +) +async def test_handling_scheduling_after_reboot( + with_disabled_auto_scheduling: None, + mocked_dask_client: mock.MagicMock, + aiopg_engine: aiopg.sa.engine.Engine, + running_project: RunningProject, + scheduler: BaseCompScheduler, + mocked_parse_output_data_fct: mock.MagicMock, + mocked_clean_task_output_fct: mock.MagicMock, + reboot_state: RebootState, +): + """After the dask client is rebooted, or that the director-v2 reboots the dv-2 internal scheduler + shall continue scheduling correctly. 
Even though the task might have continued to run + in the dask-scheduler.""" + + async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskState]: + return [reboot_state.dask_task_status for j in job_ids] + + mocked_dask_client.get_tasks_status.side_effect = mocked_get_tasks_status + + async def mocked_get_task_result(_job_id: str) -> TaskOutputData: + if isinstance(reboot_state.task_result, Exception): + raise reboot_state.task_result + return reboot_state.task_result + + mocked_dask_client.get_task_result.side_effect = mocked_get_task_result + + await schedule_all_pipelines(scheduler) + # the status will be called once for all RUNNING tasks + mocked_dask_client.get_tasks_status.assert_called_once() + if reboot_state.expected_run_state in COMPLETED_STATES: + mocked_dask_client.get_task_result.assert_has_calls( + [ + mock.call(t.job_id) + for t in running_project.tasks + if t.node_class == NodeClass.COMPUTATIONAL + ], + any_order=True, + ) + else: + mocked_dask_client.get_task_result.assert_not_called() + if reboot_state.expected_run_state in [RunningState.ABORTED, RunningState.FAILED]: + # the clean up of the outputs should be done + mocked_clean_task_output_fct.assert_has_calls( + [ + mock.call( + mock.ANY, + running_project.project.prj_owner, + running_project.project.uuid, + t.node_id, + ) + for t in running_project.tasks + if t.node_class == NodeClass.COMPUTATIONAL + ], + any_order=True, + ) + else: + mocked_clean_task_output_fct.assert_not_called() + + await _assert_comp_tasks_db( + aiopg_engine, + running_project.project.uuid, + [ + running_project.tasks[1].node_id, + running_project.tasks[2].node_id, + running_project.tasks[3].node_id, + ], + expected_state=reboot_state.expected_task_state_group1, + expected_progress=reboot_state.expected_task_progress_group1, + ) + await _assert_comp_tasks_db( + aiopg_engine, + running_project.project.uuid, + [running_project.tasks[4].node_id], + expected_state=reboot_state.expected_task_state_group2, + expected_progress=reboot_state.expected_task_progress_group2, + ) + assert running_project.project.prj_owner + await _assert_comp_run_db( + aiopg_engine, running_project, reboot_state.expected_run_state + ) + + +async def test_handling_cancellation_of_jobs_after_reboot( + with_disabled_auto_scheduling: None, + mocked_dask_client: mock.MagicMock, + aiopg_engine: aiopg.sa.engine.Engine, + running_project_mark_for_cancellation: RunningProject, + scheduler: BaseCompScheduler, + mocked_parse_output_data_fct: mock.MagicMock, + mocked_clean_task_output_fct: mock.MagicMock, +): + """A running pipeline was cancelled by a user and the DV-2 was restarted BEFORE + It could actually cancel the task. 
On reboot the DV-2 shall recover + and actually cancel the pipeline properly""" + + # check initial status + await _assert_comp_run_db( + aiopg_engine, running_project_mark_for_cancellation, RunningState.STARTED + ) + await _assert_comp_tasks_db( + aiopg_engine, + running_project_mark_for_cancellation.project.uuid, + [t.node_id for t in running_project_mark_for_cancellation.tasks], + expected_state=RunningState.STARTED, + expected_progress=0, + ) + + # the backend shall report the tasks as running + async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskState]: + return [DaskClientTaskState.PENDING_OR_STARTED for j in job_ids] + + mocked_dask_client.get_tasks_status.side_effect = mocked_get_tasks_status + # Running the scheduler, should actually cancel the run now + await schedule_all_pipelines(scheduler) + mocked_dask_client.abort_computation_task.assert_called() + assert mocked_dask_client.abort_computation_task.call_count == len( + [ + t.node_id + for t in running_project_mark_for_cancellation.tasks + if t.node_class == NodeClass.COMPUTATIONAL + ] + ) + # in the DB they are still running, they will be stopped in the next iteration + await _assert_comp_tasks_db( + aiopg_engine, + running_project_mark_for_cancellation.project.uuid, + [ + t.node_id + for t in running_project_mark_for_cancellation.tasks + if t.node_class == NodeClass.COMPUTATIONAL + ], + expected_state=RunningState.STARTED, + expected_progress=0, + ) + await _assert_comp_run_db( + aiopg_engine, running_project_mark_for_cancellation, RunningState.STARTED + ) + + # the backend shall now report the tasks as aborted + async def mocked_get_tasks_status_aborted( + job_ids: list[str], + ) -> list[DaskClientTaskState]: + return [DaskClientTaskState.ABORTED for j in job_ids] + + mocked_dask_client.get_tasks_status.side_effect = mocked_get_tasks_status_aborted + + async def _return_random_task_result(job_id) -> TaskOutputData: + raise TaskCancelledError + + mocked_dask_client.get_task_result.side_effect = _return_random_task_result + await schedule_all_pipelines(scheduler) + # now should be stopped + await _assert_comp_tasks_db( + aiopg_engine, + running_project_mark_for_cancellation.project.uuid, + [ + t.node_id + for t in running_project_mark_for_cancellation.tasks + if t.node_class == NodeClass.COMPUTATIONAL + ], + expected_state=RunningState.ABORTED, + expected_progress=1, + ) + await _assert_comp_run_db( + aiopg_engine, running_project_mark_for_cancellation, RunningState.ABORTED + ) + mocked_clean_task_output_fct.assert_called() + + +@pytest.fixture +def with_fast_service_heartbeat_s(monkeypatch: pytest.MonkeyPatch) -> int: + seconds = 1 + monkeypatch.setenv("SERVICE_TRACKING_HEARTBEAT", f"{seconds}") + return seconds + + +async def test_running_pipeline_triggers_heartbeat( + with_disabled_auto_scheduling: None, + with_fast_service_heartbeat_s: int, + mocked_dask_client: mock.MagicMock, + scheduler: BaseCompScheduler, + aiopg_engine: aiopg.sa.engine.Engine, + published_project: PublishedProject, + resource_tracking_rabbit_client_parser: mock.AsyncMock, + run_metadata: RunMetadataDict, +): + _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) + expected_published_tasks = await _assert_start_pipeline( + aiopg_engine, published_project, scheduler, run_metadata + ) + # ------------------------------------------------------------------------------- + # 1. 
first run will move comp_tasks to PENDING so the worker can take them + expected_pending_tasks = await _assert_schedule_pipeline_PENDING( + aiopg_engine, + published_project, + expected_published_tasks, + mocked_dask_client, + scheduler, + ) + # ------------------------------------------------------------------------------- + # 2. the "worker" starts processing a task + exp_started_task = expected_pending_tasks[0] + expected_pending_tasks.remove(exp_started_task) + + async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskState]: + return [ + ( + DaskClientTaskState.PENDING_OR_STARTED + if job_id == exp_started_task.job_id + else DaskClientTaskState.PENDING + ) + for job_id in job_ids + ] + + mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running + assert exp_started_task.job_id + assert published_project.project.prj_owner + await _trigger_progress_event( + scheduler, + job_id=exp_started_task.job_id, + user_id=published_project.project.prj_owner, + project_id=exp_started_task.project_id, + node_id=exp_started_task.node_id, + ) + await schedule_all_pipelines(scheduler) + + messages = await _assert_message_received( + resource_tracking_rabbit_client_parser, + 1, + RabbitResourceTrackingStartedMessage.parse_raw, + ) + assert messages[0].node_id == exp_started_task.node_id + + # ------------------------------------------------------------------------------- + # 3. wait a bit and run again we should get another heartbeat, but only one! + await asyncio.sleep(with_fast_service_heartbeat_s + 1) + await schedule_all_pipelines(scheduler) + await schedule_all_pipelines(scheduler) + messages = await _assert_message_received( + resource_tracking_rabbit_client_parser, + 1, + RabbitResourceTrackingHeartbeatMessage.parse_raw, + ) + assert isinstance(messages[0], RabbitResourceTrackingHeartbeatMessage) + + # ------------------------------------------------------------------------------- + # 4. wait a bit and run again we should get another heartbeat, but only one! 
+ await asyncio.sleep(with_fast_service_heartbeat_s + 1) + await schedule_all_pipelines(scheduler) + await schedule_all_pipelines(scheduler) + messages = await _assert_message_received( + resource_tracking_rabbit_client_parser, + 1, + RabbitResourceTrackingHeartbeatMessage.parse_raw, + ) + assert isinstance(messages[0], RabbitResourceTrackingHeartbeatMessage) + + +@pytest.fixture +async def mocked_get_or_create_cluster(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.get_or_create_on_demand_cluster", + autospec=True, + ) + + +async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( + with_disabled_auto_scheduling: None, + scheduler: BaseCompScheduler, + aiopg_engine: aiopg.sa.engine.Engine, + published_project: PublishedProject, + run_metadata: RunMetadataDict, + mocked_get_or_create_cluster: mock.Mock, + faker: Faker, +): + mocked_get_or_create_cluster.side_effect = ( + ComputationalBackendOnDemandNotReadyError( + eta=faker.time_delta(datetime.timedelta(hours=1)) + ) + ) + # running the pipeline will trigger a call to the clusters-keeper + assert published_project.project.prj_owner + await scheduler.run_new_pipeline( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=True, + ) + + # we ask to use an on-demand cluster, therefore the tasks are published first + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [t.node_id for t in published_project.tasks], + expected_state=RunningState.PUBLISHED, + expected_progress=None, + ) + mocked_get_or_create_cluster.assert_not_called() + # now it should switch to waiting + expected_waiting_tasks = [ + published_project.tasks[1], + published_project.tasks[3], + ] + await schedule_all_pipelines(scheduler) + mocked_get_or_create_cluster.assert_called() + assert mocked_get_or_create_cluster.call_count == 1 + mocked_get_or_create_cluster.reset_mock() + await _assert_comp_run_db( + aiopg_engine, published_project, RunningState.WAITING_FOR_CLUSTER + ) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [t.node_id for t in expected_waiting_tasks], + expected_state=RunningState.WAITING_FOR_CLUSTER, + expected_progress=None, + ) + # again will trigger the same response + await schedule_all_pipelines(scheduler) + mocked_get_or_create_cluster.assert_called() + assert mocked_get_or_create_cluster.call_count == 1 + mocked_get_or_create_cluster.reset_mock() + await _assert_comp_run_db( + aiopg_engine, published_project, RunningState.WAITING_FOR_CLUSTER + ) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [t.node_id for t in expected_waiting_tasks], + expected_state=RunningState.WAITING_FOR_CLUSTER, + expected_progress=None, + ) + + +@pytest.mark.parametrize( + "get_or_create_exception", + [ClustersKeeperNotAvailableError], +) +async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( + with_disabled_auto_scheduling: None, + scheduler: BaseCompScheduler, + aiopg_engine: aiopg.sa.engine.Engine, + published_project: PublishedProject, + run_metadata: RunMetadataDict, + mocked_get_or_create_cluster: mock.Mock, + get_or_create_exception: Exception, +): + mocked_get_or_create_cluster.side_effect = get_or_create_exception + # running the pipeline will 
trigger a call to the clusters-keeper + assert published_project.project.prj_owner + await scheduler.run_new_pipeline( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=True, + ) + + # we ask to use an on-demand cluster, therefore the tasks are published first + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [t.node_id for t in published_project.tasks], + expected_state=RunningState.PUBLISHED, + expected_progress=None, + ) + # now it should switch to failed, the run still runs until the next iteration + expected_failed_tasks = [ + published_project.tasks[1], + published_project.tasks[3], + ] + await schedule_all_pipelines(scheduler) + mocked_get_or_create_cluster.assert_called() + assert mocked_get_or_create_cluster.call_count == 1 + mocked_get_or_create_cluster.reset_mock() + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [t.node_id for t in expected_failed_tasks], + expected_state=RunningState.FAILED, + expected_progress=1.0, + ) + # again will not re-trigger the call to clusters-keeper + await schedule_all_pipelines(scheduler) + mocked_get_or_create_cluster.assert_not_called() + await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) + await _assert_comp_tasks_db( + aiopg_engine, + published_project.project.uuid, + [t.node_id for t in expected_failed_tasks], + expected_state=RunningState.FAILED, + expected_progress=1.0, + ) diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py index 495023dbda2a..8fbc2d9006b7 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py @@ -58,9 +58,9 @@ ComputationalBackendTaskNotFoundError, ComputationalBackendTaskResultsNotReadyError, ComputationalSchedulerChangedError, + ComputationalSchedulerError, ConfigurationError, PipelineNotFoundError, - SchedulerError, ) from simcore_service_director_v2.core.settings import AppSettings from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB @@ -1107,7 +1107,7 @@ async def test_handling_of_disconnected_dask_scheduler( aiopg_engine: aiopg.sa.engine.Engine, mocker: MockerFixture, published_project: PublishedProject, - backend_error: SchedulerError, + backend_error: ComputationalSchedulerError, run_metadata: RunMetadataDict, ): # this will create a non connected backend issue that will trigger re-connection From 76af07b70f03bbabbae8cd8f411975845481ad54 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:47:21 +0100 Subject: [PATCH 04/27] pruning --- .../core/errors.py | 45 ++----------------- .../simcore_service_director_v2/utils/dask.py | 14 ++++-- 2 files changed, 13 insertions(+), 46 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/errors.py b/services/director-v2/src/simcore_service_director_v2/core/errors.py index eb605c307eab..6576c411f8ec 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/errors.py +++ 
b/services/director-v2/src/simcore_service_director_v2/core/errors.py @@ -20,9 +20,6 @@ """ from common_library.errors_classes import OsparcErrorMixin -from models_library.errors import ErrorDict -from models_library.projects import ProjectID -from models_library.projects_nodes_io import NodeID class DirectorError(OsparcErrorMixin, RuntimeError): @@ -93,45 +90,9 @@ class InsuficientComputationalResourcesError(TaskSchedulingError): class PortsValidationError(TaskSchedulingError): - """ - Gathers all validation errors raised while checking input/output - ports in a project's node. - """ - - def __init__(self, project_id: ProjectID, node_id: NodeID, errors: list[ErrorDict]): - super().__init__( - project_id, - node_id, - msg=f"Node with {len(errors)} ports having invalid values", - ) - self.errors = errors - - def get_errors(self) -> list[ErrorDict]: - """Returns 'public errors': filters only value_error.port_validation errors for the client. - The rest only shown as number - """ - value_errors: list[ErrorDict] = [] - for error in self.errors: - # NOTE: should I filter? if error["type"].startswith("value_error."): - - loc_tail: list[str] = [] - if port_key := error.get("ctx", {}).get("port_key"): - loc_tail.append(f"{port_key}") - - if schema_error_path := error.get("ctx", {}).get("schema_error_path"): - loc_tail += list(schema_error_path) - - # WARNING: error in a node, might come from the previous node's port - # DO NOT remove project/node/port hiearchy - value_errors.append( - { - "loc": (f"{self.project_id}", f"{self.node_id}", *tuple(loc_tail)), - "msg": error["msg"], - # NOTE: here we list the codes of the PydanticValueErrors collected in ValidationError - "type": error["type"], - } - ) - return value_errors + msg_template: str = ( + "Node {node_id} in {project_id} with ports having invalid values {errors_list}" + ) class ComputationalSchedulerChangedError(ComputationalSchedulerError): diff --git a/services/director-v2/src/simcore_service_director_v2/utils/dask.py b/services/director-v2/src/simcore_service_director_v2/utils/dask.py index a42a28e392c1..afb1e0b37704 100644 --- a/services/director-v2/src/simcore_service_director_v2/utils/dask.py +++ b/services/director-v2/src/simcore_service_director_v2/utils/dask.py @@ -129,7 +129,9 @@ async def create_node_ports( db_manager=db_manager, ) except ValidationError as err: - raise PortsValidationError(project_id, node_id, list(err.errors())) from err + raise PortsValidationError( + project_id=project_id, node_id=node_id, errors_list=list(err.errors()) + ) from err async def parse_output_data( @@ -181,7 +183,9 @@ async def parse_output_data( ports_errors.extend(_get_port_validation_errors(port_key, err)) if ports_errors: - raise PortsValidationError(project_id, node_id, ports_errors) + raise PortsValidationError( + project_id=project_id, node_id=node_id, errors_list=ports_errors + ) async def compute_input_data( @@ -218,11 +222,13 @@ async def compute_input_data( else: input_data[port.key] = value - except ValidationError as err: # noqa: PERF203 + except ValidationError as err: ports_errors.extend(_get_port_validation_errors(port.key, err)) if ports_errors: - raise PortsValidationError(project_id, node_id, ports_errors) + raise PortsValidationError( + project_id=project_id, node_id=node_id, errors_list=ports_errors + ) return TaskInputData.model_validate(input_data) From 593a795958657b050243d7da99e20b00db405f38 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:50:45 +0100 Subject: 
[PATCH 05/27] fix --- .../director-v2/src/simcore_service_director_v2/core/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/errors.py b/services/director-v2/src/simcore_service_director_v2/core/errors.py index 6576c411f8ec..08f10bca8597 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/errors.py +++ b/services/director-v2/src/simcore_service_director_v2/core/errors.py @@ -77,7 +77,7 @@ class MissingComputationalResourcesError(TaskSchedulingError): msg_template = ( "Service {service_name}:{service_version} cannot be scheduled " "on cluster {cluster_id}: task needs '{task_resources}', " - "cluster has {cluster_resources}", + "cluster has {cluster_resources}" ) From 99127c500bad038d2c47597700be2ad4aca2edff Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:51:56 +0100 Subject: [PATCH 06/27] should not be here --- .../modules/comp_scheduler/_scheduler_base.py | 792 ------------------ 1 file changed, 792 deletions(-) delete mode 100644 services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py deleted file mode 100644 index ce15311f9949..000000000000 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ /dev/null @@ -1,792 +0,0 @@ -"""The scheduler shall be run as a background task. -Based on oSparc pipelines, it monitors when to start the next worker task(s), either one at a time or as a group of tasks. - -In principle the Scheduler maintains the comp_runs table in the database. -It contains how the pipeline was run and by whom. -It also contains the final result of the pipeline run. - -When a pipeline is scheduled first all the tasks contained in the DAG are set to PUBLISHED state. -Once the scheduler determines a task shall run, its state is set to PENDING, so that the sidecar can pick up the task. -The sidecar will then change the state to STARTED, then to SUCCESS or FAILED. 
- -""" - -import asyncio -import datetime -import logging -from abc import ABC, abstractmethod -from collections.abc import Callable -from dataclasses import dataclass -from typing import Final - -import arrow -import networkx as nx -from aiopg.sa.engine import Engine -from models_library.projects import ProjectID -from models_library.projects_nodes_io import NodeID, NodeIDStr -from models_library.projects_state import RunningState -from models_library.services import ServiceKey, ServiceType, ServiceVersion -from models_library.users import UserID -from networkx.classes.reportviews import InDegreeView -from servicelib.common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE -from servicelib.logging_utils import log_context -from servicelib.rabbitmq import RabbitMQClient, RabbitMQRPCClient -from servicelib.redis import RedisClientSDK - -from ...constants import UNDEFINED_STR_METADATA -from ...core.errors import ( - ClustersKeeperNotAvailableError, - ComputationalBackendNotConnectedError, - ComputationalBackendOnDemandNotReadyError, - ComputationalSchedulerChangedError, - DaskClientAcquisisitonError, - InvalidPipelineError, - PipelineNotFoundError, - TaskSchedulingError, -) -from ...core.settings import ComputationalBackendSettings -from ...models.comp_pipelines import CompPipelineAtDB -from ...models.comp_runs import CompRunsAtDB, Iteration, RunMetadataDict -from ...models.comp_tasks import CompTaskAtDB -from ...utils.computations import get_pipeline_state_from_task_states -from ...utils.rabbitmq import ( - publish_project_log, - publish_service_resource_tracking_heartbeat, - publish_service_resource_tracking_started, - publish_service_started_metrics, -) -from ..db.repositories.comp_pipelines import CompPipelinesRepository -from ..db.repositories.comp_runs import CompRunsRepository -from ..db.repositories.comp_tasks import CompTasksRepository -from ._utils import ( - COMPLETED_STATES, - PROCESSING_STATES, - RUNNING_STATES, - TASK_TO_START_STATES, - WAITING_FOR_START_STATES, - create_service_resources_from_task, - get_resource_tracking_run_id, -) - -_logger = logging.getLogger(__name__) - - -_Previous = CompTaskAtDB -_Current = CompTaskAtDB -_MAX_WAITING_FOR_CLUSTER_TIMEOUT_IN_MIN: Final[int] = 10 - - -@dataclass(frozen=True, slots=True) -class SortedTasks: - started: list[CompTaskAtDB] - completed: list[CompTaskAtDB] - waiting: list[CompTaskAtDB] - potentially_lost: list[CompTaskAtDB] - - -async def _triage_changed_tasks( - changed_tasks: list[tuple[_Previous, _Current]] -) -> SortedTasks: - started_tasks = [ - current - for previous, current in changed_tasks - if current.state in RUNNING_STATES - or ( - previous.state in WAITING_FOR_START_STATES - and current.state in COMPLETED_STATES - ) - ] - - # NOTE: some tasks can be both started and completed since we might have the time they were running - completed_tasks = [ - current for _, current in changed_tasks if current.state in COMPLETED_STATES - ] - - waiting_for_resources_tasks = [ - current - for previous, current in changed_tasks - if current.state in WAITING_FOR_START_STATES - ] - - lost_or_momentarily_lost_tasks = [ - current for _, current in changed_tasks if current.state is RunningState.UNKNOWN - ] - if lost_or_momentarily_lost_tasks: - _logger.warning( - "%s are currently in unknown state. TIP: If they are running in an external cluster and it is not yet ready, that might explain it. 
But inform @sanderegg nevertheless!", - [t.node_id for t in lost_or_momentarily_lost_tasks], - ) - - return SortedTasks( - started_tasks, - completed_tasks, - waiting_for_resources_tasks, - lost_or_momentarily_lost_tasks, - ) - - -@dataclass -class BaseCompScheduler(ABC): - db_engine: Engine - rabbitmq_client: RabbitMQClient - rabbitmq_rpc_client: RabbitMQRPCClient - settings: ComputationalBackendSettings - service_runtime_heartbeat_interval: datetime.timedelta - redis_client: RedisClientSDK - - async def _get_pipeline_dag(self, project_id: ProjectID) -> nx.DiGraph: - comp_pipeline_repo = CompPipelinesRepository.instance(self.db_engine) - pipeline_at_db: CompPipelineAtDB = await comp_pipeline_repo.get_pipeline( - project_id - ) - dag = pipeline_at_db.get_graph() - _logger.debug("%s: current %s", f"{project_id=}", f"{dag=}") - return dag - - async def _get_pipeline_tasks( - self, project_id: ProjectID, pipeline_dag: nx.DiGraph - ) -> dict[NodeIDStr, CompTaskAtDB]: - comp_tasks_repo = CompTasksRepository.instance(self.db_engine) - pipeline_comp_tasks: dict[NodeIDStr, CompTaskAtDB] = { - NodeIDStr(f"{t.node_id}"): t - for t in await comp_tasks_repo.list_computational_tasks(project_id) - if (f"{t.node_id}" in list(pipeline_dag.nodes())) - } - if len(pipeline_comp_tasks) != len(pipeline_dag.nodes()): # type: ignore[arg-type] - msg = ( - f"{project_id}The tasks defined for {project_id} do not contain all" - f" the tasks defined in the pipeline [{list(pipeline_dag.nodes)}]! Please check." - ) - raise InvalidPipelineError(msg) - return pipeline_comp_tasks - - async def _update_run_result_from_tasks( - self, - user_id: UserID, - project_id: ProjectID, - iteration: Iteration, - pipeline_tasks: dict[NodeIDStr, CompTaskAtDB], - ) -> RunningState: - pipeline_state_from_tasks: RunningState = get_pipeline_state_from_task_states( - list(pipeline_tasks.values()), - ) - _logger.debug( - "pipeline %s is currently in %s", - f"{user_id=}_{project_id=}_{iteration=}", - f"{pipeline_state_from_tasks}", - ) - await self._set_run_result( - user_id, project_id, iteration, pipeline_state_from_tasks - ) - return pipeline_state_from_tasks - - async def _set_run_result( - self, - user_id: UserID, - project_id: ProjectID, - iteration: Iteration, - run_result: RunningState, - ) -> None: - comp_runs_repo = CompRunsRepository.instance(self.db_engine) - await comp_runs_repo.set_run_result( - user_id=user_id, - project_id=project_id, - iteration=iteration, - result_state=run_result, - final_state=(run_result in COMPLETED_STATES), - ) - - async def _set_states_following_failed_to_aborted( - self, project_id: ProjectID, dag: nx.DiGraph - ) -> dict[NodeIDStr, CompTaskAtDB]: - tasks: dict[NodeIDStr, CompTaskAtDB] = await self._get_pipeline_tasks( - project_id, dag - ) - node_ids_to_set_as_aborted: set[NodeIDStr] = set() - for task in tasks.values(): - if task.state == RunningState.FAILED: - node_ids_to_set_as_aborted.update(nx.bfs_tree(dag, f"{task.node_id}")) - node_ids_to_set_as_aborted.remove(NodeIDStr(f"{task.node_id}")) - for node_id in node_ids_to_set_as_aborted: - tasks[NodeIDStr(f"{node_id}")].state = RunningState.ABORTED - if node_ids_to_set_as_aborted: - # update the current states back in DB - comp_tasks_repo = CompTasksRepository.instance(self.db_engine) - await comp_tasks_repo.update_project_tasks_state( - project_id, - [NodeID(n) for n in node_ids_to_set_as_aborted], - RunningState.ABORTED, - optional_progress=1.0, - optional_stopped=arrow.utcnow().datetime, - ) - return tasks - - async def 
_send_running_tasks_heartbeat( - self, - user_id: UserID, - project_id: ProjectID, - iteration: Iteration, - dag: nx.DiGraph, - ) -> None: - utc_now = arrow.utcnow().datetime - - def _need_heartbeat(task: CompTaskAtDB) -> bool: - if task.state not in RUNNING_STATES: - return False - if task.last_heartbeat is None: - assert task.start # nosec - return bool( - (utc_now - task.start.replace(tzinfo=datetime.UTC)) - > self.service_runtime_heartbeat_interval - ) - return bool( - (utc_now - task.last_heartbeat) - > self.service_runtime_heartbeat_interval - ) - - tasks: dict[NodeIDStr, CompTaskAtDB] = await self._get_pipeline_tasks( - project_id, dag - ) - if running_tasks := [t for t in tasks.values() if _need_heartbeat(t)]: - await asyncio.gather( - *( - publish_service_resource_tracking_heartbeat( - self.rabbitmq_client, - get_resource_tracking_run_id( - user_id, t.project_id, t.node_id, iteration - ), - ) - for t in running_tasks - ) - ) - comp_tasks_repo = CompTasksRepository(self.db_engine) - await asyncio.gather( - *( - comp_tasks_repo.update_project_task_last_heartbeat( - t.project_id, t.node_id, utc_now - ) - for t in running_tasks - ) - ) - - async def _get_changed_tasks_from_backend( - self, - user_id: UserID, - processing_tasks: list[CompTaskAtDB], - comp_run: CompRunsAtDB, - ) -> list[tuple[_Previous, _Current]]: - tasks_backend_status = await self._get_tasks_status( - user_id, processing_tasks, comp_run - ) - - return [ - ( - task, - task.copy(update={"state": backend_state}), - ) - for task, backend_state in zip( - processing_tasks, tasks_backend_status, strict=True - ) - if task.state is not backend_state - ] - - async def _process_started_tasks( - self, - tasks: list[CompTaskAtDB], - *, - user_id: UserID, - iteration: Iteration, - run_metadata: RunMetadataDict, - ) -> None: - utc_now = arrow.utcnow().datetime - - # resource tracking - await asyncio.gather( - *( - publish_service_resource_tracking_started( - self.rabbitmq_client, - service_run_id=get_resource_tracking_run_id( - user_id, t.project_id, t.node_id, iteration - ), - wallet_id=run_metadata.get("wallet_id"), - wallet_name=run_metadata.get("wallet_name"), - pricing_plan_id=( - t.pricing_info.get("pricing_plan_id") - if t.pricing_info - else None - ), - pricing_unit_id=( - t.pricing_info.get("pricing_unit_id") - if t.pricing_info - else None - ), - pricing_unit_cost_id=( - t.pricing_info.get("pricing_unit_cost_id") - if t.pricing_info - else None - ), - product_name=run_metadata.get( - "product_name", UNDEFINED_STR_METADATA - ), - simcore_user_agent=run_metadata.get( - "simcore_user_agent", UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE - ), - user_id=user_id, - user_email=run_metadata.get("user_email", UNDEFINED_STR_METADATA), - project_id=t.project_id, - project_name=run_metadata.get("project_metadata", {}).get( # type: ignore[arg-type] - "project_name", UNDEFINED_STR_METADATA - ), - node_id=t.node_id, - node_name=run_metadata.get("node_id_names_map", {}).get( - t.node_id, UNDEFINED_STR_METADATA - ), - parent_project_id=run_metadata.get("project_metadata", {}).get( - "parent_project_id" - ), - parent_node_id=run_metadata.get("project_metadata", {}).get( - "parent_node_id" - ), - root_parent_project_id=run_metadata.get("project_metadata", {}).get( - "root_parent_project_id" - ), - root_parent_project_name=run_metadata.get( - "project_metadata", {} - ).get("root_parent_project_name"), - root_parent_node_id=run_metadata.get("project_metadata", {}).get( - "root_parent_node_id" - ), - service_key=ServiceKey(t.image.name), - 
service_version=ServiceVersion(t.image.tag), - service_type=ServiceType.COMPUTATIONAL, - service_resources=create_service_resources_from_task(t), - service_additional_metadata={}, - ) - for t in tasks - ) - ) - # instrumentation - await asyncio.gather( - *( - publish_service_started_metrics( - self.rabbitmq_client, - user_id=user_id, - simcore_user_agent=run_metadata.get( - "simcore_user_agent", UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE - ), - task=t, - ) - for t in tasks - ) - ) - - # update DB - comp_tasks_repo = CompTasksRepository(self.db_engine) - await asyncio.gather( - *( - comp_tasks_repo.update_project_tasks_state( - t.project_id, - [t.node_id], - t.state, - optional_started=utc_now, - optional_progress=t.progress, - ) - for t in tasks - ) - ) - - async def _process_waiting_tasks(self, tasks: list[CompTaskAtDB]) -> None: - comp_tasks_repo = CompTasksRepository(self.db_engine) - await asyncio.gather( - *( - comp_tasks_repo.update_project_tasks_state( - t.project_id, - [t.node_id], - t.state, - ) - for t in tasks - ) - ) - - async def _update_states_from_comp_backend( - self, - user_id: UserID, - project_id: ProjectID, - iteration: Iteration, - pipeline_dag: nx.DiGraph, - comp_run: CompRunsAtDB, - ) -> None: - tasks = await self._get_pipeline_tasks(project_id, pipeline_dag) - tasks_inprocess = [t for t in tasks.values() if t.state in PROCESSING_STATES] - if not tasks_inprocess: - return - - # get the tasks which state actually changed since last check - tasks_with_changed_states = await self._get_changed_tasks_from_backend( - user_id, tasks_inprocess, comp_run - ) - # NOTE: typical states a task goes through - # NOT_STARTED (initial state) -> PUBLISHED (user press run/API call) -> PENDING -> WAITING_FOR_CLUSTER (cluster creation) -> - # PENDING -> WAITING_FOR_RESOURCES (workers creation or missing) -> PENDING -> STARTED (worker started processing the task) -> SUCCESS/FAILED - # or ABORTED (user cancelled) or UNKNOWN (lost task - it might be transient, be careful with this one) - sorted_tasks = await _triage_changed_tasks(tasks_with_changed_states) - - # now process the tasks - if sorted_tasks.started: - # NOTE: the dask-scheduler cannot differentiate between tasks that are effectively computing and - # tasks that are only queued and accepted by a dask-worker. - # tasks_started should therefore be mostly empty but for cases where - # - dask Pub/Sub mechanism failed, the tasks goes from PENDING -> SUCCESS/FAILED/ABORTED without STARTED - # - the task finished so fast that the STARTED state was skipped between 2 runs of the dv-2 comp scheduler - await self._process_started_tasks( - sorted_tasks.started, - user_id=user_id, - iteration=iteration, - run_metadata=comp_run.metadata, - ) - - if sorted_tasks.completed or sorted_tasks.potentially_lost: - await self._process_completed_tasks( - user_id, - sorted_tasks.completed + sorted_tasks.potentially_lost, - iteration, - comp_run=comp_run, - ) - - if sorted_tasks.waiting: - await self._process_waiting_tasks(sorted_tasks.waiting) - - @abstractmethod - async def _start_tasks( - self, - *, - user_id: UserID, - project_id: ProjectID, - scheduled_tasks: dict[NodeID, CompTaskAtDB], - comp_run: CompRunsAtDB, - wake_up_callback: Callable[[], None], - ) -> None: - ... - - @abstractmethod - async def _get_tasks_status( - self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB - ) -> list[RunningState]: - ... 
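For orientation, the four @abstractmethod hooks declared in this base class form the whole contract a computational backend has to satisfy. A minimal, purely illustrative sketch of a concrete subclass follows; the class name and the placeholder bodies are assumptions of this sketch and not code from the repository (the real implementation is the Dask-based scheduler):

    class _ExampleScheduler(BaseCompScheduler):  # illustrative only, relies on names imported in this module
        async def _start_tasks(
            self,
            *,
            user_id: UserID,
            project_id: ProjectID,
            scheduled_tasks: dict[NodeID, CompTaskAtDB],
            comp_run: CompRunsAtDB,
            wake_up_callback: Callable[[], None],
        ) -> None:
            ...  # submit the ready tasks to the computational backend

        async def _get_tasks_status(
            self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB
        ) -> list[RunningState]:
            # one state per task, in the same order as `tasks`
            return [RunningState.PENDING for _ in tasks]

        async def _stop_tasks(
            self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB
        ) -> None:
            ...  # request cancellation of the given tasks on the backend

        async def _process_completed_tasks(
            self,
            user_id: UserID,
            tasks: list[CompTaskAtDB],
            iteration: Iteration,
            comp_run: CompRunsAtDB,
        ) -> None:
            ...  # fetch results of finished tasks and persist their final state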
- - @abstractmethod - async def _stop_tasks( - self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB - ) -> None: - ... - - @abstractmethod - async def _process_completed_tasks( - self, - user_id: UserID, - tasks: list[CompTaskAtDB], - iteration: Iteration, - comp_run: CompRunsAtDB, - ) -> None: - ... - - async def schedule_pipeline( - self, - *, - user_id: UserID, - project_id: ProjectID, - iteration: Iteration, - wake_up_callback: Callable[[], None], - ) -> None: - with log_context( - _logger, - level=logging.INFO, - msg=f"scheduling pipeline {user_id=}:{project_id=}:{iteration=}", - ): - dag: nx.DiGraph = nx.DiGraph() - try: - comp_run = await CompRunsRepository.instance(self.db_engine).get( - user_id, project_id, iteration - ) - dag = await self._get_pipeline_dag(project_id) - # 1. Update our list of tasks with data from backend (state, results) - await self._update_states_from_comp_backend( - user_id, project_id, iteration, dag, comp_run - ) - # 2. Any task following a FAILED task shall be ABORTED - comp_tasks = await self._set_states_following_failed_to_aborted( - project_id, dag - ) - # 3. do we want to stop the pipeline now? - if comp_run.cancelled: - await self._schedule_tasks_to_stop( - user_id, project_id, comp_tasks, comp_run - ) - else: - # let's get the tasks to schedule then - comp_tasks = await self._schedule_tasks_to_start( - user_id=user_id, - project_id=project_id, - comp_tasks=comp_tasks, - dag=dag, - comp_run=comp_run, - wake_up_callback=wake_up_callback, - ) - # 4. timeout if waiting for cluster has been there for more than X minutes - comp_tasks = await self._timeout_if_waiting_for_cluster_too_long( - user_id, project_id, comp_tasks - ) - # 5. send a heartbeat - await self._send_running_tasks_heartbeat( - user_id, project_id, iteration, dag - ) - - # 6. Update the run result - pipeline_result = await self._update_run_result_from_tasks( - user_id, project_id, iteration, comp_tasks - ) - - # 7. Are we done scheduling that pipeline? - if not dag.nodes() or pipeline_result in COMPLETED_STATES: - # there is nothing left, the run is completed, we're done here - _logger.info( - "pipeline %s scheduling completed with result %s", - f"{project_id=}", - f"{pipeline_result=}", - ) - except PipelineNotFoundError: - _logger.warning( - "pipeline %s does not exist in comp_pipeline table, it will be removed from scheduler", - f"{project_id=}", - ) - await self._set_run_result( - user_id, project_id, iteration, RunningState.ABORTED - ) - except InvalidPipelineError as exc: - _logger.warning( - "pipeline %s appears to be misconfigured, it will be removed from scheduler. 
Please check pipeline:\n%s", - f"{project_id=}", - exc, - ) - await self._set_run_result( - user_id, project_id, iteration, RunningState.ABORTED - ) - except (DaskClientAcquisisitonError, ClustersKeeperNotAvailableError): - _logger.exception( - "Unexpected error while connecting with computational backend, aborting pipeline" - ) - tasks: dict[NodeIDStr, CompTaskAtDB] = await self._get_pipeline_tasks( - project_id, dag - ) - comp_tasks_repo = CompTasksRepository(self.db_engine) - await comp_tasks_repo.update_project_tasks_state( - project_id, - [t.node_id for t in tasks.values()], - RunningState.FAILED, - ) - await self._set_run_result( - user_id, project_id, iteration, RunningState.FAILED - ) - except ComputationalBackendNotConnectedError: - _logger.exception("Computational backend is not connected!") - - async def _schedule_tasks_to_stop( - self, - user_id: UserID, - project_id: ProjectID, - comp_tasks: dict[NodeIDStr, CompTaskAtDB], - comp_run: CompRunsAtDB, - ) -> None: - # get any running task and stop them - comp_tasks_repo = CompTasksRepository.instance(self.db_engine) - await comp_tasks_repo.mark_project_published_waiting_for_cluster_tasks_as_aborted( - project_id - ) - # stop any remaining running task, these are already submitted - if tasks_to_stop := [ - t for t in comp_tasks.values() if t.state in PROCESSING_STATES - ]: - await self._stop_tasks(user_id, tasks_to_stop, comp_run) - - async def _schedule_tasks_to_start( # noqa: C901 - self, - user_id: UserID, - project_id: ProjectID, - comp_tasks: dict[NodeIDStr, CompTaskAtDB], - dag: nx.DiGraph, - comp_run: CompRunsAtDB, - wake_up_callback: Callable[[], None], - ) -> dict[NodeIDStr, CompTaskAtDB]: - # filter out the successfully completed tasks - dag.remove_nodes_from( - { - node_id - for node_id, t in comp_tasks.items() - if t.state == RunningState.SUCCESS - } - ) - dag_in_degree = dag.in_degree() - assert isinstance(dag_in_degree, InDegreeView) # nosec - next_task_node_ids = [ - node_id for node_id, degree in dag_in_degree if degree == 0 - ] - - # get the tasks to start - tasks_ready_to_start: dict[NodeID, CompTaskAtDB] = { - node_id: comp_tasks[NodeIDStr(f"{node_id}")] - for node_id in next_task_node_ids - if comp_tasks[NodeIDStr(f"{node_id}")].state in TASK_TO_START_STATES - } - - if not tasks_ready_to_start: - # nothing to do - return comp_tasks - - try: - await self._start_tasks( - user_id=user_id, - project_id=project_id, - scheduled_tasks=tasks_ready_to_start, - comp_run=comp_run, - wake_up_callback=wake_up_callback, - ) - except ( - ComputationalBackendNotConnectedError, - ComputationalSchedulerChangedError, - ): - _logger.exception( - "Issue with computational backend. 
Tasks are set back " - "to WAITING_FOR_CLUSTER state until scheduler comes back!", - ) - await CompTasksRepository.instance( - self.db_engine - ).update_project_tasks_state( - project_id, - list(tasks_ready_to_start.keys()), - RunningState.WAITING_FOR_CLUSTER, - ) - for task in tasks_ready_to_start: - comp_tasks[ - NodeIDStr(f"{task}") - ].state = RunningState.WAITING_FOR_CLUSTER - - except ComputationalBackendOnDemandNotReadyError as exc: - _logger.info( - "The on demand computational backend is not ready yet: %s", exc - ) - await publish_project_log( - self.rabbitmq_client, - user_id, - project_id, - log=f"{exc}", - log_level=logging.INFO, - ) - - await CompTasksRepository.instance( - self.db_engine - ).update_project_tasks_state( - project_id, - list(tasks_ready_to_start.keys()), - RunningState.WAITING_FOR_CLUSTER, - ) - for task in tasks_ready_to_start: - comp_tasks[ - NodeIDStr(f"{task}") - ].state = RunningState.WAITING_FOR_CLUSTER - except ClustersKeeperNotAvailableError: - _logger.exception("Unexpected error while starting tasks:") - await publish_project_log( - self.rabbitmq_client, - user_id, - project_id, - log="Unexpected error while scheduling computational tasks! TIP: contact osparc support.", - log_level=logging.ERROR, - ) - - await CompTasksRepository.instance( - self.db_engine - ).update_project_tasks_state( - project_id, - list(tasks_ready_to_start.keys()), - RunningState.FAILED, - optional_progress=1.0, - optional_stopped=arrow.utcnow().datetime, - ) - for task in tasks_ready_to_start: - comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED - raise - except TaskSchedulingError as exc: - _logger.exception("Project '%s' task could not be scheduled", project_id) - node_id = NodeID(exc.error_context()["node_id"]) - await CompTasksRepository.instance( - self.db_engine - ).update_project_tasks_state( - project_id, - [node_id], - RunningState.FAILED, - [{exc.error_code(): f"{exc}"}], - optional_progress=1.0, - optional_stopped=arrow.utcnow().datetime, - ) - comp_tasks[NodeIDStr(f"{node_id}")].state = RunningState.FAILED - except Exception: - _logger.exception( - "Unexpected error for %s with %s on %s happened when scheduling %s:", - f"{user_id=}", - f"{project_id=}", - f"{comp_run.cluster_id=}", - f"{tasks_ready_to_start.keys()=}", - ) - await CompTasksRepository.instance( - self.db_engine - ).update_project_tasks_state( - project_id, - list(tasks_ready_to_start.keys()), - RunningState.FAILED, - optional_progress=1.0, - optional_stopped=arrow.utcnow().datetime, - ) - for task in tasks_ready_to_start: - comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED - raise - - return comp_tasks - - async def _timeout_if_waiting_for_cluster_too_long( - self, - user_id: UserID, - project_id: ProjectID, - comp_tasks: dict[NodeIDStr, CompTaskAtDB], - ) -> dict[NodeIDStr, CompTaskAtDB]: - if all( - c.state is RunningState.WAITING_FOR_CLUSTER for c in comp_tasks.values() - ): - # get latest modified task - latest_modified_of_all_tasks = max( - comp_tasks.values(), key=lambda task: task.modified - ).modified - - if ( - arrow.utcnow().datetime - latest_modified_of_all_tasks - ) > datetime.timedelta(minutes=_MAX_WAITING_FOR_CLUSTER_TIMEOUT_IN_MIN): - await CompTasksRepository.instance( - self.db_engine - ).update_project_tasks_state( - project_id, - [NodeID(idstr) for idstr in comp_tasks], - RunningState.FAILED, - optional_progress=1.0, - optional_stopped=arrow.utcnow().datetime, - ) - for task in comp_tasks.values(): - task.state = RunningState.FAILED - msg = "Timed-out waiting for 
computational cluster! Please try again and/or contact Osparc support." - _logger.error(msg) - await publish_project_log( - self.rabbitmq_client, - user_id, - project_id, - log=msg, - log_level=logging.ERROR, - ) - return comp_tasks From 07c34550ea694f5922e61e017abb8de85d8dc78c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:54:04 +0100 Subject: [PATCH 07/27] mypy --- .../src/simcore_service_director_v2/core/application.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/director-v2/src/simcore_service_director_v2/core/application.py b/services/director-v2/src/simcore_service_director_v2/core/application.py index fb9d6094dff9..43a9dcc4e031 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/application.py +++ b/services/director-v2/src/simcore_service_director_v2/core/application.py @@ -126,6 +126,7 @@ def create_base_app(settings: AppSettings | None = None) -> FastAPI: for name in _NOISY_LOGGERS: logging.getLogger(name).setLevel(quiet_level) + assert settings.SC_BOOT_MODE # nosec app = FastAPI( debug=settings.SC_BOOT_MODE.is_devel_mode(), title=PROJECT_NAME, From 6d8dfbbdf6191efa1d91dafac3ba00379a558789 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:55:57 +0100 Subject: [PATCH 08/27] access to error context --- .../simcore_service_director_v2/api/routes/computations.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py b/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py index 4dd160ed6f40..6d1ca989eec1 100644 --- a/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py +++ b/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py @@ -204,7 +204,9 @@ async def _get_project_node_names( except DBProjectNotFoundError: _logger.exception("Could not find project: %s", f"{project_id=}") except ProjectNotFoundError as exc: - _logger.exception("Could not find parent project: %s", f"{exc.project_id=}") + _logger.exception( + "Could not find parent project: %s", exc.error_context()["project_id"] + ) return {} From ca60bce139248ffb2d973e0ae906e35a86e295a8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:56:57 +0100 Subject: [PATCH 09/27] correct error usage --- .../simcore_service_director_v2/core/errors.py | 2 +- .../modules/comp_scheduler/_base_scheduler.py | 16 +++++++++------- .../utils/dask_client_utils.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/errors.py b/services/director-v2/src/simcore_service_director_v2/core/errors.py index 08f10bca8597..6ec36df9aecb 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/errors.py +++ b/services/director-v2/src/simcore_service_director_v2/core/errors.py @@ -62,7 +62,7 @@ class WalletNotEnoughCreditsError(DirectorError): # SCHEDULER ERRORS # class ComputationalSchedulerError(DirectorError): - msg_template = "Computational scheduler unexpected error" + msg_template = "Computational scheduler unexpected error {msg}" class InvalidPipelineError(ComputationalSchedulerError): diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py 
b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py index 9dfae4bc6660..bd2439a618ba 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py @@ -242,7 +242,7 @@ async def stop_pipeline( } if not possible_iterations: msg = f"There are no pipeline scheduled for {user_id}:{project_id}" - raise ComputationalSchedulerError(msg) + raise ComputationalSchedulerError(msg=msg) current_max_iteration = max(possible_iterations) selected_iteration = current_max_iteration else: @@ -281,7 +281,7 @@ def _get_last_iteration(self, user_id: UserID, project_id: ProjectID) -> Iterati } if not possible_iterations: msg = f"There are no pipeline scheduled for {user_id}:{project_id}" - raise ComputationalSchedulerError(msg) + raise ComputationalSchedulerError(msg=msg) return max(possible_iterations) def _start_scheduling( @@ -929,23 +929,25 @@ async def _schedule_tasks_to_start( # noqa: C901 comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED raise except TaskSchedulingError as exc: - exc.error_context()["project_id"] + err_context = exc.error_context() _logger.exception( "Project '%s''s task '%s' could not be scheduled", - exc.error_context()["project_id"], - exc.error_context()["node_id"], + err_context["project_id"], + err_context["node_id"], ) await CompTasksRepository.instance( self.db_engine ).update_project_tasks_state( project_id, - [exc.node_id], + [err_context["node_id"]], RunningState.FAILED, exc.get_errors(), optional_progress=1.0, optional_stopped=arrow.utcnow().datetime, ) - comp_tasks[NodeIDStr(f"{exc.node_id}")].state = RunningState.FAILED + comp_tasks[ + NodeIDStr(f"{err_context['node_id']}") + ].state = RunningState.FAILED except Exception: _logger.exception( "Unexpected error for %s with %s on %s happened when scheduling %s:", diff --git a/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py b/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py index e1367e71c842..964f38e64845 100644 --- a/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py @@ -220,7 +220,7 @@ async def test_scheduler_endpoint( ) as dask_client: if dask_client.status != _DASK_SCHEDULER_RUNNING_STATE: msg = "internal scheduler is not running!" 
- raise ComputationalSchedulerError(msg) + raise ComputationalSchedulerError(msg=msg) else: gateway_auth = await get_gateway_auth_from_params(authentication) From 69fa68d2e70ee38640a12e51541fe1bb907af546 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 13:07:47 +0100 Subject: [PATCH 10/27] errors on dynamic sidecar submodule --- .../dynamic_sidecar/docker_api/_core.py | 15 ++++++----- .../dynamic_sidecar/docker_api/_utils.py | 3 +-- .../docker_service_specs/settings.py | 2 +- .../modules/dynamic_sidecar/errors.py | 26 +++++-------------- .../scheduler/_core/_scheduler.py | 16 ++++++------ .../modules/dynamic_sidecar/volumes.py | 2 +- 6 files changed, 26 insertions(+), 38 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_core.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_core.py index ed834581b03e..7772d0d67be5 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_core.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_core.py @@ -59,7 +59,7 @@ async def get_swarm_network(simcore_services_network_name: DockerNetworkName) -> f"Swarm network name (searching for '*{simcore_services_network_name}*') " f"is not configured.Found following networks: {networks}" ) - raise DynamicSidecarError(msg) + raise DynamicSidecarError(msg=msg) return networks[0] @@ -89,7 +89,7 @@ async def create_network(network_config: dict[str, Any]) -> NetworkId: # finally raise an error if a network cannot be spawned # pylint: disable=raise-missing-from msg = f"Could not create or recover a network ID for {network_config}" - raise DynamicSidecarError(msg) from e + raise DynamicSidecarError(msg=msg) from e def _to_snake_case(string: str) -> str: @@ -119,7 +119,7 @@ async def create_service_and_get_id( if "ID" not in service_start_result: msg = f"Error while starting service: {service_start_result!s}" - raise DynamicSidecarError(msg) + raise DynamicSidecarError(msg=msg) service_id: ServiceId = service_start_result["ID"] return service_id @@ -159,7 +159,10 @@ async def _get_service_latest_task(service_id: str) -> Mapping[str, Any]: last_task: Mapping[str, Any] = sorted_tasks[-1] return last_task except GenericDockerError as err: - if err.original_exception.status == status.HTTP_404_NOT_FOUND: + if ( + err.error_context()["original_exception"].status + == status.HTTP_404_NOT_FOUND + ): raise DockerServiceNotFoundError(service_id=service_id) from err raise @@ -205,7 +208,7 @@ async def _get_task_data_when_service_running(service_id: str) -> Mapping[str, A docker_node_id: None | str = task.get("NodeID", None) if not docker_node_id: msg = f"Could not find an assigned NodeID for service_id={service_id}. 
Last task inspect result: {task}" - raise DynamicSidecarError(msg) + raise DynamicSidecarError(msg=msg) return docker_node_id @@ -484,7 +487,7 @@ async def update_scheduler_data_label(scheduler_data: SchedulerData) -> None: }, ) except GenericDockerError as e: - if e.original_exception.status == status.HTTP_404_NOT_FOUND: + if e.error_context()["original_exception"].status == status.HTTP_404_NOT_FOUND: log.info( "Skipped labels update for service '%s' which could not be found.", scheduler_data.service_name, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_utils.py index ceb9d276c131..75f057e97564 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_utils.py @@ -13,8 +13,7 @@ async def docker_client() -> AsyncIterator[aiodocker.docker.Docker]: client = aiodocker.Docker() yield client except aiodocker.exceptions.DockerError as e: - message = "Unexpected error from docker client" - raise GenericDockerError(message, e) from e + raise GenericDockerError(msg=f"{e}", original_exception=e) from e finally: if client is not None: await client.close() diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/settings.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/settings.py index 78a1201a7148..8c1849064eef 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/settings.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/settings.py @@ -267,7 +267,7 @@ def remap_to_compose_spec_key() -> dict[str, SimcoreServiceLabels]: f"docker_image_name_by_services={docker_image_name_by_services}" ) log.error(message) - raise DynamicSidecarError(message) + raise DynamicSidecarError(msg=message) return remap_to_compose_spec_key() diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py index e3d67da68aa1..0ebc222e914b 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py @@ -1,34 +1,20 @@ -from aiodocker.exceptions import DockerError -from common_library.errors_classes import OsparcErrorMixin -from models_library.projects_nodes_io import NodeID - from ...core.errors import DirectorError class DynamicSidecarError(DirectorError): - pass + msg_template: str = "Unexpected dynamic sidecar error: {msg}" class GenericDockerError(DynamicSidecarError): - """Generic docker library error""" - - def __init__(self, msg: str, original_exception: DockerError): - super().__init__(msg + f": {original_exception.message}") - self.original_exception = original_exception + msg_template: str = "Unexpected error using docker: {msg}" class DynamicSidecarNotFoundError(DirectorError): - """Dynamic sidecar was not found""" - - def __init__(self, node_uuid: NodeID): - super().__init__(f"node {node_uuid} not found") + msg_template: str = "node {node_uuid} not found" class DockerServiceNotFoundError(DirectorError): - """Raised when an expected docker service is 
not found""" - - def __init__(self, service_id: str): - super().__init__(f"docker service with {service_id=} not found") + msg_template: str = "docker service with {service_id} not found" class EntrypointContainerNotFoundError(DynamicSidecarError): @@ -39,5 +25,5 @@ class LegacyServiceIsNotSupportedError(DirectorError): """This API is not implemented by the director-v0""" -class UnexpectedContainerStatusError(OsparcErrorMixin, DynamicSidecarError): - msg_template = "Unexpected status from containers: {containers_with_error}" +class UnexpectedContainerStatusError(DynamicSidecarError): + msg_template: str = "Unexpected status from containers: {containers_with_error}" diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler.py index 04853661c477..99fa3517130e 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_scheduler.py @@ -171,7 +171,7 @@ def toggle_observation(self, node_uuid: NodeID, *, disable: bool) -> bool: raises DynamicSidecarNotFoundError """ if node_uuid not in self._inverse_search_mapping: - raise DynamicSidecarNotFoundError(node_uuid) + raise DynamicSidecarNotFoundError(node_uuid=node_uuid) service_name = self._inverse_search_mapping[node_uuid] service_task = self._service_observation_task.get(service_name) @@ -274,7 +274,7 @@ async def add_service_from_scheduler_data( f"node_uuids at a global level collided. A running service for node {scheduler_data.node_uuid} already exists." " Please checkout other projects which may have this issue." 
) - raise DynamicSidecarError(msg) + raise DynamicSidecarError(msg=msg) self._inverse_search_mapping[ scheduler_data.node_uuid @@ -288,7 +288,7 @@ def is_service_tracked(self, node_uuid: NodeID) -> bool: def get_scheduler_data(self, node_uuid: NodeID) -> SchedulerData: if node_uuid not in self._inverse_search_mapping: - raise DynamicSidecarNotFoundError(node_uuid) + raise DynamicSidecarNotFoundError(node_uuid=node_uuid) service_name = self._inverse_search_mapping[node_uuid] return self._to_observe[service_name] @@ -336,7 +336,7 @@ async def mark_service_for_removal( """Marks service for removal, causing RemoveMarkedService to trigger""" async with self._lock: if node_uuid not in self._inverse_search_mapping: - raise DynamicSidecarNotFoundError(node_uuid) + raise DynamicSidecarNotFoundError(node_uuid=node_uuid) service_name = self._inverse_search_mapping[node_uuid] if service_name not in self._to_observe: @@ -416,7 +416,7 @@ async def remove_service_from_observation(self, node_uuid: NodeID) -> None: """ async with self._lock: if node_uuid not in self._inverse_search_mapping: - raise DynamicSidecarNotFoundError(node_uuid) + raise DynamicSidecarNotFoundError(node_uuid=node_uuid) service_name = self._inverse_search_mapping[node_uuid] if service_name not in self._to_observe: @@ -438,7 +438,7 @@ async def get_stack_status(self, node_uuid: NodeID) -> RunningDynamicServiceDeta raises DynamicSidecarNotFoundError """ if node_uuid not in self._inverse_search_mapping: - raise DynamicSidecarNotFoundError(node_uuid) + raise DynamicSidecarNotFoundError(node_uuid=node_uuid) service_name = self._inverse_search_mapping[node_uuid] scheduler_data: SchedulerData = self._to_observe[service_name] @@ -451,7 +451,7 @@ async def retrieve_service_inputs( ) -> RetrieveDataOutEnveloped: """Pulls data from input ports for the service""" if node_uuid not in self._inverse_search_mapping: - raise DynamicSidecarNotFoundError(node_uuid) + raise DynamicSidecarNotFoundError(node_uuid=node_uuid) service_name = self._inverse_search_mapping[node_uuid] scheduler_data: SchedulerData = self._to_observe[service_name] @@ -518,7 +518,7 @@ async def detach_project_network( async def restart_containers(self, node_uuid: NodeID) -> None: """Restarts containers without saving or restoring the state or I/O ports""" if node_uuid not in self._inverse_search_mapping: - raise DynamicSidecarNotFoundError(node_uuid) + raise DynamicSidecarNotFoundError(node_uuid=node_uuid) service_name: ServiceName = self._inverse_search_mapping[node_uuid] scheduler_data: SchedulerData = self._to_observe[service_name] diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py index d003eec60e60..7f55dc68498a 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py @@ -79,7 +79,7 @@ def _get_s3_volume_driver_config( } else: msg = f"Unexpected, all {S3Provider.__name__} should be covered" - raise DynamicSidecarError(msg) + raise DynamicSidecarError(msg=msg) assert extra_options is not None # nosec options: dict[str, Any] = driver_config["Options"] From 94432410ea638a9b4e60f26df495879995fc9e81 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 13:08:49 +0100 Subject: [PATCH 11/27] temporary fix --- .../modules/comp_scheduler/_base_scheduler.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py index bd2439a618ba..20a4a5eb8bca 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py @@ -941,7 +941,7 @@ async def _schedule_tasks_to_start( # noqa: C901 project_id, [err_context["node_id"]], RunningState.FAILED, - exc.get_errors(), + None, # exc.get_errors(), # @pcrespov I need your help here! optional_progress=1.0, optional_stopped=arrow.utcnow().datetime, ) From ea7c1cac5fbf952678aa429a891772a61661eb12 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 13:10:57 +0100 Subject: [PATCH 12/27] bad merge --- ...t_modules_comp_scheduler_dask_scheduler.py | 1630 ----------------- 1 file changed, 1630 deletions(-) delete mode 100644 services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py deleted file mode 100644 index 6984fcea12c4..000000000000 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py +++ /dev/null @@ -1,1630 +0,0 @@ -# pylint:disable=unused-variable -# pylint:disable=unused-argument -# pylint:disable=redefined-outer-name -# pylint:disable=no-value-for-parameter -# pylint:disable=protected-access -# pylint:disable=too-many-arguments -# pylint:disable=no-name-in-module -# pylint: disable=too-many-statements - - -import asyncio -import datetime -from collections.abc import AsyncIterator, Awaitable, Callable -from copy import deepcopy -from dataclasses import dataclass -from typing import Any, cast -from unittest import mock - -import aiopg -import aiopg.sa -import httpx -import pytest -from _helpers import PublishedProject, RunningProject -from dask.distributed import SpecCluster -from dask_task_models_library.container_tasks.errors import TaskCancelledError -from dask_task_models_library.container_tasks.events import TaskProgressEvent -from dask_task_models_library.container_tasks.io import TaskOutputData -from dask_task_models_library.container_tasks.protocol import TaskOwner -from faker import Faker -from fastapi.applications import FastAPI -from models_library.clusters import DEFAULT_CLUSTER_ID -from models_library.projects import ProjectAtDB, ProjectID -from models_library.projects_nodes_io import NodeID -from models_library.projects_state import RunningState -from models_library.rabbitmq_messages import ( - InstrumentationRabbitMessage, - RabbitResourceTrackingBaseMessage, - RabbitResourceTrackingHeartbeatMessage, - RabbitResourceTrackingMessages, - RabbitResourceTrackingStartedMessage, - RabbitResourceTrackingStoppedMessage, -) -from models_library.users import UserID -from pydantic import parse_obj_as, parse_raw_as -from pytest_mock.plugin import MockerFixture -from pytest_simcore.helpers.typing_env import EnvVarsDict -from servicelib.rabbitmq import RabbitMQClient -from servicelib.redis import CouldNotAcquireLockError -from settings_library.rabbit import RabbitSettings -from settings_library.redis 
import RedisSettings -from simcore_postgres_database.models.comp_runs import comp_runs -from simcore_postgres_database.models.comp_tasks import NodeClass, comp_tasks -from simcore_service_director_v2.core.application import init_app -from simcore_service_director_v2.core.errors import ( - ClustersKeeperNotAvailableError, - ComputationalBackendNotConnectedError, - ComputationalBackendOnDemandNotReadyError, - ComputationalBackendTaskNotFoundError, - ComputationalBackendTaskResultsNotReadyError, - ComputationalSchedulerChangedError, - ComputationalSchedulerError, - ConfigurationError, - PipelineNotFoundError, -) -from simcore_service_director_v2.core.settings import AppSettings -from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB -from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict -from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB, Image -from simcore_service_director_v2.models.dask_subsystem import DaskClientTaskState -from simcore_service_director_v2.modules.comp_scheduler._scheduler_base import ( - BaseCompScheduler, -) -from simcore_service_director_v2.modules.comp_scheduler._scheduler_dask import ( - DaskScheduler, -) -from simcore_service_director_v2.modules.comp_scheduler._utils import COMPLETED_STATES -from simcore_service_director_v2.modules.dask_client import ( - DaskJobID, - PublishedComputationTask, -) -from simcore_service_director_v2.utils.dask_client_utils import TaskHandlers -from starlette.testclient import TestClient -from tenacity.asyncio import AsyncRetrying -from tenacity.retry import retry_if_exception_type -from tenacity.stop import stop_after_delay -from tenacity.wait import wait_fixed - -pytest_simcore_core_services_selection = ["postgres", "rabbit", "redis"] -pytest_simcore_ops_services_selection = [ - "adminer", -] - - -def _assert_dask_client_correctly_initialized( - mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler -) -> None: - mocked_dask_client.create.assert_called_once_with( - app=mock.ANY, - settings=mock.ANY, - endpoint=mock.ANY, - authentication=mock.ANY, - tasks_file_link_type=mock.ANY, - cluster_type=mock.ANY, - ) - mocked_dask_client.register_handlers.assert_called_once_with( - TaskHandlers( - cast( # noqa: SLF001 - DaskScheduler, scheduler - )._task_progress_change_handler, - cast(DaskScheduler, scheduler)._task_log_change_handler, # noqa: SLF001 - ) - ) - - -async def _assert_comp_run_db( - aiopg_engine: aiopg.sa.engine.Engine, - pub_project: PublishedProject, - expected_state: RunningState, -) -> None: - # check the database is correctly updated, the run is published - async with aiopg_engine.acquire() as conn: - result = await conn.execute( - comp_runs.select().where( - (comp_runs.c.user_id == pub_project.project.prj_owner) - & (comp_runs.c.project_uuid == f"{pub_project.project.uuid}") - ) # there is only one entry - ) - run_entry = CompRunsAtDB.parse_obj(await result.first()) - assert ( - run_entry.result == expected_state - ), f"comp_runs: expected state '{expected_state}, found '{run_entry.result}'" - - -async def _assert_comp_tasks_db( - aiopg_engine: aiopg.sa.engine.Engine, - project_uuid: ProjectID, - task_ids: list[NodeID], - *, - expected_state: RunningState, - expected_progress: float | None, -) -> None: - # check the database is correctly updated, the run is published - async with aiopg_engine.acquire() as conn: - result = await conn.execute( - comp_tasks.select().where( - (comp_tasks.c.project_id == f"{project_uuid}") - & 
(comp_tasks.c.node_id.in_([f"{n}" for n in task_ids])) - ) # there is only one entry - ) - tasks = parse_obj_as(list[CompTaskAtDB], await result.fetchall()) - assert all( - t.state == expected_state for t in tasks - ), f"expected state: {expected_state}, found: {[t.state for t in tasks]}" - assert all( - t.progress == expected_progress for t in tasks - ), f"{expected_progress=}, found: {[t.progress for t in tasks]}" - - -async def schedule_all_pipelines(scheduler: BaseCompScheduler) -> None: - # NOTE: we take a copy of the pipelines, as this could change quickly if there are - # misconfigured pipelines that would be removed from the scheduler - # NOTE: we simulate multiple dv-2 replicas by running several times - # the same pipeline scheduling - local_pipelines = deepcopy(scheduler._scheduled_pipelines) # noqa: SLF001 - results = await asyncio.gather( - *( - scheduler.schedule_pipeline( - user_id=user_id, - project_id=project_id, - iteration=iteration, - wake_up_callback=params.scheduler_waker.set, - ) - for _ in range(3) - for ( - user_id, - project_id, - iteration, - ), params in local_pipelines.items() - ), - return_exceptions=True, - ) - # we should have exceptions 2/3 of the time - could_not_acquire_lock_count = sum( - isinstance(r, CouldNotAcquireLockError) for r in results - ) - total_results_count = len(results) - - # Check if 2/3 of the results are CouldNotAcquireLockError - # checks that scheduling is done exclusively - assert could_not_acquire_lock_count == (2 / 3) * total_results_count - - -@pytest.fixture -def minimal_scheduler_dask_config( - mock_env: EnvVarsDict, - postgres_host_config: dict[str, str], - monkeypatch: pytest.MonkeyPatch, - rabbit_service: RabbitSettings, - redis_service: RedisSettings, - faker: Faker, -) -> None: - """set a minimal configuration for testing the dask connection only""" - monkeypatch.setenv("DIRECTOR_V2_DYNAMIC_SIDECAR_ENABLED", "false") - monkeypatch.setenv("DIRECTOR_V0_ENABLED", "0") - monkeypatch.setenv("COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED", "1") - monkeypatch.setenv("COMPUTATIONAL_BACKEND_ENABLED", "1") - monkeypatch.setenv("R_CLONE_PROVIDER", "MINIO") - monkeypatch.setenv("S3_ENDPOINT", faker.url()) - monkeypatch.setenv("S3_ACCESS_KEY", faker.pystr()) - monkeypatch.setenv("S3_REGION", faker.pystr()) - monkeypatch.setenv("S3_SECRET_KEY", faker.pystr()) - monkeypatch.setenv("S3_BUCKET_NAME", faker.pystr()) - - -@pytest.fixture -def scheduler( - minimal_scheduler_dask_config: None, - aiopg_engine: aiopg.sa.engine.Engine, - minimal_app: FastAPI, -) -> BaseCompScheduler: - scheduler = _get_scheduler_worker(minimal_app) - assert scheduler is not None - return scheduler - - -@pytest.fixture -def mocked_dask_client(mocker: MockerFixture) -> mock.MagicMock: - mocked_dask_client = mocker.patch( - "simcore_service_director_v2.modules.dask_clients_pool.DaskClient", - autospec=True, - ) - mocked_dask_client.create.return_value = mocked_dask_client - return mocked_dask_client - - -@pytest.fixture -def mocked_parse_output_data_fct(mocker: MockerFixture) -> mock.Mock: - return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.parse_output_data", - autospec=True, - ) - - -@pytest.fixture -def mocked_clean_task_output_fct(mocker: MockerFixture) -> mock.MagicMock: - return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.clean_task_output_and_log_files_if_invalid", - return_value=None, - autospec=True, - ) - - -@pytest.fixture -def with_disabled_auto_scheduling(mocker: MockerFixture) 
-> mock.MagicMock: - """disables the scheduler task, note that it needs to be triggered manu>ally then""" - - def _fake_starter( - self: BaseCompScheduler, - *args, - **kwargs, - ): - scheduler_task = mocker.MagicMock() - scheduler_task_wake_up_event = mocker.MagicMock() - return scheduler_task, scheduler_task_wake_up_event - - return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._scheduler_base.BaseCompScheduler._start_scheduling", - autospec=True, - side_effect=_fake_starter, - ) - - -@pytest.fixture -async def minimal_app(async_client: httpx.AsyncClient) -> FastAPI: - # must use the minimal app from from the `async_client`` - # the`client` uses starlette's TestClient which spawns - # a new thread on which it creates a new loop - # causing issues downstream with coroutines not - # being created on the same loop - return async_client._transport.app # type: ignore # noqa: SLF001 - - -@pytest.fixture -def mocked_clean_task_output_and_log_files_if_invalid(mocker: MockerFixture) -> None: - mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.clean_task_output_and_log_files_if_invalid", - autospec=True, - ) - - -async def test_scheduler_gracefully_starts_and_stops( - minimal_scheduler_dask_config: None, - aiopg_engine: aiopg.sa.engine.Engine, - dask_spec_local_cluster: SpecCluster, - minimal_app: FastAPI, -): - # check it started correctly - assert _get_scheduler_worker(minimal_app) is not None - - -@pytest.mark.parametrize( - "missing_dependency", - [ - "COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED", - ], -) -def test_scheduler_raises_exception_for_missing_dependencies( - minimal_scheduler_dask_config: None, - aiopg_engine: aiopg.sa.engine.Engine, - dask_spec_local_cluster: SpecCluster, - monkeypatch: pytest.MonkeyPatch, - missing_dependency: str, -): - # disable the dependency - monkeypatch.setenv(missing_dependency, "0") - # create the client - settings = AppSettings.create_from_envs() - app = init_app(settings) - - with pytest.raises(ConfigurationError), TestClient( - app, raise_server_exceptions=True - ) as _: - pass - - -async def test_empty_pipeline_is_not_scheduled( - with_disabled_auto_scheduling: None, - scheduler: BaseCompScheduler, - registered_user: Callable[..., dict[str, Any]], - project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], - aiopg_engine: aiopg.sa.engine.Engine, - run_metadata: RunMetadataDict, -): - user = registered_user() - empty_project = await project(user) - - # the project is not in the comp_pipeline, therefore scheduling it should fail - with pytest.raises(PipelineNotFoundError): - await scheduler.run_new_pipeline( - user_id=user["id"], - project_id=empty_project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=False, - ) - # create the empty pipeline now - pipeline(project_id=f"{empty_project.uuid}") - - # creating a run with an empty pipeline is useless, check the scheduler is not kicking in - await scheduler.run_new_pipeline( - user_id=user["id"], - project_id=empty_project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=False, - ) - assert len(scheduler._scheduled_pipelines) == 0 # noqa: SLF001 - # check the database is empty - async with aiopg_engine.acquire() as conn: - result = await conn.scalar( - comp_runs.select().where( - (comp_runs.c.user_id == user["id"]) - & (comp_runs.c.project_uuid == f"{empty_project.uuid}") - ) # there is only one entry - ) - assert result is None - - 
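The recurring change across these patches is the move to keyword-argument errors: each exception declares a msg_template, is raised with named fields (e.g. ConfigurationError(msg=...)), and call sites read those fields back through error_context() instead of ad-hoc attributes. Below is a simplified, self-contained stand-in of that pattern; it only mimics the behaviour these patches rely on and is not the actual common_library.errors_classes implementation:

    class _ErrorMixinSketch:
        # minimal stand-in: formats msg_template with the kwargs and keeps them around
        msg_template: str = "unexpected error"

        def __init__(self, **ctx) -> None:
            self._ctx = ctx
            super().__init__(self.msg_template.format(**ctx))

        def error_context(self) -> dict:
            return dict(self._ctx)


    class _ConfigurationErrorSketch(_ErrorMixinSketch, RuntimeError):
        msg_template = "Application misconfiguration: {msg}"


    try:
        raise _ConfigurationErrorSketch(msg="Database connection is missing")
    except _ConfigurationErrorSketch as exc:
        assert exc.error_context()["msg"] == "Database connection is missing"
        assert "misconfiguration" in f"{exc}"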
-async def test_misconfigured_pipeline_is_not_scheduled( - with_disabled_auto_scheduling: None, - scheduler: BaseCompScheduler, - registered_user: Callable[..., dict[str, Any]], - project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], - fake_workbench_without_outputs: dict[str, Any], - fake_workbench_adjacency: dict[str, Any], - aiopg_engine: aiopg.sa.engine.Engine, - run_metadata: RunMetadataDict, -): - """A pipeline which comp_tasks are missing should not be scheduled. - It shall be aborted and shown as such in the comp_runs db""" - user = registered_user() - sleepers_project = await project(user, workbench=fake_workbench_without_outputs) - pipeline( - project_id=f"{sleepers_project.uuid}", - dag_adjacency_list=fake_workbench_adjacency, - ) - # check the pipeline is correctly added to the scheduled pipelines - await scheduler.run_new_pipeline( - user_id=user["id"], - project_id=sleepers_project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=False, - ) - assert len(scheduler._scheduled_pipelines) == 1 # noqa: SLF001 - for ( - u_id, - p_id, - it, - ) in scheduler._scheduled_pipelines: # noqa: SLF001 - assert u_id == user["id"] - assert p_id == sleepers_project.uuid - assert it > 0 - # check the database was properly updated - async with aiopg_engine.acquire() as conn: - result = await conn.execute( - comp_runs.select().where( - (comp_runs.c.user_id == user["id"]) - & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}") - ) # there is only one entry - ) - run_entry = CompRunsAtDB.parse_obj(await result.first()) - assert run_entry.result == RunningState.PUBLISHED - # let the scheduler kick in - await schedule_all_pipelines(scheduler) - # check the scheduled pipelines is again empty since it's misconfigured - assert len(scheduler._scheduled_pipelines) == 0 # noqa: SLF001 - # check the database entry is correctly updated - async with aiopg_engine.acquire() as conn: - result = await conn.execute( - comp_runs.select().where( - (comp_runs.c.user_id == user["id"]) - & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}") - ) # there is only one entry - ) - run_entry = CompRunsAtDB.parse_obj(await result.first()) - assert run_entry.result == RunningState.ABORTED - assert run_entry.metadata == run_metadata - - -async def _assert_start_pipeline( - aiopg_engine, - published_project: PublishedProject, - scheduler: BaseCompScheduler, - run_metadata: RunMetadataDict, -) -> list[CompTaskAtDB]: - exp_published_tasks = deepcopy(published_project.tasks) - assert published_project.project.prj_owner - await scheduler.run_new_pipeline( - user_id=published_project.project.prj_owner, - project_id=published_project.project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=False, - ) - assert ( - len(scheduler._scheduled_pipelines) == 1 # noqa: SLF001 - ), "the pipeline is not scheduled!" 
- for ( - u_id, - p_id, - it, - ) in scheduler._scheduled_pipelines: # noqa: SLF001 - assert u_id == published_project.project.prj_owner - assert p_id == published_project.project.uuid - assert it > 0 - - # check the database is correctly updated, the run is published - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in exp_published_tasks], - expected_state=RunningState.PUBLISHED, - expected_progress=None, - ) - return exp_published_tasks - - -async def _assert_schedule_pipeline_PENDING( # noqa: N802 - aiopg_engine, - published_project: PublishedProject, - published_tasks: list[CompTaskAtDB], - mocked_dask_client: mock.MagicMock, - scheduler: BaseCompScheduler, -) -> list[CompTaskAtDB]: - expected_pending_tasks = [ - published_tasks[1], - published_tasks[3], - ] - for p in expected_pending_tasks: - published_tasks.remove(p) - - async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState]: - return [DaskClientTaskState.PENDING for job_id in job_ids] - - mocked_dask_client.get_tasks_status.side_effect = _return_tasks_pending - await schedule_all_pipelines(scheduler) - _assert_dask_client_correctly_initialized(mocked_dask_client, scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_pending_tasks], - expected_state=RunningState.PENDING, - expected_progress=None, - ) - # the other tasks are still waiting in published state - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in published_tasks], - expected_state=RunningState.PUBLISHED, - expected_progress=None, # since we bypass the API entrypoint this is correct - ) - # tasks were send to the backend - assert published_project.project.prj_owner is not None - mocked_dask_client.send_computation_tasks.assert_has_calls( - calls=[ - mock.call( - user_id=published_project.project.prj_owner, - project_id=published_project.project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - tasks={f"{p.node_id}": p.image}, - callback=mock.ANY, - metadata=mock.ANY, - hardware_info=mock.ANY, - ) - for p in expected_pending_tasks - ], - any_order=True, - ) - mocked_dask_client.send_computation_tasks.reset_mock() - mocked_dask_client.get_tasks_status.assert_not_called() - mocked_dask_client.get_task_result.assert_not_called() - # there is a second run of the scheduler to move comp_runs to pending, the rest does not change - await schedule_all_pipelines(scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PENDING) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_pending_tasks], - expected_state=RunningState.PENDING, - expected_progress=None, - ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in published_tasks], - expected_state=RunningState.PUBLISHED, - expected_progress=None, - ) - mocked_dask_client.send_computation_tasks.assert_not_called() - mocked_dask_client.get_tasks_status.assert_has_calls( - calls=[mock.call([p.job_id for p in expected_pending_tasks])], any_order=True - ) - mocked_dask_client.get_tasks_status.reset_mock() - mocked_dask_client.get_task_result.assert_not_called() - return expected_pending_tasks - - -@pytest.fixture -async def 
instrumentation_rabbit_client_parser( - create_rabbitmq_client: Callable[[str], RabbitMQClient], mocker: MockerFixture -) -> AsyncIterator[mock.AsyncMock]: - client = create_rabbitmq_client("instrumentation_pytest_consumer") - mock = mocker.AsyncMock(return_value=True) - queue_name = await client.subscribe( - InstrumentationRabbitMessage.get_channel_name(), mock - ) - yield mock - await client.unsubscribe(queue_name) - - -@pytest.fixture -async def resource_tracking_rabbit_client_parser( - create_rabbitmq_client: Callable[[str], RabbitMQClient], mocker: MockerFixture -) -> AsyncIterator[mock.AsyncMock]: - client = create_rabbitmq_client("resource_tracking_pytest_consumer") - mock = mocker.AsyncMock(return_value=True) - queue_name = await client.subscribe( - RabbitResourceTrackingBaseMessage.get_channel_name(), mock - ) - yield mock - await client.unsubscribe(queue_name) - - -async def _assert_message_received( - mocked_message_parser: mock.AsyncMock, - expected_call_count: int, - message_parser: Callable, -) -> list: - async for attempt in AsyncRetrying( - wait=wait_fixed(0.1), - stop=stop_after_delay(5), - retry=retry_if_exception_type(AssertionError), - reraise=True, - ): - with attempt: - print( - f"--> waiting for rabbitmq message [{attempt.retry_state.attempt_number}, {attempt.retry_state.idle_for}]" - ) - assert mocked_message_parser.call_count == expected_call_count - print( - f"<-- rabbitmq message received after [{attempt.retry_state.attempt_number}, {attempt.retry_state.idle_for}]" - ) - parsed_messages = [ - message_parser(mocked_message_parser.call_args_list[c].args[0]) - for c in range(expected_call_count) - ] - - mocked_message_parser.reset_mock() - return parsed_messages - - -def _mock_send_computation_tasks( - tasks: list[CompTaskAtDB], mocked_dask_client: mock.MagicMock -) -> None: - node_id_to_job_id_map = {task.node_id: task.job_id for task in tasks} - - async def _send_computation_tasks( - *args, tasks: dict[NodeID, Image], **kwargs - ) -> list[PublishedComputationTask]: - for node_id in tasks: - assert NodeID(f"{node_id}") in node_id_to_job_id_map - return [ - PublishedComputationTask( - node_id=NodeID(f"{node_id}"), - job_id=DaskJobID(node_id_to_job_id_map[NodeID(f"{node_id}")]), - ) - for node_id in tasks - ] # type: ignore - - mocked_dask_client.send_computation_tasks.side_effect = _send_computation_tasks - - -async def _trigger_progress_event( - scheduler: BaseCompScheduler, - *, - job_id: str, - user_id: UserID, - project_id: ProjectID, - node_id: NodeID, -) -> None: - event = TaskProgressEvent( - job_id=job_id, - progress=0, - task_owner=TaskOwner( - user_id=user_id, - project_id=project_id, - node_id=node_id, - parent_project_id=None, - parent_node_id=None, - ), - ) - await cast(DaskScheduler, scheduler)._task_progress_change_handler( # noqa: SLF001 - event.json() - ) - - -@pytest.mark.acceptance_test() -async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 - with_disabled_auto_scheduling: None, - mocked_dask_client: mock.MagicMock, - scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, - published_project: PublishedProject, - mocked_parse_output_data_fct: mock.Mock, - mocked_clean_task_output_and_log_files_if_invalid: None, - instrumentation_rabbit_client_parser: mock.AsyncMock, - resource_tracking_rabbit_client_parser: mock.AsyncMock, - run_metadata: RunMetadataDict, -): - _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) - - expected_published_tasks = await _assert_start_pipeline( - aiopg_engine, 
published_project, scheduler, run_metadata - ) - - # ------------------------------------------------------------------------------- - # 1. first run will move comp_tasks to PENDING so the worker can take them - expected_pending_tasks = await _assert_schedule_pipeline_PENDING( - aiopg_engine, - published_project, - expected_published_tasks, - mocked_dask_client, - scheduler, - ) - - # ------------------------------------------------------------------------------- - # 2.1. the worker might be taking the task, until we get a progress we do not know - # whether it effectively started or it is still queued in the worker process - exp_started_task = expected_pending_tasks[0] - expected_pending_tasks.remove(exp_started_task) - - async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskState]: - return [ - ( - DaskClientTaskState.PENDING_OR_STARTED - if job_id == exp_started_task.job_id - else DaskClientTaskState.PENDING - ) - for job_id in job_ids - ] - - mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running - - await schedule_all_pipelines(scheduler) - - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PENDING) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], - expected_state=RunningState.PENDING, - expected_progress=None, - ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_pending_tasks], - expected_state=RunningState.PENDING, - expected_progress=None, - ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_published_tasks], - expected_state=RunningState.PUBLISHED, - expected_progress=None, # since we bypass the API entrypoint this is correct - ) - mocked_dask_client.send_computation_tasks.assert_not_called() - mocked_dask_client.get_tasks_status.assert_called_once_with( - [p.job_id for p in (exp_started_task, *expected_pending_tasks)], - ) - mocked_dask_client.get_tasks_status.reset_mock() - mocked_dask_client.get_task_result.assert_not_called() - - # ------------------------------------------------------------------------------- - # 3. 
the "worker" starts processing a task - # here we trigger a progress from the worker - assert exp_started_task.job_id - assert exp_started_task.project_id - assert exp_started_task.node_id - assert published_project.project.prj_owner - await _trigger_progress_event( - scheduler, - job_id=exp_started_task.job_id, - user_id=published_project.project.prj_owner, - project_id=exp_started_task.project_id, - node_id=exp_started_task.node_id, - ) - - await schedule_all_pipelines(scheduler) - # comp_run, the comp_task switch to STARTED - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], - expected_state=RunningState.STARTED, - expected_progress=0, - ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_pending_tasks], - expected_state=RunningState.PENDING, - expected_progress=None, - ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_published_tasks], - expected_state=RunningState.PUBLISHED, - expected_progress=None, - ) - mocked_dask_client.send_computation_tasks.assert_not_called() - mocked_dask_client.get_tasks_status.assert_called_once_with( - [p.job_id for p in (exp_started_task, *expected_pending_tasks)], - ) - mocked_dask_client.get_tasks_status.reset_mock() - mocked_dask_client.get_task_result.assert_not_called() - messages = await _assert_message_received( - instrumentation_rabbit_client_parser, 1, InstrumentationRabbitMessage.parse_raw - ) - assert messages[0].metrics == "service_started" - assert messages[0].service_uuid == exp_started_task.node_id - - def _parser(x) -> RabbitResourceTrackingMessages: - return parse_raw_as(RabbitResourceTrackingMessages, x) - - messages = await _assert_message_received( - resource_tracking_rabbit_client_parser, - 1, - RabbitResourceTrackingStartedMessage.parse_raw, - ) - assert messages[0].node_id == exp_started_task.node_id - - # ------------------------------------------------------------------------------- - # 4. 
the "worker" completed the task successfully - async def _return_1st_task_success(job_ids: list[str]) -> list[DaskClientTaskState]: - return [ - ( - DaskClientTaskState.SUCCESS - if job_id == exp_started_task.job_id - else DaskClientTaskState.PENDING - ) - for job_id in job_ids - ] - - mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_success - - async def _return_random_task_result(job_id) -> TaskOutputData: - return TaskOutputData.parse_obj({"out_1": None, "out_2": 45}) - - mocked_dask_client.get_task_result.side_effect = _return_random_task_result - await schedule_all_pipelines(scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], - expected_state=RunningState.SUCCESS, - expected_progress=1, - ) - messages = await _assert_message_received( - instrumentation_rabbit_client_parser, 1, InstrumentationRabbitMessage.parse_raw - ) - assert messages[0].metrics == "service_stopped" - assert messages[0].service_uuid == exp_started_task.node_id - messages = await _assert_message_received( - resource_tracking_rabbit_client_parser, - 1, - RabbitResourceTrackingStoppedMessage.parse_raw, - ) - - completed_tasks = [exp_started_task] - next_pending_task = published_project.tasks[2] - expected_pending_tasks.append(next_pending_task) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_pending_tasks], - expected_state=RunningState.PENDING, - expected_progress=None, - ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [ - p.node_id - for p in published_project.tasks - if p not in expected_pending_tasks + completed_tasks - ], - expected_state=RunningState.PUBLISHED, - expected_progress=None, # since we bypass the API entrypoint this is correct - ) - mocked_dask_client.send_computation_tasks.assert_called_once_with( - user_id=published_project.project.prj_owner, - project_id=published_project.project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - tasks={ - f"{next_pending_task.node_id}": next_pending_task.image, - }, - callback=mock.ANY, - metadata=mock.ANY, - hardware_info=mock.ANY, - ) - mocked_dask_client.send_computation_tasks.reset_mock() - mocked_dask_client.get_tasks_status.assert_has_calls( - calls=[ - mock.call([p.job_id for p in completed_tasks + expected_pending_tasks[:1]]) - ], - any_order=True, - ) - mocked_dask_client.get_tasks_status.reset_mock() - mocked_dask_client.get_task_result.assert_called_once_with( - completed_tasks[0].job_id - ) - mocked_dask_client.get_task_result.reset_mock() - mocked_parse_output_data_fct.assert_called_once_with( - mock.ANY, - completed_tasks[0].job_id, - await _return_random_task_result(completed_tasks[0].job_id), - ) - mocked_parse_output_data_fct.reset_mock() - - # ------------------------------------------------------------------------------- - # 6. 
the "worker" starts processing a task - exp_started_task = next_pending_task - - async def _return_2nd_task_running(job_ids: list[str]) -> list[DaskClientTaskState]: - return [ - ( - DaskClientTaskState.PENDING_OR_STARTED - if job_id == exp_started_task.job_id - else DaskClientTaskState.PENDING - ) - for job_id in job_ids - ] - - mocked_dask_client.get_tasks_status.side_effect = _return_2nd_task_running - # trigger the scheduler, run state should keep to STARTED, task should be as well - assert exp_started_task.job_id - await _trigger_progress_event( - scheduler, - job_id=exp_started_task.job_id, - user_id=published_project.project.prj_owner, - project_id=exp_started_task.project_id, - node_id=exp_started_task.node_id, - ) - await schedule_all_pipelines(scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], - expected_state=RunningState.STARTED, - expected_progress=0, - ) - mocked_dask_client.send_computation_tasks.assert_not_called() - expected_pending_tasks.reverse() - mocked_dask_client.get_tasks_status.assert_called_once_with( - [p.job_id for p in expected_pending_tasks] - ) - mocked_dask_client.get_tasks_status.reset_mock() - mocked_dask_client.get_task_result.assert_not_called() - messages = await _assert_message_received( - instrumentation_rabbit_client_parser, 1, InstrumentationRabbitMessage.parse_raw - ) - assert messages[0].metrics == "service_started" - assert messages[0].service_uuid == exp_started_task.node_id - messages = await _assert_message_received( - resource_tracking_rabbit_client_parser, - 1, - RabbitResourceTrackingStartedMessage.parse_raw, - ) - assert messages[0].node_id == exp_started_task.node_id - - # ------------------------------------------------------------------------------- - # 7. the task fails - async def _return_2nd_task_failed(job_ids: list[str]) -> list[DaskClientTaskState]: - return [ - ( - DaskClientTaskState.ERRED - if job_id == exp_started_task.job_id - else DaskClientTaskState.PENDING - ) - for job_id in job_ids - ] - - mocked_dask_client.get_tasks_status.side_effect = _return_2nd_task_failed - mocked_dask_client.get_task_result.side_effect = None - await schedule_all_pipelines(scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], - expected_state=RunningState.FAILED, - expected_progress=1, - ) - mocked_dask_client.send_computation_tasks.assert_not_called() - mocked_dask_client.get_tasks_status.assert_called_once_with( - [p.job_id for p in expected_pending_tasks] - ) - mocked_dask_client.get_tasks_status.reset_mock() - mocked_dask_client.get_task_result.assert_called_once_with(exp_started_task.job_id) - mocked_dask_client.get_task_result.reset_mock() - mocked_parse_output_data_fct.assert_not_called() - expected_pending_tasks.remove(exp_started_task) - messages = await _assert_message_received( - instrumentation_rabbit_client_parser, 1, InstrumentationRabbitMessage.parse_raw - ) - assert messages[0].metrics == "service_stopped" - assert messages[0].service_uuid == exp_started_task.node_id - messages = await _assert_message_received( - resource_tracking_rabbit_client_parser, - 1, - RabbitResourceTrackingStoppedMessage.parse_raw, - ) - - # ------------------------------------------------------------------------------- - # 8. 
the last task shall succeed - exp_started_task = expected_pending_tasks[0] - - async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskState]: - return [ - ( - DaskClientTaskState.SUCCESS - if job_id == exp_started_task.job_id - else DaskClientTaskState.PENDING - ) - for job_id in job_ids - ] - - mocked_dask_client.get_tasks_status.side_effect = _return_3rd_task_success - mocked_dask_client.get_task_result.side_effect = _return_random_task_result - - # trigger the scheduler, it should switch to FAILED, as we are done - await schedule_all_pipelines(scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) - - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], - expected_state=RunningState.SUCCESS, - expected_progress=1, - ) - mocked_dask_client.send_computation_tasks.assert_not_called() - mocked_dask_client.get_tasks_status.assert_called_once_with( - [p.job_id for p in expected_pending_tasks] - ) - mocked_dask_client.get_task_result.assert_called_once_with(exp_started_task.job_id) - messages = await _assert_message_received( - instrumentation_rabbit_client_parser, 2, InstrumentationRabbitMessage.parse_raw - ) - # NOTE: the service was fast and went directly to success - assert messages[0].metrics == "service_started" - assert messages[0].service_uuid == exp_started_task.node_id - assert messages[1].metrics == "service_stopped" - assert messages[1].service_uuid == exp_started_task.node_id - messages = await _assert_message_received( - resource_tracking_rabbit_client_parser, - 2, - _parser, - ) - assert isinstance(messages[0], RabbitResourceTrackingStartedMessage) - assert isinstance(messages[1], RabbitResourceTrackingStoppedMessage) - - # the scheduled pipeline shall be removed - assert scheduler._scheduled_pipelines == {} # noqa: SLF001 - - -async def test_task_progress_triggers( - with_disabled_auto_scheduling: None, - mocked_dask_client: mock.MagicMock, - scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, - published_project: PublishedProject, - mocked_parse_output_data_fct: None, - mocked_clean_task_output_and_log_files_if_invalid: None, - run_metadata: RunMetadataDict, -): - _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) - expected_published_tasks = await _assert_start_pipeline( - aiopg_engine, published_project, scheduler, run_metadata - ) - # ------------------------------------------------------------------------------- - # 1. first run will move comp_tasks to PENDING so the worker can take them - expected_pending_tasks = await _assert_schedule_pipeline_PENDING( - aiopg_engine, - published_project, - expected_published_tasks, - mocked_dask_client, - scheduler, - ) - - # send some progress - started_task = expected_pending_tasks[0] - assert started_task.job_id - assert published_project.project.prj_owner - for progress in [-1, 0, 0.3, 0.5, 1, 1.5, 0.7, 0, 20]: - progress_event = TaskProgressEvent( - job_id=started_task.job_id, - progress=progress, - task_owner=TaskOwner( - user_id=published_project.project.prj_owner, - project_id=published_project.project.uuid, - node_id=started_task.node_id, - parent_node_id=None, - parent_project_id=None, - ), - ) - await cast( # noqa: SLF001 - DaskScheduler, scheduler - )._task_progress_change_handler(progress_event.json()) - # NOTE: not sure whether it should switch to STARTED.. 
it would make sense - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [started_task.node_id], - expected_state=RunningState.STARTED, - expected_progress=min(max(0, progress), 1), - ) - - -@pytest.mark.parametrize( - "backend_error", - [ - ComputationalBackendNotConnectedError(msg="faked disconnected backend"), - ComputationalSchedulerChangedError( - original_scheduler_id="some_old_scheduler_id", - current_scheduler_id="some_new_scheduler_id", - ), - ], -) -async def test_handling_of_disconnected_scheduler_dask( - with_disabled_auto_scheduling: None, - mocked_dask_client: mock.MagicMock, - scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, - mocker: MockerFixture, - published_project: PublishedProject, - backend_error: ComputationalSchedulerError, - run_metadata: RunMetadataDict, -): - # this will create a non connected backend issue that will trigger re-connection - mocked_dask_client_send_task = mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.DaskClient.send_computation_tasks", - side_effect=backend_error, - ) - assert mocked_dask_client_send_task - - # running the pipeline will now raise and the tasks are set back to PUBLISHED - assert published_project.project.prj_owner - await scheduler.run_new_pipeline( - user_id=published_project.project.prj_owner, - project_id=published_project.project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=False, - ) - - # since there is no cluster, there is no dask-scheduler, - # the tasks shall all still be in PUBLISHED state now - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) - - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in published_project.tasks], - expected_state=RunningState.PUBLISHED, - expected_progress=None, - ) - # on the next iteration of the pipeline it will try to re-connect - # now try to abort the tasks since we are wondering what is happening, this should auto-trigger the scheduler - await scheduler.stop_pipeline( - user_id=published_project.project.prj_owner, - project_id=published_project.project.uuid, - ) - # we ensure the scheduler was run - await schedule_all_pipelines(scheduler) - # after this step the tasks are marked as ABORTED - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [ - t.node_id - for t in published_project.tasks - if t.node_class == NodeClass.COMPUTATIONAL - ], - expected_state=RunningState.ABORTED, - expected_progress=1, - ) - # then we have another scheduler run - await schedule_all_pipelines(scheduler) - # now the run should be ABORTED - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.ABORTED) - - -@dataclass(frozen=True, kw_only=True) -class RebootState: - dask_task_status: DaskClientTaskState - task_result: Exception | TaskOutputData - expected_task_state_group1: RunningState - expected_task_progress_group1: float - expected_task_state_group2: RunningState - expected_task_progress_group2: float - expected_run_state: RunningState - - -@pytest.mark.parametrize( - "reboot_state", - [ - pytest.param( - RebootState( - dask_task_status=DaskClientTaskState.LOST, - task_result=ComputationalBackendTaskNotFoundError(job_id="fake_job_id"), - expected_task_state_group1=RunningState.FAILED, - expected_task_progress_group1=1, - expected_task_state_group2=RunningState.ABORTED, - expected_task_progress_group2=1, - 
expected_run_state=RunningState.FAILED, - ), - id="reboot with lost tasks", - ), - pytest.param( - RebootState( - dask_task_status=DaskClientTaskState.ABORTED, - task_result=TaskCancelledError(job_id="fake_job_id"), - expected_task_state_group1=RunningState.ABORTED, - expected_task_progress_group1=1, - expected_task_state_group2=RunningState.ABORTED, - expected_task_progress_group2=1, - expected_run_state=RunningState.ABORTED, - ), - id="reboot with aborted tasks", - ), - pytest.param( - RebootState( - dask_task_status=DaskClientTaskState.ERRED, - task_result=ValueError("some error during the call"), - expected_task_state_group1=RunningState.FAILED, - expected_task_progress_group1=1, - expected_task_state_group2=RunningState.ABORTED, - expected_task_progress_group2=1, - expected_run_state=RunningState.FAILED, - ), - id="reboot with failed tasks", - ), - pytest.param( - RebootState( - dask_task_status=DaskClientTaskState.PENDING_OR_STARTED, - task_result=ComputationalBackendTaskResultsNotReadyError( - job_id="fake_job_id" - ), - expected_task_state_group1=RunningState.STARTED, - expected_task_progress_group1=0, - expected_task_state_group2=RunningState.STARTED, - expected_task_progress_group2=0, - expected_run_state=RunningState.STARTED, - ), - id="reboot with running tasks", - ), - pytest.param( - RebootState( - dask_task_status=DaskClientTaskState.SUCCESS, - task_result=TaskOutputData.parse_obj({"whatever_output": 123}), - expected_task_state_group1=RunningState.SUCCESS, - expected_task_progress_group1=1, - expected_task_state_group2=RunningState.SUCCESS, - expected_task_progress_group2=1, - expected_run_state=RunningState.SUCCESS, - ), - id="reboot with completed tasks", - ), - ], -) -async def test_handling_scheduling_after_reboot( - with_disabled_auto_scheduling: None, - mocked_dask_client: mock.MagicMock, - aiopg_engine: aiopg.sa.engine.Engine, - running_project: RunningProject, - scheduler: BaseCompScheduler, - mocked_parse_output_data_fct: mock.MagicMock, - mocked_clean_task_output_fct: mock.MagicMock, - reboot_state: RebootState, -): - """After the dask client is rebooted, or that the director-v2 reboots the dv-2 internal scheduler - shall continue scheduling correctly. 
Even though the task might have continued to run - in the dask-scheduler.""" - - async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskState]: - return [reboot_state.dask_task_status for j in job_ids] - - mocked_dask_client.get_tasks_status.side_effect = mocked_get_tasks_status - - async def mocked_get_task_result(_job_id: str) -> TaskOutputData: - if isinstance(reboot_state.task_result, Exception): - raise reboot_state.task_result - return reboot_state.task_result - - mocked_dask_client.get_task_result.side_effect = mocked_get_task_result - - await schedule_all_pipelines(scheduler) - # the status will be called once for all RUNNING tasks - mocked_dask_client.get_tasks_status.assert_called_once() - if reboot_state.expected_run_state in COMPLETED_STATES: - mocked_dask_client.get_task_result.assert_has_calls( - [ - mock.call(t.job_id) - for t in running_project.tasks - if t.node_class == NodeClass.COMPUTATIONAL - ], - any_order=True, - ) - else: - mocked_dask_client.get_task_result.assert_not_called() - if reboot_state.expected_run_state in [RunningState.ABORTED, RunningState.FAILED]: - # the clean up of the outputs should be done - mocked_clean_task_output_fct.assert_has_calls( - [ - mock.call( - mock.ANY, - running_project.project.prj_owner, - running_project.project.uuid, - t.node_id, - ) - for t in running_project.tasks - if t.node_class == NodeClass.COMPUTATIONAL - ], - any_order=True, - ) - else: - mocked_clean_task_output_fct.assert_not_called() - - await _assert_comp_tasks_db( - aiopg_engine, - running_project.project.uuid, - [ - running_project.tasks[1].node_id, - running_project.tasks[2].node_id, - running_project.tasks[3].node_id, - ], - expected_state=reboot_state.expected_task_state_group1, - expected_progress=reboot_state.expected_task_progress_group1, - ) - await _assert_comp_tasks_db( - aiopg_engine, - running_project.project.uuid, - [running_project.tasks[4].node_id], - expected_state=reboot_state.expected_task_state_group2, - expected_progress=reboot_state.expected_task_progress_group2, - ) - assert running_project.project.prj_owner - await _assert_comp_run_db( - aiopg_engine, running_project, reboot_state.expected_run_state - ) - - -async def test_handling_cancellation_of_jobs_after_reboot( - with_disabled_auto_scheduling: None, - mocked_dask_client: mock.MagicMock, - aiopg_engine: aiopg.sa.engine.Engine, - running_project_mark_for_cancellation: RunningProject, - scheduler: BaseCompScheduler, - mocked_parse_output_data_fct: mock.MagicMock, - mocked_clean_task_output_fct: mock.MagicMock, -): - """A running pipeline was cancelled by a user and the DV-2 was restarted BEFORE - It could actually cancel the task. 
On reboot the DV-2 shall recover - and actually cancel the pipeline properly""" - - # check initial status - await _assert_comp_run_db( - aiopg_engine, running_project_mark_for_cancellation, RunningState.STARTED - ) - await _assert_comp_tasks_db( - aiopg_engine, - running_project_mark_for_cancellation.project.uuid, - [t.node_id for t in running_project_mark_for_cancellation.tasks], - expected_state=RunningState.STARTED, - expected_progress=0, - ) - - # the backend shall report the tasks as running - async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskState]: - return [DaskClientTaskState.PENDING_OR_STARTED for j in job_ids] - - mocked_dask_client.get_tasks_status.side_effect = mocked_get_tasks_status - # Running the scheduler, should actually cancel the run now - await schedule_all_pipelines(scheduler) - mocked_dask_client.abort_computation_task.assert_called() - assert mocked_dask_client.abort_computation_task.call_count == len( - [ - t.node_id - for t in running_project_mark_for_cancellation.tasks - if t.node_class == NodeClass.COMPUTATIONAL - ] - ) - # in the DB they are still running, they will be stopped in the next iteration - await _assert_comp_tasks_db( - aiopg_engine, - running_project_mark_for_cancellation.project.uuid, - [ - t.node_id - for t in running_project_mark_for_cancellation.tasks - if t.node_class == NodeClass.COMPUTATIONAL - ], - expected_state=RunningState.STARTED, - expected_progress=0, - ) - await _assert_comp_run_db( - aiopg_engine, running_project_mark_for_cancellation, RunningState.STARTED - ) - - # the backend shall now report the tasks as aborted - async def mocked_get_tasks_status_aborted( - job_ids: list[str], - ) -> list[DaskClientTaskState]: - return [DaskClientTaskState.ABORTED for j in job_ids] - - mocked_dask_client.get_tasks_status.side_effect = mocked_get_tasks_status_aborted - - async def _return_random_task_result(job_id) -> TaskOutputData: - raise TaskCancelledError - - mocked_dask_client.get_task_result.side_effect = _return_random_task_result - await schedule_all_pipelines(scheduler) - # now should be stopped - await _assert_comp_tasks_db( - aiopg_engine, - running_project_mark_for_cancellation.project.uuid, - [ - t.node_id - for t in running_project_mark_for_cancellation.tasks - if t.node_class == NodeClass.COMPUTATIONAL - ], - expected_state=RunningState.ABORTED, - expected_progress=1, - ) - await _assert_comp_run_db( - aiopg_engine, running_project_mark_for_cancellation, RunningState.ABORTED - ) - mocked_clean_task_output_fct.assert_called() - - -@pytest.fixture -def with_fast_service_heartbeat_s(monkeypatch: pytest.MonkeyPatch) -> int: - seconds = 1 - monkeypatch.setenv("SERVICE_TRACKING_HEARTBEAT", f"{seconds}") - return seconds - - -async def test_running_pipeline_triggers_heartbeat( - with_disabled_auto_scheduling: None, - with_fast_service_heartbeat_s: int, - mocked_dask_client: mock.MagicMock, - scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, - published_project: PublishedProject, - resource_tracking_rabbit_client_parser: mock.AsyncMock, - run_metadata: RunMetadataDict, -): - _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) - expected_published_tasks = await _assert_start_pipeline( - aiopg_engine, published_project, scheduler, run_metadata - ) - # ------------------------------------------------------------------------------- - # 1. 
first run will move comp_tasks to PENDING so the worker can take them - expected_pending_tasks = await _assert_schedule_pipeline_PENDING( - aiopg_engine, - published_project, - expected_published_tasks, - mocked_dask_client, - scheduler, - ) - # ------------------------------------------------------------------------------- - # 2. the "worker" starts processing a task - exp_started_task = expected_pending_tasks[0] - expected_pending_tasks.remove(exp_started_task) - - async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskState]: - return [ - ( - DaskClientTaskState.PENDING_OR_STARTED - if job_id == exp_started_task.job_id - else DaskClientTaskState.PENDING - ) - for job_id in job_ids - ] - - mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running - assert exp_started_task.job_id - assert published_project.project.prj_owner - await _trigger_progress_event( - scheduler, - job_id=exp_started_task.job_id, - user_id=published_project.project.prj_owner, - project_id=exp_started_task.project_id, - node_id=exp_started_task.node_id, - ) - await schedule_all_pipelines(scheduler) - - messages = await _assert_message_received( - resource_tracking_rabbit_client_parser, - 1, - RabbitResourceTrackingStartedMessage.parse_raw, - ) - assert messages[0].node_id == exp_started_task.node_id - - # ------------------------------------------------------------------------------- - # 3. wait a bit and run again we should get another heartbeat, but only one! - await asyncio.sleep(with_fast_service_heartbeat_s + 1) - await schedule_all_pipelines(scheduler) - await schedule_all_pipelines(scheduler) - messages = await _assert_message_received( - resource_tracking_rabbit_client_parser, - 1, - RabbitResourceTrackingHeartbeatMessage.parse_raw, - ) - assert isinstance(messages[0], RabbitResourceTrackingHeartbeatMessage) - - # ------------------------------------------------------------------------------- - # 4. wait a bit and run again we should get another heartbeat, but only one! 
- await asyncio.sleep(with_fast_service_heartbeat_s + 1) - await schedule_all_pipelines(scheduler) - await schedule_all_pipelines(scheduler) - messages = await _assert_message_received( - resource_tracking_rabbit_client_parser, - 1, - RabbitResourceTrackingHeartbeatMessage.parse_raw, - ) - assert isinstance(messages[0], RabbitResourceTrackingHeartbeatMessage) - - -@pytest.fixture -async def mocked_get_or_create_cluster(mocker: MockerFixture) -> mock.Mock: - return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.get_or_create_on_demand_cluster", - autospec=True, - ) - - -async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( - with_disabled_auto_scheduling: None, - scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, - published_project: PublishedProject, - run_metadata: RunMetadataDict, - mocked_get_or_create_cluster: mock.Mock, - faker: Faker, -): - mocked_get_or_create_cluster.side_effect = ( - ComputationalBackendOnDemandNotReadyError( - eta=faker.time_delta(datetime.timedelta(hours=1)) - ) - ) - # running the pipeline will trigger a call to the clusters-keeper - assert published_project.project.prj_owner - await scheduler.run_new_pipeline( - user_id=published_project.project.prj_owner, - project_id=published_project.project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=True, - ) - - # we ask to use an on-demand cluster, therefore the tasks are published first - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in published_project.tasks], - expected_state=RunningState.PUBLISHED, - expected_progress=None, - ) - mocked_get_or_create_cluster.assert_not_called() - # now it should switch to waiting - expected_waiting_tasks = [ - published_project.tasks[1], - published_project.tasks[3], - ] - await schedule_all_pipelines(scheduler) - mocked_get_or_create_cluster.assert_called() - assert mocked_get_or_create_cluster.call_count == 1 - mocked_get_or_create_cluster.reset_mock() - await _assert_comp_run_db( - aiopg_engine, published_project, RunningState.WAITING_FOR_CLUSTER - ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in expected_waiting_tasks], - expected_state=RunningState.WAITING_FOR_CLUSTER, - expected_progress=None, - ) - # again will trigger the same response - await schedule_all_pipelines(scheduler) - mocked_get_or_create_cluster.assert_called() - assert mocked_get_or_create_cluster.call_count == 1 - mocked_get_or_create_cluster.reset_mock() - await _assert_comp_run_db( - aiopg_engine, published_project, RunningState.WAITING_FOR_CLUSTER - ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in expected_waiting_tasks], - expected_state=RunningState.WAITING_FOR_CLUSTER, - expected_progress=None, - ) - - -@pytest.mark.parametrize( - "get_or_create_exception", - [ClustersKeeperNotAvailableError], -) -async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( - with_disabled_auto_scheduling: None, - scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, - published_project: PublishedProject, - run_metadata: RunMetadataDict, - mocked_get_or_create_cluster: mock.Mock, - get_or_create_exception: Exception, -): - mocked_get_or_create_cluster.side_effect = get_or_create_exception - # running the pipeline will 
trigger a call to the clusters-keeper - assert published_project.project.prj_owner - await scheduler.run_new_pipeline( - user_id=published_project.project.prj_owner, - project_id=published_project.project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=True, - ) - - # we ask to use an on-demand cluster, therefore the tasks are published first - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in published_project.tasks], - expected_state=RunningState.PUBLISHED, - expected_progress=None, - ) - # now it should switch to failed, the run still runs until the next iteration - expected_failed_tasks = [ - published_project.tasks[1], - published_project.tasks[3], - ] - await schedule_all_pipelines(scheduler) - mocked_get_or_create_cluster.assert_called() - assert mocked_get_or_create_cluster.call_count == 1 - mocked_get_or_create_cluster.reset_mock() - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in expected_failed_tasks], - expected_state=RunningState.FAILED, - expected_progress=1.0, - ) - # again will not re-trigger the call to clusters-keeper - await schedule_all_pipelines(scheduler) - mocked_get_or_create_cluster.assert_not_called() - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in expected_failed_tasks], - expected_state=RunningState.FAILED, - expected_progress=1.0, - ) From 1b70a152adf08a327c2193dc9ff1a52cbd9fe380 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 13:11:29 +0100 Subject: [PATCH 13/27] linter --- .../src/simcore_service_director_v2/core/errors.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/errors.py b/services/director-v2/src/simcore_service_director_v2/core/errors.py index 6ec36df9aecb..6851f3f96866 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/errors.py +++ b/services/director-v2/src/simcore_service_director_v2/core/errors.py @@ -73,7 +73,9 @@ class TaskSchedulingError(ComputationalSchedulerError): msg_template = "Computational scheduler: Task {node_id} in project {project_id} could not be scheduled {msg}" -class MissingComputationalResourcesError(TaskSchedulingError): +class MissingComputationalResourcesError( + TaskSchedulingError +): # pylint: disable=too-many-ancestors msg_template = ( "Service {service_name}:{service_version} cannot be scheduled " "on cluster {cluster_id}: task needs '{task_resources}', " @@ -81,7 +83,9 @@ class MissingComputationalResourcesError(TaskSchedulingError): ) -class InsuficientComputationalResourcesError(TaskSchedulingError): +class InsuficientComputationalResourcesError( + TaskSchedulingError +): # pylint: disable=too-many-ancestors msg_template: str = ( "Insufficient computational resources to run {service_name}:{service_version} with {service_requested_resources} on cluster {cluster_id}." 
"Cluster available workers: {cluster_available_resources}" @@ -89,7 +93,7 @@ class InsuficientComputationalResourcesError(TaskSchedulingError): ) -class PortsValidationError(TaskSchedulingError): +class PortsValidationError(TaskSchedulingError): # pylint: disable=too-many-ancestors msg_template: str = ( "Node {node_id} in {project_id} with ports having invalid values {errors_list}" ) From 42404d8e6659010dc922cddd53651c6d5ab1fb6d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:09:00 +0100 Subject: [PATCH 14/27] revert --- .../core/errors.py | 21 +++++++++++++++++++ .../modules/comp_scheduler/_base_scheduler.py | 13 +++++------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/errors.py b/services/director-v2/src/simcore_service_director_v2/core/errors.py index 6851f3f96866..e783e437a6a6 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/errors.py +++ b/services/director-v2/src/simcore_service_director_v2/core/errors.py @@ -20,6 +20,9 @@ """ from common_library.errors_classes import OsparcErrorMixin +from models_library.errors import ErrorDict +from models_library.projects import ProjectID +from models_library.projects_nodes_io import NodeID class DirectorError(OsparcErrorMixin, RuntimeError): @@ -72,6 +75,24 @@ class InvalidPipelineError(ComputationalSchedulerError): class TaskSchedulingError(ComputationalSchedulerError): msg_template = "Computational scheduler: Task {node_id} in project {project_id} could not be scheduled {msg}" + def __init__(self, project_id: ProjectID, node_id: NodeID, msg: str | None) -> None: + super().__init__(msg=msg) + self.project_id = project_id + self.node_id = node_id + + def get_errors(self) -> list[ErrorDict]: + # default implementation + return [ + { + "loc": ( + f"{self.project_id}", + f"{self.node_id}", + ), + "msg": f"{self.args[0]}", + "type": self.code, + }, + ] + class MissingComputationalResourcesError( TaskSchedulingError diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py index 20a4a5eb8bca..554594b5b519 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py @@ -929,25 +929,22 @@ async def _schedule_tasks_to_start( # noqa: C901 comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED raise except TaskSchedulingError as exc: - err_context = exc.error_context() _logger.exception( "Project '%s''s task '%s' could not be scheduled", - err_context["project_id"], - err_context["node_id"], + exc.project_id, + exc.node_id, ) await CompTasksRepository.instance( self.db_engine ).update_project_tasks_state( project_id, - [err_context["node_id"]], + [exc.node_id], RunningState.FAILED, - None, # exc.get_errors(), # @pcrespov I need your help here! + exc.get_errors(), # @pcrespov I need your help here! 
optional_progress=1.0, optional_stopped=arrow.utcnow().datetime, ) - comp_tasks[ - NodeIDStr(f"{err_context['node_id']}") - ].state = RunningState.FAILED + comp_tasks[NodeIDStr(f"{exc.node_id}")].state = RunningState.FAILED except Exception: _logger.exception( "Unexpected error for %s with %s on %s happened when scheduling %s:", From f78009d9af1f794108ba8e0de1c54500ca812397 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:09:05 +0100 Subject: [PATCH 15/27] typing --- packages/common-library/src/common_library/errors_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/common-library/src/common_library/errors_classes.py b/packages/common-library/src/common_library/errors_classes.py index 83e40b2a2b0e..dfee557d38ce 100644 --- a/packages/common-library/src/common_library/errors_classes.py +++ b/packages/common-library/src/common_library/errors_classes.py @@ -45,7 +45,7 @@ def _get_full_class_name(cls) -> str: ] return ".".join(reversed(relevant_classes)) - def error_context(self): + def error_context(self) -> dict[str, Any]: """Returns context in which error occurred and stored within the exception""" return dict(**self.__dict__) From 6eb0dd3064e6aac03534ca402b686780bff6266a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:09:52 +0100 Subject: [PATCH 16/27] revert --- .../modules/comp_scheduler/_dask_scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py index e578715eb799..2fdf7acd2e9e 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py @@ -345,10 +345,11 @@ async def _process_task_result( except TaskSchedulingError as err: task_final_state = RunningState.FAILED simcore_platform_status = SimcorePlatformStatus.BAD + errors = err.get_errors() _logger.debug( "Unexpected failure while processing results of %s: %s", f"{task=}", - f"{err=}", + f"{errors=}", ) # resource tracking From 0b668edfc687d5319622e50c67e3b5ffb7d562df Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:12:18 +0100 Subject: [PATCH 17/27] now it works --- .../src/simcore_service_director_v2/core/errors.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/errors.py b/services/director-v2/src/simcore_service_director_v2/core/errors.py index e783e437a6a6..18a5b674ed2e 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/errors.py +++ b/services/director-v2/src/simcore_service_director_v2/core/errors.py @@ -19,6 +19,8 @@ } """ +from typing import Any + from common_library.errors_classes import OsparcErrorMixin from models_library.errors import ErrorDict from models_library.projects import ProjectID @@ -75,8 +77,8 @@ class InvalidPipelineError(ComputationalSchedulerError): class TaskSchedulingError(ComputationalSchedulerError): msg_template = "Computational scheduler: Task {node_id} in project {project_id} could not be scheduled {msg}" - def __init__(self, project_id: ProjectID, node_id: NodeID, msg: str | None) -> None: - super().__init__(msg=msg) + 
def __init__(self, project_id: ProjectID, node_id: NodeID, **ctx: Any) -> None: + super().__init__(project_id=project_id, node_id=node_id, **ctx) self.project_id = project_id self.node_id = node_id From 6d219e96340aa92794019a3761b111c89d693915 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:13:49 +0100 Subject: [PATCH 18/27] use get --- .../src/simcore_service_director_v2/api/routes/computations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py b/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py index 6d1ca989eec1..251e35fa6383 100644 --- a/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py +++ b/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py @@ -205,7 +205,7 @@ async def _get_project_node_names( _logger.exception("Could not find project: %s", f"{project_id=}") except ProjectNotFoundError as exc: _logger.exception( - "Could not find parent project: %s", exc.error_context()["project_id"] + "Could not find parent project: %s", exc.error_context().get("project_id") ) return {} From 9a02ad3b8917d42e1c91efcbc2acbc3381c47a9b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:14:55 +0100 Subject: [PATCH 19/27] remove comment --- .../modules/comp_scheduler/_base_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py index 554594b5b519..2d663aec9a1f 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py @@ -940,7 +940,7 @@ async def _schedule_tasks_to_start( # noqa: C901 project_id, [exc.node_id], RunningState.FAILED, - exc.get_errors(), # @pcrespov I need your help here! 
+ exc.get_errors(), optional_progress=1.0, optional_stopped=arrow.utcnow().datetime, ) From 85875e802a9580125b75169cf1052ada60ed905a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:18:50 +0100 Subject: [PATCH 20/27] revert --- .../modules/dynamic_sidecar/docker_api/_core.py | 2 +- .../modules/dynamic_sidecar/errors.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_core.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_core.py index 7772d0d67be5..1e05524b48de 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_core.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_core.py @@ -487,7 +487,7 @@ async def update_scheduler_data_label(scheduler_data: SchedulerData) -> None: }, ) except GenericDockerError as e: - if e.error_context()["original_exception"].status == status.HTTP_404_NOT_FOUND: + if e.original_exception.status == status.HTTP_404_NOT_FOUND: log.info( "Skipped labels update for service '%s' which could not be found.", scheduler_data.service_name, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py index 0ebc222e914b..f8b4d01e40cf 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py @@ -1,3 +1,7 @@ +from typing import Any + +from aiodocker import DockerError + from ...core.errors import DirectorError @@ -6,6 +10,10 @@ class DynamicSidecarError(DirectorError): class GenericDockerError(DynamicSidecarError): + def __init__(self, original_exception: DockerError, **ctx: Any) -> None: + super().__init__(original_exception=original_exception, **ctx) + self.original_exception = original_exception + msg_template: str = "Unexpected error using docker: {msg}" From 593719b269f93bb2160f8d1e8e0b0cd8016444c2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:38:48 +0100 Subject: [PATCH 21/27] use pytest match --- .../tests/unit/test_modules_dynamic_sidecar_scheduler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_scheduler.py b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_scheduler.py index 560794460def..f0a17c5e51cf 100644 --- a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_scheduler.py +++ b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_scheduler.py @@ -399,9 +399,10 @@ async def test_get_stack_status_missing( mocked_dynamic_scheduler_events: None, mock_docker_api: None, ) -> None: - with pytest.raises(DynamicSidecarNotFoundError) as execinfo: + with pytest.raises( + DynamicSidecarNotFoundError, match=rf"{scheduler_data.node_uuid} not found" + ): await scheduler.get_stack_status(scheduler_data.node_uuid) - assert f"{scheduler_data.node_uuid} not found" in str(execinfo) async def test_get_stack_status_failing_sidecar( From 180c5f0e72380d260eafaa508e8f0b4bfd9a0aed Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:42:58 +0100 Subject: [PATCH 22/27] linter happy 
--- .../src/models_library/rest_ordering.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/packages/models-library/src/models_library/rest_ordering.py b/packages/models-library/src/models_library/rest_ordering.py index b042950c3526..faac153400a2 100644 --- a/packages/models-library/src/models_library/rest_ordering.py +++ b/packages/models-library/src/models_library/rest_ordering.py @@ -16,14 +16,17 @@ class OrderDirection(str, Enum): class OrderBy(BaseModel): # Based on https://google.aip.dev/132#ordering - field: IDStr = Field(..., description="field name identifier") - direction: OrderDirection = Field( - default=OrderDirection.ASC, - description=( - f"As [A,B,C,...] if `{OrderDirection.ASC.value}`" - f" or [Z,Y,X, ...] if `{OrderDirection.DESC.value}`" + field: Annotated[IDStr, Field(..., description="field name identifier")] + direction: Annotated[ + OrderDirection, + Field( + default=OrderDirection.ASC, + description=( + f"As [A,B,C,...] if `{OrderDirection.ASC.value}`" + f" or [Z,Y,X, ...] if `{OrderDirection.DESC.value}`" + ), ), - ) + ] class _BaseOrderQueryParams(RequestParameters): From e18831123200e625fb07ee57dd41708d385624e1 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:21:04 +0100 Subject: [PATCH 23/27] fix test --- .../tests/unit/with_dbs/test_api_route_dynamic_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py b/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py index 5787fa119e1c..6ad77dbf6182 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py @@ -252,7 +252,7 @@ def mocked_director_v2_scheduler(mocker: MockerFixture, exp_status_code: int) -> # MOCKING get_stack_status def get_stack_status(node_uuid: NodeID) -> RunningDynamicServiceDetails: if exp_status_code == status.HTTP_307_TEMPORARY_REDIRECT: - raise DynamicSidecarNotFoundError(node_uuid) + raise DynamicSidecarNotFoundError(node_uuid=node_uuid) return RunningDynamicServiceDetails.model_validate( RunningDynamicServiceDetails.model_config["json_schema_extra"]["examples"][ From 102339756378ee752d3959c24eb18972760c7794 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:22:06 +0100 Subject: [PATCH 24/27] fixed test --- .../tests/unit/with_dbs/test_api_route_dynamic_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py b/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py index 6ad77dbf6182..2de98368d9ac 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py @@ -269,7 +269,7 @@ def get_stack_status(node_uuid: NodeID) -> RunningDynamicServiceDetails: # MOCKING remove_service def remove_service(node_uuid: NodeID, *ars: Any, **kwargs: Any) -> None: if exp_status_code == status.HTTP_307_TEMPORARY_REDIRECT: - raise DynamicSidecarNotFoundError(node_uuid) + raise DynamicSidecarNotFoundError(node_uuid=node_uuid) mocker.patch( f"{module_base}._task.DynamicSidecarsScheduler.mark_service_for_removal", From 883bf75335f356c1f5d8e3493895245f781877dd Mon Sep 17 00:00:00 2001 From: sanderegg 
<35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:26:07 +0100 Subject: [PATCH 25/27] fix test --- .../modules/dynamic_sidecar/docker_api/_utils.py | 2 +- .../modules/dynamic_sidecar/errors.py | 2 +- .../with_dbs/test_modules_dynamic_sidecar_docker_api.py | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_utils.py index 75f057e97564..f625c2ea6250 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_api/_utils.py @@ -13,7 +13,7 @@ async def docker_client() -> AsyncIterator[aiodocker.docker.Docker]: client = aiodocker.Docker() yield client except aiodocker.exceptions.DockerError as e: - raise GenericDockerError(msg=f"{e}", original_exception=e) from e + raise GenericDockerError(msg=f"{e.message}", original_exception=e) from e finally: if client is not None: await client.close() diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py index f8b4d01e40cf..3b0a400223bc 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/errors.py @@ -14,7 +14,7 @@ def __init__(self, original_exception: DockerError, **ctx: Any) -> None: super().__init__(original_exception=original_exception, **ctx) self.original_exception = original_exception - msg_template: str = "Unexpected error using docker: {msg}" + msg_template: str = "Unexpected error using docker client: {msg}" class DynamicSidecarNotFoundError(DirectorError): diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py b/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py index f36a8f8f7f64..24e708b70054 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py @@ -403,13 +403,12 @@ def test_settings__valid_network_names( async def test_failed_docker_client_request(docker_swarm: None): missing_network_name = "this_network_cannot_be_found" - with pytest.raises(GenericDockerError) as execinfo: + with pytest.raises( + GenericDockerError, + match=f"Unexpected error using docker client: network {missing_network_name} not found", + ): async with docker_client() as client: await client.networks.get(missing_network_name) - assert ( - str(execinfo.value) - == f"Unexpected error from docker client: network {missing_network_name} not found" - ) async def test_get_swarm_network_ok( From 7a549dea145de9bb5cbf918ef2ca39138d73b8bc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:30:10 +0100 Subject: [PATCH 26/27] use pytest match --- .../test_modules_dynamic_sidecar_docker_api.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py b/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py index 24e708b70054..77c327706fd5 100644 --- 
a/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_api.py @@ -427,16 +427,16 @@ async def test_get_swarm_network_missing_network( dynamic_services_scheduler_settings: DynamicServicesSchedulerSettings, docker_swarm: None, ): - with pytest.raises(DynamicSidecarError) as excinfo: + with pytest.raises( + DynamicSidecarError, + match=r"Unexpected dynamic sidecar error: " + r"Swarm network name \(searching for \'\*test_network_name\*\'\) is not configured." + r"Found following networks: \[\]", + ): await docker_api.get_swarm_network( dynamic_services_scheduler_settings.SIMCORE_SERVICES_NETWORK_NAME ) - assert str(excinfo.value) == ( - "Swarm network name (searching for '*test_network_name*') is not configured." - "Found following networks: []" - ) - async def test_recreate_network_multiple_times( network_config: dict[str, Any], From 9f68f9783df8c0c0154f5dab072d142af8f97f76 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:45:36 +0100 Subject: [PATCH 27/27] revert --- .../src/models_library/rest_ordering.py | 17 +++++++---------- .../models-library/tests/test_rest_ordering.py | 9 +++++---- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/packages/models-library/src/models_library/rest_ordering.py b/packages/models-library/src/models_library/rest_ordering.py index faac153400a2..b042950c3526 100644 --- a/packages/models-library/src/models_library/rest_ordering.py +++ b/packages/models-library/src/models_library/rest_ordering.py @@ -16,17 +16,14 @@ class OrderDirection(str, Enum): class OrderBy(BaseModel): # Based on https://google.aip.dev/132#ordering - field: Annotated[IDStr, Field(..., description="field name identifier")] - direction: Annotated[ - OrderDirection, - Field( - default=OrderDirection.ASC, - description=( - f"As [A,B,C,...] if `{OrderDirection.ASC.value}`" - f" or [Z,Y,X, ...] if `{OrderDirection.DESC.value}`" - ), + field: IDStr = Field(..., description="field name identifier") + direction: OrderDirection = Field( + default=OrderDirection.ASC, + description=( + f"As [A,B,C,...] if `{OrderDirection.ASC.value}`" + f" or [Z,Y,X, ...] if `{OrderDirection.DESC.value}`" ), - ] + ) class _BaseOrderQueryParams(RequestParameters): diff --git a/packages/models-library/tests/test_rest_ordering.py b/packages/models-library/tests/test_rest_ordering.py index a3f84bf0d2c9..4ceed67dea5f 100644 --- a/packages/models-library/tests/test_rest_ordering.py +++ b/packages/models-library/tests/test_rest_ordering.py @@ -120,10 +120,11 @@ def test_ordering_query_model_class__defaults(): # checks all defaults model = OrderQueryParamsModel() - assert model.order_by - assert isinstance(model.order_by, OrderBy) # nosec - assert model.order_by.field == "modified_at" # NOTE that this was mapped! - assert model.order_by.direction == OrderDirection.DESC + assert model.order_by is not None + assert ( + model.order_by.field == "modified_at" # pylint: disable=no-member + ) # NOTE that this was mapped! + assert model.order_by.direction is OrderDirection.DESC # pylint: disable=no-member # partial defaults model = OrderQueryParamsModel.model_validate({"order_by": {"field": "name"}})
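Note: across patches 20-27 the dynamic-sidecar errors are moved to template-based exceptions that take their context as keyword arguments (e.g. GenericDockerError(msg=..., original_exception=e), DynamicSidecarNotFoundError(node_uuid=...)), and the tests switch from inspecting ExceptionInfo to pytest.raises(..., match=...). The snippet below is a minimal, self-contained sketch of that pattern, not code from the repository: the _TemplatedError base class is a hypothetical stand-in for the project's actual error mixin, and the message/test values are illustrative only.

from typing import Any

import pytest


class _TemplatedError(Exception):
    # hypothetical stand-in for the project's mixin-based error base:
    # keyword context is stored and interpolated into msg_template
    msg_template: str = "unexpected error"

    def __init__(self, **ctx: Any) -> None:
        super().__init__(self.msg_template.format(**ctx))
        self.ctx = ctx


class GenericDockerError(_TemplatedError):
    msg_template: str = "Unexpected error using docker client: {msg}"

    def __init__(self, *, original_exception: Exception, **ctx: Any) -> None:
        super().__init__(original_exception=original_exception, **ctx)
        # keep the wrapped exception reachable, e.g. to check an HTTP status code later
        self.original_exception = original_exception


def test_message_is_rendered_from_template() -> None:
    # asserting on the message via `match` (a regex searched against str(exc))
    # replaces the manual checks on the captured ExceptionInfo
    with pytest.raises(
        GenericDockerError,
        match="network this_network_cannot_be_found not found",
    ):
        raise GenericDockerError(
            msg="network this_network_cannot_be_found not found",
            original_exception=RuntimeError("boom"),
        )

Under these assumptions, call sites only pass named context (no positional message), and the rendered message stays consistent with whatever msg_template declares, which is what lets the tests assert on it with a single match expression.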