
Commit f4e5d21 (parent: 5246d30)

time out before processing stuff so that failure happens right away
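The diff below moves `_timeout_if_waiting_for_cluster_too_long` earlier in `BaseCompScheduler.apply()`; the helper's body is not part of this commit. As a rough illustration only, the kind of deadline check it implies looks like this (hypothetical names throughout):

    # Hypothetical sketch only; the real helper lives in the comp_scheduler
    # module and its implementation is not shown in this commit.
    import datetime


    def waited_too_long(
        waiting_since: datetime.datetime, timeout: datetime.timedelta
    ) -> bool:
        # a task stuck in WAITING_FOR_CLUSTER past the deadline should be failed
        return datetime.datetime.now(datetime.timezone.utc) - waiting_since > timeout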

3 files changed: +59 -13 lines

services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py (+14 -10)
@@ -15,7 +15,6 @@
 import datetime
 import logging
 from abc import ABC, abstractmethod
-from asyncio import tasks
 from collections.abc import Callable
 from dataclasses import dataclass
 from typing import Final
@@ -272,9 +271,12 @@ async def _set_processing_done(
         )
 
     async def _set_states_following_failed_to_aborted(
-        self, project_id: ProjectID, dag: nx.DiGraph, run_id: PositiveInt
+        self,
+        project_id: ProjectID,
+        dag: nx.DiGraph,
+        tasks: dict[NodeIDStr, CompTaskAtDB],
+        run_id: PositiveInt,
     ) -> dict[NodeIDStr, CompTaskAtDB]:
-        tasks = await self._get_pipeline_tasks(project_id, dag)
         # Perform a reverse topological sort to ensure tasks are ordered from last to first
         sorted_node_ids = list(reversed(list(nx.topological_sort(dag))))
         tasks = {
@@ -634,15 +636,20 @@ async def apply(
             user_id, project_id, iteration
         )
         dag = await self._get_pipeline_dag(project_id)
+        comp_tasks = await self._get_pipeline_tasks(project_id, dag)
         # 1. Update our list of tasks with data from backend (state, results)
         await self._update_states_from_comp_backend(
             user_id, project_id, iteration, dag, comp_run
         )
-        # 2. Any task following a FAILED task shall be ABORTED
+        # 2. time out tasks that have been waiting for a cluster for more than X minutes
+        comp_tasks = await self._timeout_if_waiting_for_cluster_too_long(
+            user_id, project_id, comp_run, comp_tasks
+        )
+        # 3. Any task following a FAILED task shall be ABORTED
         comp_tasks = await self._set_states_following_failed_to_aborted(
-            project_id, dag, comp_run.run_id
+            project_id, dag, comp_tasks, comp_run.run_id
         )
-        # 3. do we want to stop the pipeline now?
+        # 4. do we want to stop the pipeline now?
         if comp_run.cancelled:
             comp_tasks = await self._schedule_tasks_to_stop(
                 user_id, project_id, comp_tasks, comp_run
@@ -664,10 +671,7 @@ async def apply(
                 iteration=iteration,
             ),
         )
-        # 4. timeout if waiting for cluster has been there for more than X minutes
-        comp_tasks = await self._timeout_if_waiting_for_cluster_too_long(
-            user_id, project_id, comp_run, comp_tasks
-        )
+
         # 5. send a heartbeat
         await self._send_running_tasks_heartbeat(
             user_id, project_id, comp_run.run_id, iteration, dag
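
The functional effect of the reordering can be seen in isolation: when the timeout check runs before `_set_states_following_failed_to_aborted`, a task that timed out waiting for a cluster is FAILED and its dependents ABORTED within the same scheduling pass instead of one pass later. A minimal, self-contained sketch (toy helpers and state strings, not the service's code; assumes only networkx, which the scheduler itself already uses):

    import networkx as nx


    def timeout_waiting_tasks(states: dict[str, str]) -> None:
        # toy stand-in for _timeout_if_waiting_for_cluster_too_long
        for node, state in states.items():
            if state == "WAITING_FOR_CLUSTER":
                states[node] = "FAILED"


    def abort_following_failed(dag: nx.DiGraph, states: dict[str, str]) -> None:
        # toy stand-in for _set_states_following_failed_to_aborted
        for node in nx.topological_sort(dag):
            if any(states[p] in {"FAILED", "ABORTED"} for p in dag.predecessors(node)):
                states[node] = "ABORTED"


    dag = nx.DiGraph([("a", "b")])  # b depends on a
    states = {"a": "WAITING_FOR_CLUSTER", "b": "PUBLISHED"}

    timeout_waiting_tasks(states)        # new step 2
    abort_following_failed(dag, states)  # new step 3 sees the failure immediately
    assert states == {"a": "FAILED", "b": "ABORTED"}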

services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py (+13 -0)
@@ -8,6 +8,7 @@
 # pylint: disable=too-many-statements
 
 
+import datetime
 from unittest import mock
 
 import pytest
@@ -69,3 +70,15 @@ def with_disabled_scheduler_publisher(mocker: MockerFixture) -> mock.Mock:
         "simcore_service_director_v2.modules.comp_scheduler._manager.request_pipeline_scheduling",
         autospec=True,
     )
+
+
+@pytest.fixture
+def with_short_max_wait_for_clusters_keeper(
+    monkeypatch: pytest.MonkeyPatch, mocker: MockerFixture
+) -> datetime.timedelta:
+    short_time = datetime.timedelta(seconds=5)
+    setenvs_from_dict(
+        monkeypatch,
+        {"COMPUTATIONAL_BACKEND_MAX_WAITING_FOR_CLUSTER_TIMEOUT": f"{short_time}"},
+    )
+    return short_time
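
The fixture serializes the timedelta via f-string formatting, so the env var receives Python's `str(timedelta)` form ("0:00:05"). A minimal sketch of how such a value parses back, using a plain pydantic model as a hypothetical stand-in for the director-v2 settings class that reads this variable:

    import datetime

    from pydantic import BaseModel


    class _FakeSettings(BaseModel):  # hypothetical stand-in, not the service's class
        COMPUTATIONAL_BACKEND_MAX_WAITING_FOR_CLUSTER_TIMEOUT: datetime.timedelta


    short_time = datetime.timedelta(seconds=5)
    assert f"{short_time}" == "0:00:05"

    settings = _FakeSettings(
        COMPUTATIONAL_BACKEND_MAX_WAITING_FOR_CLUSTER_TIMEOUT=f"{short_time}"
    )
    assert settings.COMPUTATIONAL_BACKEND_MAX_WAITING_FOR_CLUSTER_TIMEOUT == short_time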

services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py (+32 -3)
@@ -2048,9 +2048,10 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits(
     "get_or_create_exception",
     [ClustersKeeperNotAvailableError],
 )
-async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_waits(
+async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_waits_and_eventually_timesout_fails(
     with_disabled_auto_scheduling: mock.Mock,
     with_disabled_scheduler_publisher: mock.Mock,
+    with_short_max_wait_for_clusters_keeper: datetime.timedelta,
     initialized_app: FastAPI,
     scheduler_api: BaseCompScheduler,
     sqlalchemy_async_engine: AsyncEngine,
@@ -2061,8 +2062,6 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_waits(
     computational_pipeline_rabbit_client_parser: mock.AsyncMock,
     fake_collection_run_id: CollectionRunID,
 ):
-    # needs to change: https://github.com/ITISFoundation/osparc-simcore/issues/6817
-
     mocked_get_or_create_cluster.side_effect = get_or_create_exception
     # running the pipeline will trigger a call to the clusters-keeper
     assert published_project.project.prj_owner
@@ -2166,6 +2165,36 @@
         expected_progress=None,
         run_id=run_in_db.run_id,
     )
+    await asyncio.sleep(with_short_max_wait_for_clusters_keeper.total_seconds() + 1)
+    # applying again no longer calls the clusters-keeper; the wait has timed out,
+    # so the task is now marked as FAILED
+    await scheduler_api.apply(
+        user_id=run_in_db.user_id,
+        project_id=run_in_db.project_uuid,
+        iteration=run_in_db.iteration,
+    )
+    mocked_get_or_create_cluster.assert_not_called()
+    await assert_comp_runs(
+        sqlalchemy_async_engine,
+        expected_total=1,
+        expected_state=RunningState.FAILED,
+        where_statement=and_(
+            comp_runs.c.user_id == published_project.project.prj_owner,
+            comp_runs.c.project_uuid == f"{published_project.project.uuid}",
+        ),
+    )
+    await _assert_message_received(
+        computational_pipeline_rabbit_client_parser,
+        1,
+        ComputationalPipelineStatusMessage.model_validate_json,
+    )
+    await assert_comp_tasks_and_comp_run_snapshot_tasks(
+        sqlalchemy_async_engine,
+        project_uuid=published_project.project.uuid,
+        task_ids=[t.node_id for t in expected_waiting_for_cluster_tasks],
+        expected_state=RunningState.FAILED,
+        expected_progress=1.0,
+        run_id=run_in_db.run_id,
+    )
 
 
 async def test_run_new_pipeline_called_twice_prevents_duplicate_runs(
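
To exercise just this scenario locally, something like the following should work (the invocation is an assumption based on the repository's standard pytest layout; the with_dbs suite additionally needs its database fixtures available):

    pytest services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py \
      -k test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_waits_and_eventually_timesout_fails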
