Skip to content

Commit 4a20213

Browse files
committed
add some fixes
1 parent 8240850 commit 4a20213

File tree

1 file changed

+15
-13
lines changed
  • services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler

1 file changed

+15
-13
lines changed

services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py

Lines changed: 15 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
import datetime
1616
import logging
1717
from abc import ABC, abstractmethod
18+
from asyncio import tasks
1819
from collections.abc import Callable
1920
from dataclasses import dataclass
2021
from typing import Final
@@ -76,9 +77,7 @@
7677

7778
_Previous = CompTaskAtDB
7879
_Current = CompTaskAtDB
79-
_MAX_WAITING_FOR_CLUSTER_TIMEOUT: Final[datetime.timedelta] = datetime.timedelta(
80-
minutes=10
81-
)
80+
8281
_MAX_WAITING_TIME_FOR_UNKNOWN_TASKS: Final[datetime.timedelta] = datetime.timedelta(
8382
seconds=30
8483
)
@@ -667,7 +666,7 @@ async def apply(
667666
)
668667
# 4. time out the tasks that have been waiting for the cluster for more than X minutes
669668
comp_tasks = await self._timeout_if_waiting_for_cluster_too_long(
670-
user_id, project_id, comp_run.run_id, comp_tasks
669+
user_id, project_id, comp_run, comp_tasks
671670
)
672671
# 5. send a heartbeat
673672
await self._send_running_tasks_heartbeat(
@@ -902,31 +901,34 @@ async def _timeout_if_waiting_for_cluster_too_long(
902901
self,
903902
user_id: UserID,
904903
project_id: ProjectID,
905-
run_id: PositiveInt,
904+
comp_run: CompRunsAtDB,
906905
comp_tasks: dict[NodeIDStr, CompTaskAtDB],
907906
) -> dict[NodeIDStr, CompTaskAtDB]:
908-
if all(
909-
c.state is RunningState.WAITING_FOR_CLUSTER for c in comp_tasks.values()
910-
):
907+
if comp_run.result is RunningState.WAITING_FOR_CLUSTER:
908+
tasks_waiting_for_cluster = [
909+
t
910+
for t in comp_tasks.values()
911+
if t.state is RunningState.WAITING_FOR_CLUSTER
912+
]
911913
# get latest modified task
912914
latest_modified_of_all_tasks = max(
913-
comp_tasks.values(), key=lambda task: task.modified
915+
tasks_waiting_for_cluster, key=lambda task: task.modified
914916
).modified
915917

916918
if (
917919
arrow.utcnow().datetime - latest_modified_of_all_tasks
918-
) > _MAX_WAITING_FOR_CLUSTER_TIMEOUT:
920+
) > self.settings.COMPUTATIONAL_BACKEND_MAX_WAITING_FOR_CLUSTER_TIMEOUT:
919921
await CompTasksRepository.instance(
920922
self.db_engine
921923
).update_project_tasks_state(
922924
project_id,
923-
run_id,
924-
[NodeID(idstr) for idstr in comp_tasks],
925+
comp_run.run_id,
926+
[task.node_id for task in tasks_waiting_for_cluster],
925927
RunningState.FAILED,
926928
optional_progress=1.0,
927929
optional_stopped=arrow.utcnow().datetime,
928930
)
929-
for task in comp_tasks.values():
931+
for task in tasks_waiting_for_cluster:
930932
task.state = RunningState.FAILED
931933
msg = "Timed-out waiting for computational cluster! Please try again and/or contact Osparc support."
932934
_logger.error(msg)

0 commit comments

Comments
 (0)