
Commit cbb4adf

trigger cluster termination is now able to force close

1 parent eb7eac5 commit cbb4adf

File tree

2 files changed: +85 −41 lines changed

  • scripts/maintenance/computational-clusters/autoscaled_monitor/cli.py
  • scripts/maintenance/computational-clusters/autoscaled_monitor/core.py

scripts/maintenance/computational-clusters/autoscaled_monitor/cli.py

Lines changed: 2 additions & 1 deletion
@@ -177,6 +177,7 @@ def cancel_jobs(
 def trigger_cluster_termination(
     user_id: Annotated[int, typer.Option(help="the user ID")],
     wallet_id: Annotated[int, typer.Option(help="the wallet ID")],
+    force: Annotated[bool, typer.Option(help="will not ask for confirmation")] = False,
 ) -> None:
     """this will set the Heartbeat tag on the primary machine to 1 hour, thus ensuring the
     clusters-keeper will properly terminate that cluster.
@@ -185,7 +186,7 @@ def trigger_cluster_termination(
     user_id -- the user ID
     wallet_id -- the wallet ID
     """
-    asyncio.run(api.trigger_cluster_termination(state, user_id, wallet_id))
+    asyncio.run(api.trigger_cluster_termination(state, user_id, wallet_id, force=force))
 
 
 @app.command()
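
With the new option in place, Typer renders the bool as a --force/--no-force switch, so passing --force lets the command skip the confirmation prompt. A minimal, self-contained sketch of the pattern (a toy script, not the monitor's actual wiring; only the option declarations are taken from the diff above):

from typing import Annotated

import typer

app = typer.Typer()


@app.command()
def trigger_cluster_termination(
    user_id: Annotated[int, typer.Option(help="the user ID")],
    wallet_id: Annotated[int, typer.Option(help="the wallet ID")],
    force: Annotated[bool, typer.Option(help="will not ask for confirmation")] = False,
) -> None:
    # a bool option defaulting to False becomes --force/--no-force on the CLI
    typer.echo(f"{user_id=} {wallet_id=} {force=}")


if __name__ == "__main__":
    app()

Invoked as, e.g., python monitor.py --user-id 123 --wallet-id 456 --force (script name and IDs illustrative).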

scripts/maintenance/computational-clusters/autoscaled_monitor/core.py

Lines changed: 83 additions & 40 deletions
@@ -446,6 +446,7 @@ def _print_summary_as_json(
             "user_id": cluster.primary.user_id,
             "wallet_id": cluster.primary.wallet_id,
             "disk_space": cluster.primary.disk_space.human_readable(),
+            "last_heartbeat": cluster.primary.last_heartbeat.isoformat(),
         },
         "workers": [
             {
@@ -561,6 +562,63 @@ async def _list_computational_clusters(
     )
 
 
+async def _cancel_all_jobs(
+    state: AppState,
+    the_cluster: ComputationalCluster,
+    *,
+    task_to_dask_job: list[tuple[ComputationalTask | None, DaskTask | None]],
+    abort_in_db: bool,
+) -> None:
+    rich.print("cancelling all tasks")
+    for comp_task, dask_task in task_to_dask_job:
+        if dask_task is not None and dask_task.state != "unknown":
+            await dask.trigger_job_cancellation_in_scheduler(
+                state,
+                the_cluster,
+                dask_task.job_id,
+            )
+            if comp_task is None:
+                # we need to clear it of the cluster
+                await dask.remove_job_from_scheduler(
+                    state,
+                    the_cluster,
+                    dask_task.job_id,
+                )
+        if comp_task is not None and abort_in_db:
+            await db.abort_job_in_db(state, comp_task.project_id, comp_task.node_id)
+
+    rich.print("cancelled all tasks")
+
+
+async def _get_job_id_to_dask_state_from_cluster(
+    cluster: ComputationalCluster,
+) -> dict[TaskId, TaskState]:
+    job_id_to_dask_state: dict[TaskId, TaskState] = {}
+    for job_state, job_ids in cluster.task_states_to_tasks.items():
+        for job_id in job_ids:
+            job_id_to_dask_state[job_id] = job_state
+    return job_id_to_dask_state
+
+
+async def _get_db_task_to_dask_job(
+    computational_tasks: list[ComputationalTask],
+    job_id_to_dask_state: dict[TaskId, TaskState],
+) -> list[tuple[ComputationalTask | None, DaskTask | None]]:
+    task_to_dask_job: list[tuple[ComputationalTask | None, DaskTask | None]] = []
+    for task in computational_tasks:
+        dask_task = None
+        if task.job_id:
+            dask_task = DaskTask(
+                job_id=task.job_id,
+                state=job_id_to_dask_state.pop(task.job_id, None) or "unknown",
+            )
+        task_to_dask_job.append((task, dask_task))
+    # keep the jobs still in the cluster
+    for job_id, dask_state in job_id_to_dask_state.items():
+        task_to_dask_job.append((None, DaskTask(job_id=job_id, state=dask_state)))
+    return task_to_dask_job
+
+
 async def cancel_jobs(  # noqa: C901, PLR0912
     state: AppState, user_id: int, wallet_id: int | None, *, force: bool
 ) -> None:
@@ -571,7 +629,7 @@ async def cancel_jobs(  # noqa: C901, PLR0912
     computational_clusters = await _list_computational_clusters(
         state, user_id, wallet_id
     )
-    job_id_to_dask_state: dict[TaskId, TaskState] = {}
+
     if computational_clusters:
         assert (
             len(computational_clusters) == 1
@@ -580,22 +638,10 @@ async def cancel_jobs(  # noqa: C901, PLR0912
         the_cluster = computational_clusters[0]
         rich.print(f"{the_cluster.task_states_to_tasks=}")
 
-        for job_state, job_ids in the_cluster.task_states_to_tasks.items():
-            for job_id in job_ids:
-                job_id_to_dask_state[job_id] = job_state
-
-        task_to_dask_job: list[tuple[ComputationalTask | None, DaskTask | None]] = []
-        for task in computational_tasks:
-            dask_task = None
-            if task.job_id:
-                dask_task = DaskTask(
-                    job_id=task.job_id,
-                    state=job_id_to_dask_state.pop(task.job_id, None) or "unknown",
-                )
-            task_to_dask_job.append((task, dask_task))
-        # keep the jobs still in the cluster
-        for job_id, dask_state in job_id_to_dask_state.items():
-            task_to_dask_job.append((None, DaskTask(job_id=job_id, state=dask_state)))
+        job_id_to_dask_state = await _get_job_id_to_dask_state_from_cluster(the_cluster)
+        task_to_dask_job: list[tuple[ComputationalTask | None, DaskTask | None]] = (
+            await _get_db_task_to_dask_job(computational_tasks, job_id_to_dask_state)
+        )
 
     if not task_to_dask_job:
         rich.print("[red]nothing found![/red]")
@@ -611,27 +657,12 @@ async def cancel_jobs(  # noqa: C901, PLR0912
         if response == "none":
             rich.print("[yellow]not cancelling anything[/yellow]")
         elif response == "all":
-            rich.print("cancelling all tasks")
-            for comp_task, dask_task in task_to_dask_job:
-                if dask_task is not None and dask_task.state != "unknown":
-                    await dask.trigger_job_cancellation_in_scheduler(
-                        state,
-                        the_cluster,
-                        dask_task.job_id,
-                    )
-                    if comp_task is None:
-                        # we need to clear it of the cluster
-                        await dask.remove_job_from_scheduler(
-                            state,
-                            the_cluster,
-                            dask_task.job_id,
-                        )
-                if comp_task is not None and force:
-                    await db.abort_job_in_db(
-                        state, comp_task.project_id, comp_task.node_id
-                    )
-
-            rich.print("cancelled all tasks")
+            await _cancel_all_jobs(
+                state,
+                the_cluster,
+                task_to_dask_job=task_to_dask_job,
+                abort_in_db=force,
+            )
         else:
             try:
                 # Split the response and handle ranges
@@ -673,7 +704,7 @@ async def cancel_jobs(  # noqa: C901, PLR0912
 
 
 async def trigger_cluster_termination(
-    state: AppState, user_id: int, wallet_id: int
+    state: AppState, user_id: int, wallet_id: int, *, force: bool
 ) -> None:
     assert state.ec2_resource_clusters_keeper
     computational_instances = await ec2.list_computational_instances_from_ec2(
@@ -692,8 +723,20 @@ async def trigger_cluster_termination(
         state.environment,
         state.ec2_resource_clusters_keeper.meta.client.meta.region_name,
     )
-    if typer.confirm("Are you sure you want to trigger termination of that cluster?"):
+    if (force is True) or typer.confirm(
+        "Are you sure you want to trigger termination of that cluster?"
+    ):
         the_cluster = computational_clusters[0]
+
+        computational_tasks = await db.list_computational_tasks_from_db(state, user_id)
+        job_id_to_dask_state = await _get_job_id_to_dask_state_from_cluster(the_cluster)
+        task_to_dask_job: list[tuple[ComputationalTask | None, DaskTask | None]] = (
+            await _get_db_task_to_dask_job(computational_tasks, job_id_to_dask_state)
+        )
+        await _cancel_all_jobs(
+            state, the_cluster, task_to_dask_job=task_to_dask_job, abort_in_db=force
+        )
+
         new_heartbeat_tag: TagTypeDef = {
             "Key": "last_heartbeat",
             "Value": f"{arrow.utcnow().datetime - datetime.timedelta(hours=1)}",

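The hunk stops at the new_heartbeat_tag construction: stamping last_heartbeat one hour in the past makes the primary look stale, so the clusters-keeper terminates the cluster through its normal sweep rather than an ad-hoc kill. A rough sketch of that backdating trick (synchronous boto3 for brevity; the script itself drives EC2 through its AppState resources, and backdate_heartbeat with its parameters is hypothetical):

import datetime

import arrow
import boto3


def backdate_heartbeat(instance_id: str, region_name: str) -> None:
    # a last_heartbeat timestamp one hour old marks the cluster as stale,
    # which is the condition the clusters-keeper acts on
    stale = f"{arrow.utcnow().datetime - datetime.timedelta(hours=1)}"
    boto3.resource("ec2", region_name=region_name).Instance(instance_id).create_tags(
        Tags=[{"Key": "last_heartbeat", "Value": stale}]
    )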