
Commit e99df50

Make slurm job canceling more robust (#1317)
* Make slurm job canceling more robust, by killing all remaining jobs after a short wait time
* [Debug] Log times for scancel handling
* Fix signal handling for cluster executors which stopped working if multiple executors were instantiated in the same process before.
* First stop the file wait thread before canceling the jobs to avoid checking the job state after canceling which will log lots of errors for jobs that were canceled before running, because they don't turn up in the slurm accounting
* Cleanup
* Properly mock (and restore) env variables
* Update changelog
* Add f-string
* Exempt completing jobs when checking whether cancellation worked fast enough
* Test whether monkeypatching works
* add time logs to find out where job cancellation time is spent
* Fix setting sigterm_wait_in_s to 0 during tests
* Improve slurm cancellation test to assert that original sigint handler was called
* Format
* Fix typing
* Fix nonlocal variable access
* Add test for signal handling regression when multiple executors are instantiated
* Garbage collect after first executor ran to provoke regression
* Apply some PR feedback
* Delete executor1 in test to provoke bug
* Add pytest-timeout to avoid hanging tests and wait for futures in test
* Cleanup and fix hanging tests
* Add comment
* Remove pytest-timeout dependency again
* Restore uv.lock
* When shutting down cluster executor and wait if False, treat as if executor was killed
* Decrease SIGTERM_WAIT_IN_S for new test and assert that shutdown hooks are cleaned up
* Format
* Remove dask executor from cluster tools
* Linting
* Update changelog
* Actually deregister shutdown hook and use with statements to ensure executor shutdown
* Also update webknossos uv.lock
* Add pytest-timeout
* Unify the two variables tracking executor shutdown
* Fix kubernetes dependency
* Assert that no jobs run before the tests
* Add debug logging
* Fix signal handling test
* Revert "Update changelog" (reverts commit 31c472a)
* Revert "Remove dask executor from cluster tools" (reverts commit 9866368)
* Revert "Linting" (reverts commit 4ee81c0)
1 parent a887e42 commit e99df50
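
At its core, the commit replaces the per-instance SIGINT handler with a single class-level handler that runs a list of registered shutdown hooks and then chains to whatever handler was installed before it. Below is a minimal, self-contained sketch of that pattern; the `ShutdownHookRegistry` name and the demo hook are illustrative and not part of the diff.

```python
import signal
from collections.abc import Callable
from functools import partial
from types import FrameType


class ShutdownHookRegistry:
    """Toy illustration of the commit's approach; not the actual ClusterExecutor API."""

    # Class-level state: hooks are shared by all instances and the signal
    # handler is installed only once per process.
    _hooks: list[Callable[[], None]] = []
    _installed: bool = False

    @classmethod
    def register(cls, hook: Callable[[], None]) -> None:
        cls._hooks.append(hook)
        cls._ensure_handler_installed()

    @classmethod
    def _ensure_handler_installed(cls) -> None:
        if cls._installed:
            return
        # Remember whatever handler was active before and chain to it afterwards,
        # so the host application still shuts down the way it expects.
        previous = signal.getsignal(signal.SIGINT)
        signal.signal(signal.SIGINT, partial(cls._handle, previous))
        cls._installed = True

    @classmethod
    def _handle(cls, previous: object, signum: int, frame: FrameType | None) -> None:
        # Run all registered cleanup hooks first.
        for hook in cls._hooks:
            hook()
        # Only chain to a real callable, not to the SIG_DFL/SIG_IGN markers.
        if callable(previous) and previous not in (signal.SIG_DFL, signal.SIG_IGN):
            previous(signum, frame)


if __name__ == "__main__":
    ShutdownHookRegistry.register(lambda: print("cancel remaining cluster jobs"))
    try:
        # Deliver SIGINT to ourselves: the hook runs first, then the previous
        # (default) handler raises KeyboardInterrupt.
        signal.raise_signal(signal.SIGINT)
    except KeyboardInterrupt:
        print("previous SIGINT handler was chained as expected")
```

Keeping the hook list and the installed flag on the class rather than on each instance is what prevents a second executor from clobbering the first handler, which is the regression described in the changelog below.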

File tree: 9 files changed, +615 −459 lines changed


cluster_tools/Changelog.md

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,8 @@ For upgrade instructions, please check the respective *Breaking Changes* section
 ### Changed

 ### Fixed
+- Fixed that sometimes not all slurm jobs were canceled when an executor was killed. [#1317](https://github.com/scalableminds/webknossos-libs/pull/1317)
+- Fixed that when multiple cluster executors were instantiated in the same process, the original SIGINT handler sometimes was no longer called, leading to the main application not shutting down correctly after a SIGINT signal. [#1317](https://github.com/scalableminds/webknossos-libs/pull/1317)


 ## [2.4.4](https://github.com/scalableminds/webknossos-libs/releases/tag/v2.4.4) - 2025-07-14

cluster_tools/cluster_tools/schedulers/cluster_executor.py

Lines changed: 92 additions & 36 deletions
@@ -9,13 +9,13 @@
 from concurrent import futures
 from concurrent.futures import Future
 from functools import partial
+from types import FrameType, TracebackType
 from typing import (
     Any,
     Literal,
     TypeVar,
     cast,
 )
-from weakref import ReferenceType, ref

 from typing_extensions import ParamSpec

@@ -38,18 +38,6 @@
 _S = TypeVar("_S")


-def _handle_kill_through_weakref(
-    executor_ref: "ReferenceType[ClusterExecutor]",
-    existing_sigint_handler: Any,
-    signum: int | None,
-    frame: Any,
-) -> None:
-    executor = executor_ref()
-    if executor is None:
-        return
-    executor.handle_kill(existing_sigint_handler, signum, frame)
-
-
 def join_messages(strings: list[str]) -> str:
     return " ".join(x.strip() for x in strings if x.strip())

@@ -79,6 +67,9 @@ class RemoteTimeLimitException(RemoteResourceLimitException):
 class ClusterExecutor(futures.Executor):
     """Futures executor for executing jobs on a cluster."""

+    _shutdown_hooks: list[Callable[[], None]] = []
+    _installed_signal_handler: bool = False
+
     def __init__(
         self,
         debug: bool = False,
@@ -103,7 +94,6 @@ def __init__(
         self.job_resources = job_resources
         self.additional_setup_lines = additional_setup_lines or []
         self.job_name = job_name
-        self.was_requested_to_shutdown = False
         self.cfut_dir = (
             cfut_dir if cfut_dir is not None else os.getenv("CFUT_DIR", ".cfut")
         )
@@ -130,15 +120,7 @@ def __init__(

         os.makedirs(self.cfut_dir, exist_ok=True)

-        # Clean up if a SIGINT signal is received. However, do not interfere with the
-        # existing signal handler of the process or the
-        # shutdown of the main process which sends SIGTERM signals to terminate all
-        # child processes.
-        existing_sigint_handler = signal.getsignal(signal.SIGINT)
-        signal.signal(
-            signal.SIGINT,
-            partial(_handle_kill_through_weakref, ref(self), existing_sigint_handler),
-        )
+        self._register_shutdown_hook(self.handle_kill)

         self.metadata = {}
         assert not ("logging_config" in kwargs and "logging_setup_fn" in kwargs), (
@@ -158,26 +140,81 @@ def as_completed(cls, futs: list[Future[_T]]) -> Iterator[Future[_T]]:
     def executor_key(cls) -> str:
         pass

-    def handle_kill(
-        self, existing_sigint_handler: Any, signum: int | None, frame: Any
+    @classmethod
+    def _ensure_signal_handlers_are_installed(cls) -> None:
+        # Only overwrite the signal handler once
+        if cls._installed_signal_handler:
+            return
+
+        # Clean up if a SIGINT or SIGTERM signal is received. However, do not
+        # interfere with the existing signal handler of the process and execute
+        # it afterwards.
+        existing_sigint_handler = signal.getsignal(signal.SIGINT)
+        signal.signal(
+            signal.SIGINT,
+            partial(cls._handle_shutdown, existing_sigint_handler),
+        )
+        existing_sigterm_handler = signal.getsignal(signal.SIGTERM)
+        signal.signal(
+            signal.SIGTERM,
+            partial(cls._handle_shutdown, existing_sigterm_handler),
+        )
+
+        cls._installed_signal_handler = True
+
+    @classmethod
+    def _register_shutdown_hook(cls, hook: Callable[[], None]) -> None:
+        cls._shutdown_hooks.append(hook)
+        cls._ensure_signal_handlers_are_installed()
+
+    @classmethod
+    def _deregister_shutdown_hook(cls, hook: Callable[[], None]) -> None:
+        if hook in cls._shutdown_hooks:
+            cls._shutdown_hooks.remove(hook)
+        else:
+            logging.warning(
+                "Cannot deregister executors shutdown hook since it's not registered."
+            )
+
+    @classmethod
+    def _handle_shutdown(
+        cls,
+        existing_signal_handler: Callable[[int, FrameType | None], None] | int | None,
+        signum: int,
+        frame: Any,
     ) -> None:
+        logging.critical(
+            f"[{cls.__name__}] Caught signal {signal.Signals(signum).name}, running shutdown hooks"
+        )
+        try:
+            for hook in cls._shutdown_hooks:
+                hook()
+        except Exception as e:
+            print(f"Error during shutdown: {e}")
+
+        if (
+            callable(existing_signal_handler)
+            and existing_signal_handler
+            not in (
+                signal.SIG_DFL,  # For completeness sake (since it's not callable anyways). The system's default signal handler
+                signal.SIG_IGN,  # For completeness sake (since it's not callable anyways). The instruction to ignore a signal
+                signal.default_int_handler,  # Python's default SIGINT handler
+            )
+        ):
+            existing_signal_handler(signum, frame)
+
+    def handle_kill(self) -> None:
         if self.is_shutting_down:
             return

         self.is_shutting_down = True

-        self.inner_handle_kill(signum, frame)
         self.wait_thread.stop()
+        self.inner_handle_kill()
         self.clean_up()

-        if (
-            existing_sigint_handler != signal.default_int_handler
-            and callable(existing_sigint_handler)  # Could also be signal.SIG_IGN
-        ):
-            existing_sigint_handler(signum, frame)
-
     @abstractmethod
-    def inner_handle_kill(self, _signum: Any, _frame: Any) -> None:
+    def inner_handle_kill(self) -> None:
         pass

     @abstractmethod
@@ -363,9 +400,9 @@ def _completion(self, jobid: str, failed_early: bool) -> None:
         self._maybe_mark_logs_for_cleanup(jobid)

     def ensure_not_shutdown(self) -> None:
-        if self.was_requested_to_shutdown:
+        if self.is_shutting_down:
             raise RuntimeError(
-                "submit() was invoked on a ClusterExecutor instance even though shutdown() was executed for that instance."
+                "submit() was invoked on a ClusterExecutor instance even though shutdown() or handle_kill() was executed for that instance."
             )

     def create_enriched_future(self) -> Future:
@@ -591,17 +628,35 @@ def register_jobs(
             should_keep_output,
         )

+    # Overwrite the context manager __exit__ as it doesn't forward the information whether an exception was thrown or not otherwise
+    # which may lead to a deadlock if an exception is thrown within a cluster executor with statement, because self.jobs_empty_cond.wait()
+    # never succeeds.
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        _exc_val: BaseException | None,
+        _exc_tb: TracebackType | None,
+    ) -> Literal[False]:
+        # Don't wait if an exception was thrown
+        self.shutdown(wait=exc_type is None)
+        return False
+
     def shutdown(self, wait: bool = True, cancel_futures: bool = True) -> None:
         """Close the pool."""
+        if self.is_shutting_down:
+            return
+
+        self.is_shutting_down = True
         if not cancel_futures:
             logging.warning(
                 "The provided cancel_futures argument is ignored by ClusterExecutor."
             )
-        self.was_requested_to_shutdown = True
         if wait:
             with self.jobs_lock:
                 if self.jobs and self.wait_thread.is_alive():
                     self.jobs_empty_cond.wait()
+        else:
+            self.inner_handle_kill()

         self.wait_thread.stop()
         self.wait_thread.join()
@@ -617,6 +672,7 @@ def clean_up(self) -> None:
                     f"Could not delete file during clean up. Path: {file_to_clean_up} Exception: {exc}. Continuing..."
                 )
         self.files_to_clean_up = []
+        self._deregister_shutdown_hook(self.handle_kill)

     def map(  # type: ignore[override]
         self,
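
The `__exit__` override added above matters because the stock `futures.Executor.__exit__` always calls `shutdown(wait=True)`; if an exception escapes the `with` block while jobs are still registered, waiting on `jobs_empty_cond` can block forever. A toy sketch of the intended behavior, using a stand-in class rather than the real `ClusterExecutor`:

```python
from types import TracebackType
from typing import Literal


class GracefulExecutor:
    """Toy stand-in illustrating the __exit__/shutdown interplay; not the real executor."""

    def __init__(self) -> None:
        self.is_shutting_down = False

    def __enter__(self) -> "GracefulExecutor":
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        _exc_val: BaseException | None,
        _exc_tb: TracebackType | None,
    ) -> Literal[False]:
        # If the with-block raised, do not wait for outstanding jobs;
        # cancel them instead, mirroring shutdown(wait=False).
        self.shutdown(wait=exc_type is None)
        return False  # never swallow the exception

    def shutdown(self, wait: bool = True) -> None:
        if self.is_shutting_down:
            return
        self.is_shutting_down = True
        if wait:
            print("waiting for remaining jobs to finish")
        else:
            print("canceling remaining jobs")


try:
    with GracefulExecutor():
        raise RuntimeError("job submission failed")
except RuntimeError:
    pass  # prints "canceling remaining jobs" instead of blocking
```

Returning `False` keeps the original exception propagating; only the waiting behavior changes.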

cluster_tools/cluster_tools/schedulers/kube.py

Lines changed: 1 addition & 5 deletions
@@ -113,11 +113,7 @@ def get_job_id_string(cls) -> str:
             return job_id
         return cls.get_jobid_with_index(job_id, job_index)

-    def inner_handle_kill(
-        self,
-        *args: Any,  # noqa: ARG002 Unused method argument: `args`
-        **kwargs: Any,  # noqa: ARG002 Unused method argument: `kwargs`
-    ) -> None:
+    def inner_handle_kill(self) -> None:
         job_ids = ",".join(str(job_id) for job_id in self.jobs.keys())

         print(

cluster_tools/cluster_tools/schedulers/pbs.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 import os
 import re
 from concurrent.futures import Future
-from typing import Any, Literal
+from typing import Literal

 from cluster_tools._utils.call import call, chcall
 from cluster_tools._utils.string_ import random_string

@@ -56,7 +56,7 @@ def format_log_file_name(job_id_with_index: str, suffix: str = ".stdout") -> str
     def get_job_id_string(cls) -> str:
         return cls.get_current_job_id()

-    def inner_handle_kill(self, *args: Any, **kwargs: Any) -> None:  # noqa: ARG002 Unused method argument: `args`, kwargs
+    def inner_handle_kill(self) -> None:
         scheduled_job_ids: list[int | str] = list(self.jobs.keys())

         if len(scheduled_job_ids):

cluster_tools/cluster_tools/schedulers/slurm.py

Lines changed: 7 additions & 2 deletions
@@ -244,7 +244,7 @@ def submit_text(cls, job: str, cfut_dir: str) -> str:

         return str(int(job_id))  # int() ensures coherent parsing

-    def inner_handle_kill(self, *args: Any, **kwargs: Any) -> None:  # noqa ARG002 Unused method argument: `args`, kwargs
+    def inner_handle_kill(self) -> None:
         for submit_thread in self.submit_threads:
             submit_thread.stop()

@@ -260,10 +260,15 @@ def inner_handle_kill(self, *args: Any, **kwargs: Any) -> None:  # noqa ARG002 U
         # but can be canceled together using the job_id.
         unique_job_ids = set(map(lambda x: str(x).split("_")[0], scheduled_job_ids))
         job_id_string = " ".join(unique_job_ids)
+        # Allow to speed up the shutdown, for example, when running voxelytics locally
+        sigterm_wait_in_s_env = float(os.environ.get("SIGTERM_WAIT_IN_S", 5))
         # Send SIGINT signal to running jobs instead of terminating the jobs right away. This way, the jobs can
         # react to the signal, safely shutdown and signal (cancel) jobs they possibly scheduled, recursively.
+        # After a short waiting time kill all jobs that are still running (due to race conditions or because they
+        # didn't react to the SIGINT signal for some reason).
         _, stderr, _ = call(
-            f"scancel --state=PENDING {job_id_string}; scancel -s SIGINT --state=RUNNING {job_id_string}; scancel --state=SUSPENDED {job_id_string}"
+            f"scancel --state=PENDING {job_id_string}; scancel -s SIGINT --state=RUNNING {job_id_string};"
+            + f"scancel --state=SUSPENDED {job_id_string}; sleep {sigterm_wait_in_s_env}; scancel {job_id_string}"
         )

         maybe_error_or_warning = (
cluster_tools/pyproject.toml

Lines changed: 3 additions & 2 deletions
@@ -17,15 +17,16 @@ Changelog = "https://github.com/scalableminds/webknossos-libs/blob/master/cluste


 [project.optional-dependencies]
-kubernetes = ["distributed ~=2023.9.1"]
-dask = ["kubernetes ~=27.2.0"]
+dask = ["distributed ~=2023.9.1"]
+kubernetes = ["kubernetes ~=27.2.0"]
 all=["cluster_tools[kubernetes]", "cluster_tools[dask]"]

 [tool.uv]
 dev-dependencies = [
     "icecream ~=2.1.1",
     "mypy ~=1.15.0",
     "pytest ~=8.3.3",
+    "pytest-timeout>=2.4.0",
     "ruff ~=0.11.0",
 ]
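
With the swapped extras corrected, `cluster_tools[kubernetes]` now pulls in the `kubernetes` client and `cluster_tools[dask]` pulls in `distributed`, so each extra matches its name. The new `pytest-timeout` dev dependency supports the commit's goal of letting hanging tests fail with a timeout instead of blocking the test run.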
