Commit a7186c3

Cancel cluster jobs on shutdown (#838)

* cancel all running slurm and pbs jobs in case the executor is killed
* avoid using the logging module during shutdown to prevent additional errors
* try to close the multiprocessing logging handler on SystemExit
* revert logger closing
* do not react to SIGTERM and do not call sys.exit, to allow a clean shutdown by the calling process
* update changelog
* adapt test
* use logging during shutdown since it shouldn't cause additional errors
* apply PR feedback
* format
* fix exception during shutdown for non-array jobs
* add args and kwargs to ignored-argument-names for pylint
* Merge branch 'master' of github.com:scalableminds/webknossos-libs into cancel-cluster-jobs
* signal jobs with SIGINT instead of SIGTERM to allow canceling recursively scheduled jobs
* Only send SIGINT to running jobs, as scancel stalls otherwise. Use scancel without a signal parameter to cancel pending jobs.
* Cancel pending jobs even if canceling running jobs did not yield exit code 0
* Do not interfere with existing SIGINT handlers. Call the existing handler after our signal handling, in case one exists.
* Avoid deadlock in executor shutdown
* First cancel the pending slurm jobs, then the running ones, to avoid race conditions
* fix handle_kill call in tests
* Merge branch 'master' of github.com:scalableminds/webknossos-libs into cancel-cluster-jobs
* improve troubleshooting instructions in dockered-slurm README
* Add test for slurm job cancellation and prepare slurm version update
* fix typing
* use new slurm docker image with updated slurm version
* fix linting
* correctly restore SLURM_MAX_RUNNING_SIZE env variable in tests

1 parent 27f7426 · commit a7186c3

File tree

10 files changed: +163 −35 lines

cluster_tools/.pylintrc

Lines changed: 1 addition & 1 deletion

@@ -354,7 +354,7 @@ max-args=5
 
 # Argument names that match this expression will be ignored. Default to name
 # with leading underscore
-ignored-argument-names=_.*
+ignored-argument-names=_.*|args|kwargs
 
 # Maximum number of locals for function / method body
 max-locals=15

cluster_tools/Changelog.md

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ For upgrade instructions, please check the respective *Breaking Changes* section
 ### Added
 
 ### Changed
+- When using the slurm or pbs distribution strategy, scheduled jobs are automatically canceled when aborting a run, i.e. if the SIGINT signal is received. [#838](https://github.com/scalableminds/webknossos-libs/pull/838)
 
 ### Fixed
 
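
To make this entry concrete, here is a minimal usage sketch from a user's perspective, based on cluster_tools' usual `get_executor` API; the `square` function is illustrative, not part of the library:

```python
import cluster_tools


def square(n):
    return n * n


if __name__ == "__main__":
    # Pressing Ctrl+C (SIGINT) while the map is in flight now cancels the
    # scheduled slurm jobs instead of leaving them queued or running on the cluster.
    with cluster_tools.get_executor("slurm", debug=True) as executor:
        print(list(executor.map(square, range(10))))
```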

cluster_tools/cluster_tools/schedulers/cluster_executor.py

Lines changed: 28 additions & 12 deletions

@@ -22,6 +22,8 @@
     with_preliminary_postfix,
 )
 
+NOT_YET_SUBMITTED_STATE = "NOT_YET_SUBMITTED"
+
 
 def join_messages(strings: List[str]) -> str:
     return " ".join(x.strip() for x in strings if x.strip())
@@ -86,14 +88,19 @@ def __init__(
         self.jobs_lock = threading.Lock()
         self.jobs_empty_cond = threading.Condition(self.jobs_lock)
         self.keep_logs = keep_logs
+        self.is_shutting_down = False
 
         self.wait_thread = FileWaitThread(self._completion, self)
         self.wait_thread.start()
 
         os.makedirs(self.cfut_dir, exist_ok=True)
 
-        signal.signal(signal.SIGINT, self.handle_kill)
-        signal.signal(signal.SIGTERM, self.handle_kill)
+        # Clean up if a SIGINT signal is received. However, do not interfere with the
+        # existing signal handler of the process or the
+        # shutdown of the main process which sends SIGTERM signals to terminate all
+        # child processes.
+        existing_sigint_handler = signal.getsignal(signal.SIGINT)
+        signal.signal(signal.SIGINT, partial(self.handle_kill, existing_sigint_handler))
 
         self.meta_data = {}
         assert not (
@@ -109,15 +116,24 @@
     def executor_key(cls):
         pass
 
-    def handle_kill(self, _signum, _frame):
+    def handle_kill(self, existing_sigint_handler, signum, frame):
+        if self.is_shutting_down:
+            return
+
+        self.is_shutting_down = True
+
+        self.inner_handle_kill(signum, frame)
         self.wait_thread.stop()
-        job_ids = ",".join(str(id) for id in self.jobs.keys())
-        logging.debug(
-            "A termination signal was registered. The following jobs are still running on the cluster:\n{}".format(
-                job_ids
-            )
-        )
-        sys.exit(130)
+
+        if (
+            existing_sigint_handler  # pylint: disable=comparison-with-callable
+            != signal.default_int_handler
+        ):
+            existing_sigint_handler(signum, frame)
+
+    @abstractmethod
+    def inner_handle_kill(self, _signum, _frame):
+        pass
 
     @abstractmethod
     def check_job_state(
@@ -426,7 +442,7 @@ def map_to_futures(self, fun, allArgs, output_pickle_path_getter=None):
         # Register the job in the jobs array, although the jobid is not known yet.
         # Otherwise it might happen that self.jobs becomes empty, but some of the jobs were
         # not even submitted yet.
-        self.jobs[workerid_with_index] = "pending"
+        self.jobs[workerid_with_index] = NOT_YET_SUBMITTED_STATE
 
         job_count = len(allArgs)
         job_name = get_function_name(fun)
@@ -495,7 +511,7 @@ def shutdown(self, wait=True):
         self.was_requested_to_shutdown = True
         if wait:
             with self.jobs_lock:
-                if self.jobs:
+                if self.jobs and self.wait_thread.is_alive():
                     self.jobs_empty_cond.wait()
 
         self.wait_thread.stop()
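
The diff above chains to any pre-existing SIGINT handler instead of replacing it outright, and guards against re-entrant signals with `is_shutting_down`. Below is a minimal, self-contained sketch of that pattern; the class name and print statement are illustrative, not part of cluster_tools:

```python
import signal
from functools import partial


class ShutdownAwareExecutor:
    """Minimal sketch of the SIGINT handler chaining used in ClusterExecutor."""

    def __init__(self):
        self.is_shutting_down = False
        # Remember whichever SIGINT handler was installed before this executor ...
        existing_sigint_handler = signal.getsignal(signal.SIGINT)
        # ... and bind it into our own handler instead of discarding it.
        signal.signal(
            signal.SIGINT, partial(self.handle_kill, existing_sigint_handler)
        )

    def handle_kill(self, existing_sigint_handler, signum, frame):
        if self.is_shutting_down:  # guard against re-entrant SIGINTs
            return
        self.is_shutting_down = True

        print("canceling cluster jobs ...")  # inner_handle_kill() in the real code

        # Give the previous handler its turn, unless it is the default
        # KeyboardInterrupt-raising handler. The callable() check also skips
        # the SIG_DFL/SIG_IGN sentinels, which cannot be invoked directly.
        if callable(existing_sigint_handler) and (
            existing_sigint_handler != signal.default_int_handler
        ):
            existing_sigint_handler(signum, frame)
```

Note that SIGTERM is deliberately no longer intercepted, so a parent process can still terminate the worker pool the usual way.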

cluster_tools/cluster_tools/schedulers/kube.py

Lines changed: 9 additions & 0 deletions

@@ -81,6 +81,15 @@ def get_job_id_string(cls) -> Optional[str]:
             return job_id
         return cls.get_jobid_with_index(job_id, job_index)
 
+    def inner_handle_kill(self, *args, **kwargs):
+        job_ids = ",".join(str(job_id) for job_id in self.jobs.keys())
+
+        print(
+            "Couldn't automatically cancel all Kubernetes jobs. The following jobs are still running on the cluster:\n{}".format(
+                job_ids
+            )
+        )
+
     def ensure_kubernetes_namespace(self):
         kubernetes_client = KubernetesClient()
         try:

cluster_tools/cluster_tools/schedulers/pbs.py

Lines changed: 26 additions & 1 deletion

@@ -4,7 +4,7 @@
 import os
 import re
 from concurrent import futures
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 
 from typing_extensions import Literal
 
@@ -53,6 +53,31 @@ def format_log_file_name(job_id_with_index, suffix=".stdout"):
     def get_job_id_string(cls):
         return cls.get_current_job_id()
 
+    def inner_handle_kill(self, *args, **kwargs):
+        scheduled_job_ids: List[Union[int, str]] = list(self.jobs.keys())
+
+        if len(scheduled_job_ids):
+            # Array jobs (whose id looks like `<job_id>_<array_index>`) don't need to be canceled individually,
+            # but can be canceled together using the job_id.
+            split_job_ids = map(lambda x: str(x).split("_"), scheduled_job_ids)
+            # However array job ids need to include [] in the end.
+            unique_job_ids = set(
+                job_id_parts[0] if len(job_id_parts) == 1 else f"{job_id_parts[0]}[]"
+                for job_id_parts in split_job_ids
+            )
+            # Send SIGINT signal instead of SIGTERM using qdel. This way, the jobs can
+            # react to the signal, safely shutdown and signal (cancel) jobs they possibly scheduled, recursively.
+            _stdout, stderr, exit_code = call(
+                f"qsig -s SIGINT {' '.join(unique_job_ids)}"
+            )
+
+            if exit_code == 0:
+                logging.debug(f"Canceled PBS jobs {', '.join(unique_job_ids)}.")
+            else:
+                logging.warning(
+                    f"Couldn't automatically cancel all PBS jobs. Reason: {stderr}"
+                )
+
     def submit_text(self, job):
         """Submits a PBS job represented as a job file string. Returns
         the job ID.
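
The id-collapsing step in `inner_handle_kill` is easiest to see in isolation. Here is a standalone sketch with made-up job ids; the helper name is hypothetical, not a cluster_tools function:

```python
from typing import Iterable, Set, Union


def collapse_pbs_job_ids(job_ids: Iterable[Union[int, str]]) -> Set[str]:
    """Fold array-job members like '123_4' into a single qsig target '123[]'.

    Plain job ids (e.g. '456') pass through unchanged; using a set ensures
    each array job is signaled exactly once.
    """
    split_job_ids = (str(job_id).split("_") for job_id in job_ids)
    return {
        parts[0] if len(parts) == 1 else f"{parts[0]}[]"
        for parts in split_job_ids
    }


# Two members of array job 123 plus a plain job 456 become two qsig targets:
assert collapse_pbs_job_ids(["123_0", "123_1", 456]) == {"123[]", "456"}
```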

cluster_tools/cluster_tools/schedulers/slurm.py

Lines changed: 39 additions & 7 deletions

@@ -7,13 +7,14 @@
 import sys
 import threading
 from functools import lru_cache
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, Union
 
 from typing_extensions import Literal
 
 from cluster_tools.util import call, chcall, random_string
 
 from .cluster_executor import (
+    NOT_YET_SUBMITTED_STATE,
     ClusterExecutor,
     RemoteException,
     RemoteOutOfMemoryException,
@@ -163,18 +164,23 @@ def get_max_submit_jobs():
         return max_submit_jobs
 
     @staticmethod
-    def get_number_of_submitted_jobs():
+    def get_number_of_submitted_jobs(state: Optional[str] = None):
         number_of_submitted_jobs = 0
+        state_string = f"-t {state}" if state else ""
         # --array so that each job array element is displayed on a separate line and -h to hide the header
-        stdout, stderr, exit_code = call("squeue --array -u $USER -h | wc -l")
+        stdout, stderr, exit_code = call(
+            f"squeue --array -u $USER -h {state_string} | wc -l"
+        )
+
+        job_state_string = f"with state {state} " if state else ""
         if exit_code == 0:
             number_of_submitted_jobs = int(stdout.decode("utf8"))
             logging.debug(
-                f"Number of currently submitted jobs is {number_of_submitted_jobs}."
+                f"Number of currently submitted jobs {job_state_string}is {number_of_submitted_jobs}."
             )
         else:
             logging.warning(
-                f"Number of currently submitted jobs couldn't be determined. Reason: {stderr}"
+                f"Number of currently submitted jobs {job_state_string}couldn't be determined. Reason: {stderr}"
             )
         return number_of_submitted_jobs
 
@@ -197,10 +203,36 @@ def submit_text(cls, job, cfut_dir):
 
         return int(job_id)
 
-    def handle_kill(self, *args, **kwargs):
+    def inner_handle_kill(self, *args, **kwargs):
         for submit_thread in self.submit_threads:
             submit_thread.stop()
-        super().handle_kill(*args, **kwargs)
+
+        # Jobs with a NOT_YET_SUBMITTED_STATE have not been submitted to the cluster yet
+        scheduled_job_ids: List[Union[int, str]] = [
+            job_id
+            for job_id, job_state in self.jobs.items()
+            if job_state != NOT_YET_SUBMITTED_STATE
+        ]
+
+        if len(scheduled_job_ids):
+            # Array jobs (whose id looks like `<job_id>_<array_index>`) don't need to be signaled individually,
+            # but can be canceled together using the job_id.
+            unique_job_ids = set(map(lambda x: str(x).split("_")[0], scheduled_job_ids))
+            job_id_string = " ".join(unique_job_ids)
+            # Send SIGINT signal to running jobs instead of terminating the jobs right away. This way, the jobs can
+            # react to the signal, safely shutdown and signal (cancel) jobs they possibly scheduled, recursively.
+            _, stderr, _ = call(
+                f"scancel --state=PENDING {job_id_string}; scancel -s SIGINT --state=RUNNING {job_id_string}; scancel --state=SUSPENDED {job_id_string}"
+            )
+
+            maybe_error_or_warning = (
+                f"\nErrors and warnings (if all jobs were pending 'Invalid job id' errors are expected):\n{stderr.decode('utf8')}"
+                if stderr
+                else ""
+            )
+            print(
+                f"Canceled slurm jobs {', '.join(unique_job_ids)}.{maybe_error_or_warning}"
+            )
 
     def cleanup_submit_threads(self):
         self.submit_threads = [
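
The ordering of the three `scancel` calls matters: pending jobs are canceled first so none of them can transition to RUNNING while the running ones are being signaled, and running jobs receive SIGINT (rather than scancel's default termination) so they can in turn cancel any jobs they scheduled recursively. A rough standalone sketch of that sequence, using `subprocess` directly instead of the library's `call` helper; the function name is illustrative:

```python
import subprocess
from typing import Iterable


def cancel_slurm_jobs(job_ids: Iterable[str]) -> str:
    """Cancel pending jobs first, then SIGINT running ones, then drop suspended ones.

    Returns scancel's stderr; 'Invalid job id' messages are expected whenever a
    job only existed in one of the three states.
    """
    job_id_string = " ".join(job_ids)
    result = subprocess.run(
        # 1) pending jobs go first so none of them can start running after the
        #    RUNNING pass below has already happened
        f"scancel --state=PENDING {job_id_string}; "
        # 2) running jobs get SIGINT so they can shut down cleanly and cancel
        #    recursively scheduled jobs themselves
        f"scancel -s SIGINT --state=RUNNING {job_id_string}; "
        # 3) suspended jobs cannot react to signals and are simply canceled
        f"scancel --state=SUSPENDED {job_id_string}",
        shell=True,
        capture_output=True,
        text=True,
    )
    return result.stderr
```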

cluster_tools/dockered-slurm/README.md

Lines changed: 4 additions & 0 deletions

@@ -32,6 +32,8 @@ Run `docker-compose` to instantiate the cluster:
 $ docker-compose up -d
 ```
 
+> Note: If you encounter permission errors (`Failed to check keyfile "/etc/munge/munge.key": Permission denied`), follow the steps from the "Deleting the Cluster" section and run the previous command again.
+
 ## Register the Cluster with SlurmDBD
 
 To register the cluster to the slurmdbd daemon, run the `register_cluster.sh`
@@ -48,6 +50,8 @@ $ ./register_cluster.sh
 > You can check the status of the cluster by viewing the logs: `docker-compose
 > logs -f`
 
+> Note: If you encounter an error that the daemon is not running (`Error response from daemon: Container <...> is not running`), the start of the containers was not successful. Check the logs using `docker-compose logs -f` and revisit the last step.
+
 ## Accessing the Cluster
 
 Use `docker exec` to run a bash shell on the controller container:

cluster_tools/dockered-slurm/docker-compose.yml

Lines changed: 4 additions & 4 deletions

@@ -15,7 +15,7 @@ services:
       - ..:/cluster_tools
 
   slurmdbd:
-    image: scalableminds/slurm-docker-cluster:master__2021363052
+    image: scalableminds/slurm-docker-cluster:master__3840662994
     command: ["slurmdbd"]
     container_name: slurmdbd
     hostname: slurmdbd
@@ -29,7 +29,7 @@ services:
       - mysql
 
   slurmctld:
-    image: scalableminds/slurm-docker-cluster:master__2021363052
+    image: scalableminds/slurm-docker-cluster:master__3840662994
     command: ["slurmctld"]
     container_name: slurmctld
     environment:
@@ -48,7 +48,7 @@ services:
       - "slurmdbd"
 
   c1:
-    image: scalableminds/slurm-docker-cluster:master__2021363052
+    image: scalableminds/slurm-docker-cluster:master__3840662994
     command: ["slurmd"]
     hostname: c1
     container_name: c1
@@ -64,7 +64,7 @@ services:
      - "slurmctld"
 
   c2:
-    image: scalableminds/slurm-docker-cluster:master__2021363052
+    image: scalableminds/slurm-docker-cluster:master__3840662994
     command: ["slurmd"]
     hostname: c2
     container_name: c2

cluster_tools/dockered-slurm/slurm.conf

Lines changed: 0 additions & 3 deletions

@@ -23,7 +23,6 @@ SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
 SlurmdPidFile=/var/run/slurmd/slurmd.pid
 ProctrackType=proctrack/linuxproc
 #PluginDir=
-CacheGroups=0
 #FirstJobId=
 ReturnToService=0
 #MaxJobCount=
@@ -58,7 +57,6 @@ SchedulerType=sched/backfill
 #SchedulerRootFilter=
 SelectType=select/cons_res
 SelectTypeParameters=CR_CPU_Memory
-FastSchedule=1
 #PriorityType=priority/multifactor
 #PriorityDecayHalfLife=14-0
 #PriorityUsageResetPeriod=14-0
@@ -83,7 +81,6 @@ JobAcctGatherFrequency=30
 AccountingStorageType=accounting_storage/slurmdbd
 AccountingStorageHost=slurmdbd
 AccountingStoragePort=6819
-AccountingStorageLoc=slurm_acct_db
 #AccountingStoragePass=
 #AccountingStorageUser=
 #
