Skip to content

Commit b985df5

Browse files
Add detection for exceeded time limits for slurm jobs (#929)
* Add detection for exceeded time limits for slurm jobs. Add tests to check that job kills due to memory or time are detected by the cluster_tools. Use a new version of the slurm docker image. * use newest dockered-slurm version * update changelog * differentiate between memory and time limit exceptions * Update cluster_tools/cluster_tools/schedulers/slurm.py Co-authored-by: Philipp Otto <[email protected]> * apply PR feedback * rename multiprocessing module to avoid clash with standard lib module which led to a crash in pylint * fix linting setup and fix all linting errors * fix formatting * Print docker logs in CI * only execute cluster tools slurm tests in CI * use cgroup/v1 plugin as v2 fails to initialize correctly in the CI * revert changes to ci.yml * skip slurm time limit test, because it takes >1min --------- Co-authored-by: Philipp Otto <[email protected]>
1 parent b5c639a commit b985df5

File tree

24 files changed

+227
-95
lines changed

24 files changed

+227
-95
lines changed

.github/workflows/ci.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ jobs:
6464
sleep 10
6565
done
6666
67+
# Show log output for debugging
68+
docker logs slurmctld
69+
docker logs c1
70+
docker logs c2
71+
6772
# Run setup.py on all three nodes
6873
docker exec -w /cluster_tools slurmctld bash -c "poetry install" &
6974
docker exec -w /cluster_tools c1 bash -c "poetry install" &

cluster_tools/Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ For upgrade instructions, please check the respective *Breaking Changes* section
1212
### Breaking Changes
1313

1414
### Added
15+
- The cluster tools now detect when slurm jobs are killed for exceeding the configured time limit and raise a `RemoteTimeLimitException` that names the cause, instead of a generic `RemoteException`. If it is unclear whether a job was killed because of the time limit or the memory limit, a `RemoteResourceLimitException` is raised. [#929](https://github.com/scalableminds/webknossos-libs/pull/929)
1516

1617
### Changed
1718

cluster_tools/cluster_tools/__init__.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,16 @@
1-
import logging
2-
import multiprocessing
3-
import os
4-
import tempfile
5-
from concurrent import futures
6-
from concurrent.futures import ProcessPoolExecutor
7-
from functools import partial
8-
from pathlib import Path
9-
from shutil import rmtree
101
from typing import Any, Union, overload
112

123
from typing_extensions import Literal
134

14-
from cluster_tools._utils.warning import enrich_future_with_uncaught_warning
155
from cluster_tools.executors.debug_sequential import DebugSequentialExecutor
16-
from cluster_tools.executors.multiprocessing import MultiprocessingExecutor
17-
from cluster_tools.executors.pickle import PickleExecutor
6+
from cluster_tools.executors.multiprocessing_ import MultiprocessingExecutor
7+
from cluster_tools.executors.pickle_ import PickleExecutor
188
from cluster_tools.executors.sequential import SequentialExecutor
199
from cluster_tools.schedulers.cluster_executor import (
2010
ClusterExecutor,
2111
RemoteOutOfMemoryException,
12+
RemoteResourceLimitException,
13+
RemoteTimeLimitException,
2214
)
2315
from cluster_tools.schedulers.kube import KubernetesExecutor
2416
from cluster_tools.schedulers.pbs import PBSExecutor

cluster_tools/cluster_tools/_utils/call.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ def call(command: str, stdin: Optional[str] = None) -> Tuple[str, str, int]:
1313
stdin_flag = None
1414
p = subprocess.run(
1515
command,
16+
stdin=stdin_flag,
17+
check=False,
1618
shell=True,
1719
capture_output=True,
1820
text=True,

cluster_tools/cluster_tools/_utils/file_wait_thread.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os
33
import threading
44
import time
5-
from typing import TYPE_CHECKING, Callable, Dict, Tuple
5+
from typing import TYPE_CHECKING, Callable, Dict
66

77
if TYPE_CHECKING:
88
from cluster_tools.schedulers.cluster_executor import ClusterExecutor

cluster_tools/cluster_tools/_utils/multiprocessing_logging_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from logging.handlers import QueueHandler
1010
from queue import Empty as QueueEmpty
1111
from queue import Queue
12-
from typing import Any, Callable, List, Tuple
12+
from typing import Any, Callable, List
1313

1414
# Inspired by https://stackoverflow.com/a/894284
1515

cluster_tools/cluster_tools/_utils/reflection.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
import os
2-
import pickle
3-
import sys
42
from typing import Callable
53

64
WARNING_TIMEOUT = 10 * 60 # seconds

cluster_tools/cluster_tools/_utils/string.py renamed to cluster_tools/cluster_tools/_utils/string_.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import random
33
import string
44

5+
# The module name has a trailing underscore to avoid clashing with the standard library `string` module.
6+
57

68
def local_filename(filename: str = "") -> str:
79
return os.path.join(os.getenv("CFUT_DIR", ".cfut"), filename)

cluster_tools/cluster_tools/_utils/tailf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def follow(self, seconds: int = 1) -> None:
3333
"""
3434

3535
self.check_file_validity(self.tailed_file)
36-
with open(self.tailed_file, errors="replace") as file_:
36+
with open(self.tailed_file, encoding="utf-8", errors="replace") as file_:
3737
# Don't seek, since we want to print the entire file here.
3838
while True:
3939
line = file_.readline()

cluster_tools/cluster_tools/_utils/warning.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
import logging
2-
import os
3-
import string
42
import threading
53
import time
64
from concurrent.futures import Future

0 commit comments

Comments
 (0)