
Commit 1077117

Kubernetes executor for cluster_tools (#600)
* wip kubernetes scheduler * proof-of-concept * refactoring * fixes * fixes * fixes * fixes * fixes * fixes * ci * ci * use jobs * Merge branch 'master' into cluster-k8s * fixes * Merge branch 'cluster-k8s' of github.com:scalableminds/webknossos-libs into cluster-k8s * ttl for job * poetry locks * test refactoring * ci * ci * fixes? * ci * ci * fixes * fixes * mounts * fixes for mounts * changelog * readme * readme * pr feedback + wkcuber integration * refactor tests * ci * test fixes * test fixes * ci * ci * ci * fixes? * ci * bool * better job_id, job_index separation * fixes? * fixes * fixes * reactivate tests * deduplicate mounts * fixes * readme * ci * fixes * using objects instead of dict * ci * ci * ci * ci * ci * Apply suggestions from code review Co-authored-by: Philipp Otto <[email protected]> * better mount test * Merge branch 'master' into cluster-k8s * comment
1 parent 6d9e601 commit 1077117

24 files changed: +1784 −478 lines changed

.github/workflows/ci.yml

Lines changed: 47 additions & 6 deletions
@@ -30,12 +30,18 @@ jobs:
     needs: changes
     if: ${{ needs.changes.outputs.cluster_tools == 'true' }}
     runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      max-parallel: 4
+      matrix:
+        executors: [multiprocessing, slurm, kubernetes]
     defaults:
       run:
         working-directory: cluster_tools
     steps:
       - uses: actions/checkout@v2
       - name: Build/pull dockered-slurm image
+        if: ${{ matrix.executors == 'slurm' }}
         run: |
           cd ./dockered-slurm
@@ -50,26 +56,61 @@ jobs:
           done

           # Run setup.py on all three nodes
-          docker exec slurmctld bash -c "cd /cluster_tools && poetry install"
-          docker exec c1 bash -c "cd /cluster_tools && poetry install"
-          docker exec c2 bash -c "cd /cluster_tools && poetry install"
+          docker exec -w /cluster_tools slurmctld bash -c "poetry install" &
+          docker exec -w /cluster_tools c1 bash -c "poetry install" &
+          docker exec -w /cluster_tools c2 bash -c "poetry install" &
+          wait
+
+      - name: Setup Kubernetes-in-Docker
+        if: ${{ matrix.executors == 'kubernetes' }}
+        run: |
+          curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.11.1/kind-linux-amd64
+          chmod +x ./kind
+          sed -i "s#__PATH__#$(pwd)#g" tests/cluster-config.yaml
+          ./kind create cluster --config=tests/cluster-config.yaml
+          ./kind export kubeconfig
+
+          docker build -f tests/Dockerfile -t scalableminds/cluster-tools:latest .
+          ./kind load docker-image scalableminds/cluster-tools:latest

       - name: Install dependencies (without docker)
+        if: ${{ matrix.executors == 'multiprocessing' || matrix.executors == 'kubernetes' }}
         run: |
           pip install poetry
           poetry install

       - name: Check formatting
+        if: ${{ matrix.executors == 'multiprocessing' }}
         run: ./format.sh check

       - name: Lint code
+        if: ${{ matrix.executors == 'multiprocessing' }}
         run: ./lint.sh

-      - name: Run tests
+      - name: Run multiprocessing tests
+        if: ${{ matrix.executors == 'multiprocessing' }}
+        run: |
+          cd tests
+          PYTEST_EXECUTORS=multiprocessing,sequential,test_pickling,debug_sequential \
+            poetry run python -m pytest -sv test_all.py test_multiprocessing.py
+
+      - name: Run slurm tests
+        if: ${{ matrix.executors == 'slurm' }}
         run: |
           cd ./dockered-slurm
-          docker exec slurmctld bash -c "cd /cluster_tools/tests && poetry run python -m pytest -s tests.py"
-          docker exec slurmctld bash -c "cd /cluster_tools/tests && poetry run python tests.py"
+          docker exec \
+            -w /cluster_tools/tests \
+            -e PYTEST_EXECUTORS=slurm \
+            slurmctld bash -c "poetry run python -m pytest -sv test_all.py test_slurm.py"
+          docker exec \
+            -w /cluster_tools/tests \
+            slurmctld bash -c "poetry run python test_deref_main.py"
+
+      - name: Run kubernetes tests
+        if: ${{ matrix.executors == 'kubernetes' }}
+        run: |
+          cd tests
+          PYTEST_EXECUTORS=kubernetes poetry run python -m pytest -sv test_all.py test_kubernetes.py

   webknossos_linux:
     needs: changes

cluster_tools/Changelog.md

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ For upgrade instructions, please check the respective *Breaking Changes* section
 ### Breaking Changes

 ### Added
+* Added `KubernetesExecutor` for parallelizing Python scripts on a Kubernetes cluster. [#600](https://github.com/scalableminds/webknossos-libs/pull/600)

 ### Changed

cluster_tools/README.md

Lines changed: 25 additions & 2 deletions
@@ -4,7 +4,6 @@

 This package provides python `Executor` classes for distributing tasks on a slurm cluster or via multi processing.

-
 ## Example

 ```python
@@ -24,11 +23,35 @@ if __name__ == '__main__':

 ### Slurm

-The cluster_tools automatically determine the slurm limit for maximum array job size and split up larger job batches into multiple smaller batches.
+The `cluster_tools` automatically determine the slurm limit for maximum array job size and split up larger job batches into multiple smaller batches.
 Also, the slurm limit for the maximum number of jobs which are allowed to be submitted by a user at the same time is honored by looking up the number of currently submitted jobs and only submitting new batches if they fit within the limit.

 If you would like to configure these limits independently, you can do so by setting the `SLURM_MAX_ARRAY_SIZE` and `SLURM_MAX_SUBMIT_JOBS` environment variables.

+### Kubernetes
+
+#### Resource configuration
+
+| Key                 | Description | Example |
+| ------------------- | ----------- | ------- |
+| `namespace`         | Kubernetes namespace for the resources to be created. Will be created if it does not exist. | `cluster-tools` |
+| `node_selector`     | Which nodes to utilize for the processing. Needs to be a [Kubernetes `nodeSelector` object](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/). | `{"kubernetes.io/hostname": "node001"}` |
+| `image`             | The docker image for the containerized jobs to run in. The image needs to have the same version of `cluster_tools` and the code to run installed and in the `PYTHONPATH`. | `scalableminds/voxelytics:latest` |
+| `mounts`            | Additional mounts for the containerized jobs. The current working directory and the `.cfut` directory are automatically mounted. | `["/srv", "/data"]` |
+| `cpu`               | [CPU requirements](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) for this job. | `4` |
+| `memory`            | [Memory requirements](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) for this job. Not required, but highly recommended to avoid congestion. Without resource requirements, all jobs will be run in parallel and RAM will run out soon. | `16G` |
+| `python_executable` | The python executable in the docker image may differ from the one in the current environment. For images based on `FROM python`, it should be `python`. Defaults to `python`. | `python3.8` |
+| `umask`             | `umask` for the jobs. | `0002` |
+
+#### Notes
+
+- The jobs are run with the current `uid:gid`.
+- The jobs are removed 7 days after completion (successful or not).
+- The logs are stored in the `.cfut` directory. This is actually redundant, because Kubernetes also stores them.
+- Pods are not restarted upon error.
+- Requires Kubernetes ≥ 1.23.
+- [Kubernetes cluster configuration](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/) is expected to be the same as for `kubectl`, i.e. in `~/.kube/config` or similar.
+
 ## Dev Setup

 ```
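To make the Kubernetes configuration above concrete, here is a minimal, hypothetical usage sketch. It assumes the resource keys from the table are passed to `cluster_tools.get_executor("kubernetes", ...)` via a `job_resources` dict (the keyword name and the context-manager usage are assumptions, not taken from this diff) and that the returned executor offers the usual `concurrent.futures`-style interface used in the README example.

```python
import cluster_tools

def square(n):
    return n * n

if __name__ == "__main__":
    # Hypothetical sketch: the resource keys below come from the README table;
    # the `job_resources` keyword is an assumption.
    with cluster_tools.get_executor(
        "kubernetes",
        job_resources={
            "namespace": "cluster-tools",                # created if it does not exist
            "image": "scalableminds/voxelytics:latest",  # must contain cluster_tools and your code
            "mounts": ["/srv", "/data"],                 # cwd and .cfut are mounted automatically
            "cpu": 4,
            "memory": "16G",                             # recommended to avoid congestion
            "python_executable": "python3.8",
            "umask": "0002",
        },
    ) as executor:
        # Each job runs as a containerized Kubernetes job with the current uid:gid.
        print(list(executor.map(square, range(10))))
```

Unlike the Slurm backend, no scheduler daemon is required on the worker nodes; the image only needs the same `cluster_tools` version and the user code on its `PYTHONPATH`.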

cluster_tools/cluster_tools/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,7 @@

 from . import pickling
 from .multiprocessing_logging_handler import get_multiprocessing_logging_setup_fn
+from .schedulers.kube import KubernetesExecutor
 from .schedulers.pbs import PBSExecutor
 from .schedulers.slurm import SlurmExecutor
 from .util import enrich_future_with_uncaught_warning
@@ -326,6 +327,8 @@ def get_executor(environment, **kwargs):
         return SlurmExecutor(**kwargs)
     elif environment == "pbs":
         return PBSExecutor(**kwargs)
+    elif environment == "kubernetes":
+        return KubernetesExecutor(**kwargs)
     elif environment == "multiprocessing":
         global did_start_test_multiprocessing
         if not did_start_test_multiprocessing:

cluster_tools/cluster_tools/remote.py

Lines changed: 19 additions & 15 deletions
@@ -4,17 +4,20 @@
 import sys
 import traceback

+from cluster_tools.schedulers.kube import KubernetesExecutor
 from cluster_tools.schedulers.pbs import PBSExecutor
 from cluster_tools.schedulers.slurm import SlurmExecutor
 from cluster_tools.util import with_preliminary_postfix

 from . import pickling


-def get_executor_class():
-    for executor in [SlurmExecutor, PBSExecutor]:
-        if executor.get_current_job_id() is not None:
-            return executor
+def get_executor_class(executor_key):
+    return {
+        "slurm": SlurmExecutor,
+        "pbs": PBSExecutor,
+        "kubernetes": KubernetesExecutor,
+    }.get(executor_key)


 def format_remote_exc():
@@ -23,16 +26,16 @@ def format_remote_exc():
     return "".join(traceback.format_exception(typ, value, tb))


-def get_custom_main_path(workerid):
+def get_custom_main_path(workerid, executor):
     custom_main_path = None
-    main_meta_path = get_executor_class().get_main_meta_path(cfut_dir, workerid)
+    main_meta_path = executor.get_main_meta_path(cfut_dir, workerid)
     if os.path.exists(main_meta_path):
         with open(main_meta_path, "r") as file:
             custom_main_path = file.read()
     return custom_main_path


-def worker(workerid, job_array_index, job_array_index_offset, cfut_dir):
+def worker(executor, workerid, job_array_index, job_array_index_offset, cfut_dir):
     """Called to execute a job on a remote host."""

     if job_array_index is not None:
@@ -42,13 +45,12 @@ def worker(workerid, job_array_index, job_array_index_offset, cfut_dir):
     else:
         workerid_with_idx = worker_id

-    executor = get_executor_class()
     try:
         input_file_name = executor.format_infile_name(cfut_dir, workerid_with_idx)
         print("trying to read: ", input_file_name)
         print("working dir: ", os.getcwd())

-        custom_main_path = get_custom_main_path(workerid)
+        custom_main_path = get_custom_main_path(workerid, executor)
         with open(input_file_name, "rb") as f:
             unpickled_tuple = pickling.load(f, custom_main_path)
             if len(unpickled_tuple) == 4:
@@ -129,12 +131,14 @@ def setup_logging(meta_data, executor, cfut_dir):


 if __name__ == "__main__":
-    worker_id = sys.argv[1]
-    cfut_dir = sys.argv[2]
-    job_array_index_offset = sys.argv[3] if len(sys.argv) > 3 else "0"
-    job_array_index = get_executor_class().get_job_array_index()
-
-    worker(worker_id, job_array_index, job_array_index_offset, cfut_dir)
+    executor_key = sys.argv[1]
+    executor = get_executor_class(executor_key)
+    worker_id = sys.argv[2]
+    cfut_dir = sys.argv[3]
+    job_array_index_offset = sys.argv[4] if len(sys.argv) > 4 else "0"
+    job_array_index = executor.get_job_array_index()
+
+    worker(executor, worker_id, job_array_index, job_array_index_offset, cfut_dir)
     # This is a workaround for the case that some subprocesses are still hanging around and are waited for.
     # If this point is reached, results were written to disk and we can "safely" shut down everything.
     sys.exit()
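The practical effect of this change is that a worker no longer infers its scheduler from the environment; the executor key is passed explicitly as the first command-line argument by the base class in `cluster_executor.py` (see the next file). A rough, illustrative sketch of the new dispatch:

```python
# Illustrative sketch only (not part of this commit).
from cluster_tools.remote import get_executor_class

# The submit command built in cluster_executor.py now looks roughly like:
#   <python_executable> -m cluster_tools.remote <executor_key> <workerid> <cfut_dir>
# so the worker resolves its scheduler class directly from argv[1]:
executor_class = get_executor_class("kubernetes")  # -> KubernetesExecutor
unknown = get_executor_class("does-not-exist")     # -> None (dict.get fallback)
```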

cluster_tools/cluster_tools/schedulers/cluster_executor.py

Lines changed: 23 additions & 9 deletions
@@ -93,6 +93,11 @@ def __init__(
         if "logging_setup_fn" in kwargs:
             self.meta_data["logging_setup_fn"] = kwargs["logging_setup_fn"]

+    @classmethod
+    @abstractmethod
+    def executor_key(cls):
+        pass
+
     def handle_kill(self, _signum, _frame):
         self.wait_thread.stop()
         job_ids = ",".join(str(id) for id in self.jobs.keys())
@@ -104,17 +109,19 @@ def handle_kill(self, _signum, _frame):
         sys.exit(130)

     @abstractmethod
-    def check_for_crashed_job(self, job_id) -> Union["failed", "ignore", "completed"]:
+    def check_for_crashed_job(
+        self, job_id_with_index
+    ) -> Union["failed", "ignore", "completed"]:
         pass

     def _start(self, workerid, job_count=None, job_name=None):
         """Start job(s) with the given worker ID and return IDs
         identifying the new job(s). The job should run ``python -m
-        cfut.remote <workerid>``.
+        cfut.remote <executorkey> <workerid>``.
         """

         jobids_futures, job_index_ranges = self.inner_submit(
-            f"{sys.executable} -m cluster_tools.remote {workerid} {self.cfut_dir}",
+            f"{self.get_python_executable()} -m cluster_tools.remote {self.executor_key()} {workerid} {self.cfut_dir}",
             job_name=self.job_name if self.job_name is not None else job_name,
             additional_setup_lines=self.additional_setup_lines,
             job_count=job_count,
@@ -145,12 +152,14 @@ def _cleanup(self, jobid):

     @staticmethod
     @abstractmethod
-    def format_log_file_name(jobid, suffix=".stdout"):
+    def format_log_file_name(job_id_with_index, suffix=".stdout"):
         pass

     @classmethod
-    def format_log_file_path(cls, cfut_dir, jobid, suffix=".stdout"):
-        return os.path.join(cfut_dir, cls.format_log_file_name(jobid, suffix))
+    def format_log_file_path(cls, cfut_dir, job_id_with_index, suffix=".stdout"):
+        return os.path.join(
+            cfut_dir, cls.format_log_file_name(job_id_with_index, suffix)
+        )

     @classmethod
     @abstractmethod
@@ -169,6 +178,9 @@ def format_infile_name(cfut_dir, job_id):
     def format_outfile_name(cfut_dir, job_id):
         return os.path.join(cfut_dir, "cfut.out.%s.pickle" % job_id)

+    def get_python_executable(self):
+        return sys.executable
+
     def _completion(self, jobid, failed_early):
         """Called whenever a job finishes."""
         with self.jobs_lock:
@@ -278,7 +290,7 @@ def submit(self, fun, *args, **kwargs):
         jobid = jobids_futures[0].result()

         if self.debug:
-            print("job submitted: %i" % jobid, file=sys.stderr)
+            print(f"job submitted: {jobid}", file=sys.stderr)

         # Thread will wait for it to finish.
         self.wait_thread.waitFor(preliminary_output_pickle_path, jobid)
@@ -289,10 +301,12 @@ def submit(self, fun, *args, **kwargs):
         fut.cluster_jobid = jobid
         return fut

-    def get_workerid_with_index(self, workerid, index):
+    @classmethod
+    def get_workerid_with_index(cls, workerid, index):
         return workerid + "_" + str(index)

-    def get_jobid_with_index(self, jobid, index):
+    @classmethod
+    def get_jobid_with_index(cls, jobid, index):
         return str(jobid) + "_" + str(index)

     def get_function_pickle_path(self, workerid):
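For anyone implementing a new scheduler backend, the diff above tightens the subclass contract: `executor_key()` is now an abstract classmethod used when building the worker command line, the log-file helpers operate on a `job_id_with_index`, and `get_python_executable()` can be overridden (it defaults to `sys.executable`; the Kubernetes executor presumably overrides it based on the `python_executable` resource key). A minimal, hypothetical skeleton, assuming the base class is named `ClusterExecutor` and omitting the remaining abstract methods (e.g. `inner_submit`, `check_for_crashed_job`) for brevity:

```python
# Hypothetical skeleton, not part of this commit; other abstract methods are omitted,
# so this class is for illustration and cannot be instantiated as-is.
from cluster_tools.schedulers.cluster_executor import ClusterExecutor  # base class name assumed

class MySchedulerExecutor(ClusterExecutor):
    @classmethod
    def executor_key(cls):
        # Inserted into the worker command:
        #   <python> -m cluster_tools.remote <executor_key> <workerid> <cfut_dir>
        return "my_scheduler"

    @staticmethod
    def format_log_file_name(job_id_with_index, suffix=".stdout"):
        # One log file per (job id, array index) pair.
        return f"myscheduler.{job_id_with_index}.log{suffix}"

    def get_python_executable(self):
        # Override when the interpreter inside the job environment differs
        # from the submitting environment.
        return "python3"
```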
