#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import csv
import os.path
import shlex
import subprocess
import tempfile
from dataclasses import dataclass
from typing import Any, Dict, List, Mapping, Optional

from torchx.schedulers.api import AppDryRunInfo, DescribeAppResponse, Scheduler
from torchx.specs.api import (
    NONE,
    AppDef,
    AppState,
    Role,
    RunConfig,
    SchedulerBackend,
    macros,
)


SLURM_STATES: Mapping[str, AppState] = {
    "BOOT_FAIL": AppState.FAILED,
    "CANCELLED": AppState.CANCELLED,
    "COMPLETED": AppState.SUCCEEDED,
    "DEADLINE": AppState.FAILED,
    "FAILED": AppState.FAILED,
    "NODE_FAIL": AppState.FAILED,
    "OUT_OF_MEMORY": AppState.FAILED,
    "PENDING": AppState.PENDING,
    "PREEMPTED": AppState.FAILED,
    "RUNNING": AppState.RUNNING,
    "REQUEUED": AppState.PENDING,
    "RESIZING": AppState.PENDING,
    "REVOKED": AppState.FAILED,
    "SUSPENDED": AppState.PENDING,
    "TIMEOUT": AppState.FAILED,
}


def _slurm_escape(s: str) -> str:
    """
    _slurm_escape escapes the argument for the shell and substitutes
    macros.app_id with a shell expression that fills in SLURM_JOB_ID from the
    environment at runtime.
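
    For example (illustrative; the placeholder value comes from the imported
    ``macros``)::

        _slurm_escape(f"--output=slurm-{macros.app_id}.out")
        # -> --output=slurm-"$SLURM_JOB_ID".out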
| 51 | + """ |
| 52 | + escaped_parts = [shlex.quote(part) for part in s.split(macros.app_id)] |
| 53 | + return '"$SLURM_JOB_ID"'.join(escaped_parts) |
| 54 | + |
| 55 | + |
@dataclass
class SlurmReplicaRequest:
    """
    Holds parameters for a single replica running on slurm and can be
    materialized down to a bash script.
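
    As a rough sketch (the options and values below are made up; the real ones
    come from the role's resource and the run config), the materialized script
    looks like:

    .. code-block:: bash

        #!/bin/sh
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1024
        #SBATCH --export=FOO=bar

        # exit on error
        set -e

        srun --chdir=/tmp/my_app echo hello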
| 60 | + """ |
| 61 | + |
| 62 | + dir: str |
| 63 | + entrypoint: str |
| 64 | + args: List[str] |
| 65 | + opts: Dict[str, str] |
| 66 | + env: Dict[str, str] |
| 67 | + |
| 68 | + @classmethod |
| 69 | + def from_role(cls, role: Role, cfg: RunConfig) -> "SlurmReplicaRequest": |
| 70 | + opts = {k: str(v) for k, v in cfg.cfgs.items()} |
| 71 | + |
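        # Map the role's resource spec onto sbatch options; slurm's --mem is
        # interpreted in megabytes by default, which matches resource.memMB.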
        if (resource := role.resource) != NONE:
            if (cpu := resource.cpu) > 0:
                opts["cpus-per-task"] = str(cpu)
            if (memMB := resource.memMB) > 0:
                opts["mem"] = str(memMB)
            if (gpu := resource.gpu) > 0:
                opts["gpus-per-task"] = str(gpu)

        return cls(
            dir=role.image,
            entrypoint=role.entrypoint,
            args=list(role.args),
            opts=opts,
            env=dict(role.env),
        )

    def materialize(self) -> str:
        sbatch_opts = [f"#SBATCH --{key}={value}" for key, value in self.opts.items()]
        sbatch_opts += [
            f"#SBATCH --export={key}={value}" for key, value in self.env.items()
        ]
        sbatch_opts_str = "\n".join(sbatch_opts)

        escaped_args = [_slurm_escape(arg) for arg in self.args]

        return f"""#!/bin/sh
{sbatch_opts_str}

# exit on error
set -e

srun --chdir={self.dir} {self.entrypoint} {" ".join(escaped_args)}
"""


@dataclass
class SlurmBatchRequest:
    """
    Holds parameters used to launch a slurm job via sbatch.
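
    ``cmd`` holds the base sbatch invocation; ``schedule`` appends one
    generated script path per replica, separated by ``:`` so sbatch launches
    them as the components of a heterogeneous job. Roughly (paths are
    illustrative):

    .. code-block:: bash

        sbatch --parsable --job-name echo role-0-echo-0.sh : role-0-echo-1.sh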
| 111 | + """ |
| 112 | + |
| 113 | + cmd: List[str] |
| 114 | + replicas: Dict[str, SlurmReplicaRequest] |
| 115 | + |
| 116 | + |
class SlurmScheduler(Scheduler):
    """
    SlurmScheduler is a TorchX scheduling interface to slurm. TorchX expects
    that the slurm CLI tools are locally installed and that job accounting is
    enabled.

    Each app def is scheduled using a heterogeneous job via sbatch. Each
    replica of each role has a unique shell script generated with its resource
    allocations and args, and then sbatch is used to launch all of them
    together.

    Logs are written to the default slurm log file.

    Any scheduler options passed to it are added as SBATCH arguments to each
    replica.

    For more info see:

    * https://slurm.schedmd.com/sbatch.html
    * https://slurm.schedmd.com/heterogeneous_jobs.html

    .. code-block:: bash

        $ torchx run --scheduler slurm utils.echo --msg hello
        slurm://torchx_user/1234
        $ torchx status slurm://torchx_user/1234
        $ less slurm-1234.out
        ...
    """

    def __init__(self, session_name: str) -> None:
        super().__init__("slurm", session_name)

    def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
        req = dryrun_info.request
        with tempfile.TemporaryDirectory() as tmpdir:
            for i, (name, body) in enumerate(req.replicas.items()):
                path = os.path.join(tmpdir, name)
                with open(path, "w") as f:
                    f.write(body.materialize())

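                # separate the per-replica scripts with ":" so sbatch treats
                # them as the components of a single heterogeneous job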
                if i > 0:
                    req.cmd.append(":")
                req.cmd.append(path)

            p = subprocess.run(req.cmd, stdout=subprocess.PIPE, check=True)
            return p.stdout.decode("utf-8").strip()

    def _submit_dryrun(
        self, app: AppDef, cfg: RunConfig
    ) -> AppDryRunInfo[SlurmBatchRequest]:
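        # --parsable makes sbatch print just the job id (and optionally the
        # cluster name), which schedule() returns as the app id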
        cmd = ["sbatch", "--parsable", "--job-name", app.name]
        replicas = {}
        for i, role in enumerate(app.roles):
            for replica_id in range(role.num_replicas):
                values = macros.Values(
                    img_root=role.image,
                    app_id=macros.app_id,
                    replica_id=str(replica_id),
                )
                name = f"role-{i}-{role.name}-{replica_id}.sh"
                replica_role = values.apply(role)
                replicas[name] = SlurmReplicaRequest.from_role(replica_role, cfg)
        req = SlurmBatchRequest(
            cmd=cmd,
            replicas=replicas,
        )
        return AppDryRunInfo(req, repr)

    def _validate(self, app: AppDef, scheduler: SchedulerBackend) -> None:
        # Skip validation step for slurm
        pass

    def _cancel_existing(self, app_id: str) -> None:
        subprocess.run(["scancel", app_id], check=True)

    def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
        p = subprocess.run(
            ["sacct", "--parsable2", "-j", app_id], stdout=subprocess.PIPE, check=True
        )
        output = p.stdout.decode("utf-8").split("\n")
        if len(output) <= 1:
            return None

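        # sacct --parsable2 prints "|"-delimited rows with a header line, so
        # the output can be parsed as a pipe-delimited CSV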
        reader = csv.DictReader(output, delimiter="|")

        resp = DescribeAppResponse(
            app_id=app_id,
        )
        for row in reader:
            if row["JobID"] == app_id:
                state = row["State"]
                resp.msg = state
                state_enum = SLURM_STATES.get(state)
                assert (
                    state_enum
                ), f"failed to translate slurm state {state} to torchx state"
                resp.state = state_enum

        return resp


def create_scheduler(session_name: str, **kwargs: Any) -> SlurmScheduler:
    return SlurmScheduler(
        session_name=session_name,
    )