
Commit e324226

Version based GPU configuration and QoS addition
Summary: Slurm 24.11.0rc1 and later do not support GRES per task, so we need to pass `gpus-per-node` to sbatch to ensure failure-free allocation. See https://github.com/SchedMD/slurm/blob/master/CHANGELOG/slurm-24.11.md

Changes here:
1. Introduced a Slurm-version-based GPU request configuration.
2. Introduced an optional QoS parameter which can be used to control job priority.

See the usage sketch below.

Differential Revision: D78778304
1 parent 4adf7f6 commit e324226
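
A minimal usage sketch of the two new run options. This is illustrative only: it assumes this module's `create_scheduler` entry point and the standard `submit_dryrun` API from torchx, uses made-up resource numbers and paths, and needs a host where `sinfo` is available (the dryrun probes the partition and, with this change, the Slurm version).

    from torchx import specs
    from torchx.schedulers.slurm_scheduler import create_scheduler

    # Hypothetical single-role app; image path and resource numbers are made up.
    app = specs.AppDef(
        name="trainer",
        roles=[
            specs.Role(
                name="trainer",
                image="/fsx/code",
                entrypoint="train.py",
                num_replicas=2,
                resource=specs.Resource(cpu=16, gpu=8, memMB=64_000),
            )
        ],
    )

    scheduler = create_scheduler(session_name="demo")

    # "qos" is forwarded to sbatch as a job-level option; a "slurm_version"
    # newer than 24.11 switches the per-replica GPU request to gpus-per-node.
    cfg = {"partition": "gpu", "qos": "high", "slurm_version": "25.05"}
    dryrun_info = scheduler.submit_dryrun(app, cfg)
    print(dryrun_info.request)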

File tree

2 files changed: +392, -11 lines

torchx/schedulers/slurm_scheduler.py

Lines changed: 103 additions & 3 deletions
@@ -72,6 +72,45 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
     return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)


+def _parse_slurm_version(version_str: str) -> Tuple[int, int]:
+    """
+    Parse Slurm version string (e.g., '24.05', '25.11.2') into (major, minor) tuple.
+    Raises ValueError if parsing fails.
+    """
+    parts = version_str.split(".")
+    if len(parts) < 2:
+        raise ValueError(
+            f"Invalid Slurm version string: {version_str}. Expected format: '24.05' or '25.11.2'"
+        )
+
+    try:
+        major = int(parts[0])
+        minor = int(parts[1])
+    except (ValueError, IndexError) as err:
+        raise ValueError(
+            f"Invalid Slurm version string: {version_str}. Expected format: '24.05' or '25.11.2'"
+        ) from err
+
+    return (major, minor)
+
+
+def _should_use_gpus_per_node_from_version(version_str: Optional[str]) -> bool:
+    """
+    Determine whether to use gpus-per-node based on version string.
+    Returns True if version > 24.11, False otherwise or if version cannot be parsed.
+    """
+    if not version_str:
+        return False
+
+    try:
+        major, minor = _parse_slurm_version(version_str)
+    except ValueError:
+        return False
+
+    # Use gpus-per-node if version > 24.11
+    return major > 24 or (major == 24 and minor > 11)
+
+
 SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
@@ -81,6 +120,7 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
     "partition",
     "time",
     "constraint",
+    "qos",
 }

 log: logging.Logger = logging.getLogger(__name__)
@@ -106,6 +146,8 @@ def _apply_app_id_env(s: str) -> str:
         "mail-user": Optional[str],
         "mail-type": Optional[str],
         "job_dir": Optional[str],
+        "qos": Optional[str],
+        "slurm_version": Optional[str],
     },
     total=False,
 )
@@ -126,7 +168,11 @@ class SlurmReplicaRequest:

     @classmethod
     def from_role(
-        cls, name: str, role: Role, cfg: SlurmOpts, nomem: bool
+        cls,
+        name: str,
+        role: Role,
+        cfg: SlurmOpts,
+        nomem: bool,
     ) -> "SlurmReplicaRequest":
         """
         ``from_role`` creates a SlurmReplicaRequest for the specific role and
@@ -149,7 +195,12 @@ def from_role(
         if not nomem and resource.memMB > 0:
             sbatch_opts.setdefault("mem", str(resource.memMB))
         if resource.gpu > 0:
-            sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
+            # Use smart GPU allocation based on Slurm version from config
+            slurm_version = cfg.get("slurm_version")
+            if _should_use_gpus_per_node_from_version(slurm_version):
+                sbatch_opts.setdefault("gpus-per-node", str(resource.gpu))
+            else:
+                sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))

         srun_opts = {
             "output": f"slurm-{macros.app_id}-{name}.out",
@@ -378,6 +429,18 @@ def _run_opts(self) -> runopts:
             iteration, jobs will be tracked in ``.torchxslurmjobdirs``.
             """,
         )
+        opts.add(
+            "qos",
+            type_=str,
+            help="Quality of Service (QoS) to assign to the job.",
+        )
+        opts.add(
+            "slurm_version",
+            type_=str,
+            help="""Slurm version (e.g., '24.05', '25.11'). If version > 24.11,
+            uses gpus-per-node instead of gpus-per-task for GPU allocation.
+            """,
+        )
         return opts

     def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
@@ -401,6 +464,37 @@ def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:

         return job_id

+    def _get_slurm_version(self) -> str:
+        """
+        _get_slurm_version returns the Slurm version string (e.g., "24.05").
+        Raises ValueError if version cannot be determined.
+        """
+        try:
+            p = subprocess.run(
+                ["sinfo", "--version"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+        except FileNotFoundError as e:
+            raise ValueError("Slurm is not available (sinfo command not found)") from e
+
+        if p.returncode != 0:
+            raise ValueError(
+                f"Failed to get Slurm version: {p.stderr.decode('utf-8').strip()}"
+            )
+
+        output = p.stdout.decode("utf-8").strip().lower()
+        if not output.startswith("slurm "):
+            raise ValueError(f"Unexpected sinfo --version output format: {output}")
+
+        # Remove "slurm " prefix and extract version (e.g., "24.05.4" -> "24.05")
+        version_full = output.replace("slurm", "").strip()
+        version_parts = version_full.split(".")
+        if len(version_parts) < 2:
+            raise ValueError(f"Invalid Slurm version format: {version_full}")
+
+        return f"{version_parts[0]}.{version_parts[1]}"
+
     def _partition_memmb(self, partition: Optional[str]) -> Optional[int]:
         """
         _partition_memmb returns the memory allocation for the given partition
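
The string handling above reduces a typical `sinfo --version` line to a major.minor pair, e.g.:

    # sinfo --version typically prints something like "slurm 24.05.4"
    output = "slurm 24.05.4".strip().lower()
    assert output.startswith("slurm ")
    version_full = output.replace("slurm", "").strip()   # "24.05.4"
    version_parts = version_full.split(".")
    print(f"{version_parts[0]}.{version_parts[1]}")      # -> 24.05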
@@ -441,6 +535,12 @@ def _submit_dryrun(
         partition = cfg.get("partition")
         assert partition is None or isinstance(partition, str), "partition must be str"

+        # Create a new config with the resolved slurm version
+        resolved_cfg = cfg.copy()
+        resolved_cfg["slurm_version"] = cfg.get(
+            "slurm_version", self._get_slurm_version()
+        )
+
         # check if the partition has at least 1GB memory, if we're not sure,
         # default to using memory allocations
         memmb = self._partition_memmb(partition)
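
The resolution order here is plain dict.get semantics: an explicit slurm_version in the run config wins; otherwise the value reported by sinfo --version is used. A small sketch, with "25.05" standing in for the auto-detected value:

    # Explicit value in the run config wins over the auto-detected one:
    cfg = {"slurm_version": "24.05"}
    assert cfg.get("slurm_version", "25.05") == "24.05"

    # No value configured: fall back to what sinfo --version reported:
    cfg = {}
    assert cfg.get("slurm_version", "25.05") == "25.05"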
@@ -460,7 +560,7 @@ def _submit_dryrun(
             replicas[name] = SlurmReplicaRequest.from_role(
                 name,
                 replica_role,
-                cfg,
+                resolved_cfg,
                 nomem=nomem,
             )
         cmd = ["sbatch", "--parsable"]
