
Commit 1c90468

Merge pull request #9 from compspec/add-features
feat: support for job manager features;
2 parents 8be8e1a + b1ae51a commit 1c90468

10 files changed: +141 -51 lines


fractale/transformer/base.py

Lines changed: 2 additions & 2 deletions
@@ -78,13 +78,13 @@ def add(self, name: str, value=None):
 
         # Determine if it's a short (-n) or long (--tasks) option
         prefix = "-" if len(name) == 1 else "--"
-        self.script_lines.append(f"{self.directive}: {prefix}{name}={value}")
+        self.script_lines.append(f"{self.directive} {prefix}{name}={value}")
 
     def add_flag(self, name: str):
         """
        Add a boolean flag (e.g., --exclusive).
         """
-        self.script_lines.append(f"{self.directive}: --{name}")
+        self.script_lines.append(f"{self.directive} --{name}")
 
     def render(self) -> str:
         """

fractale/transformer/cobalt/transform.py

Lines changed: 11 additions & 4 deletions
@@ -154,9 +154,12 @@ def convert(self, spec):
             )
             qsub_cmd.extend(["--dependencies", dep_str])
 
-        # Node constraints are handled by --attrs
-        if spec.constraints:
-            qsub_cmd.extend(["--attrs", ":".join(spec.constraints)])
+        # Node constraints and GPU type are handled by --attrs
+        attrs = list(spec.constraints)
+        if spec.gpu_type:
+            attrs.append(f"gpu_type={spec.gpu_type}")
+        if attrs:
+            qsub_cmd.extend(["--attrs", ":".join(attrs)])
 
         # -O sets the prefix for output/error files, which is derived from the job name.
         qsub_cmd.extend(["-O", job_name])
@@ -284,7 +287,11 @@ def _parse(self, content, return_unhandled=False):
             elif key == "dependencies":
                 spec.depends_on = value.split(":")
             elif key == "attrs":
-                spec.constraints = value.split(":")
+                for attr in value.split(":"):
+                    if attr.startswith("gpu_type="):
+                        spec.gpu_type = attr.split("=", 1)[1]
+                    else:
+                        spec.constraints.append(attr)
             elif key == "M":
                 spec.mail_user = value
             elif key == "notify" and value == "user":
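
Taken together, the two hunks make the Cobalt --attrs handling round-trip: convert folds gpu_type into the colon-delimited attribute string, and _parse splits it back out. A standalone sketch of that encoding and decoding, with assumed example values:

    constraints, gpu_type = ["ssd"], "a100"          # assumed example values
    attrs = list(constraints)
    if gpu_type:
        attrs.append(f"gpu_type={gpu_type}")
    encoded = ":".join(attrs)                        # "ssd:gpu_type=a100"

    # Parsing reverses the encoding
    parsed_constraints, parsed_gpu_type = [], None
    for attr in encoded.split(":"):
        if attr.startswith("gpu_type="):
            parsed_gpu_type = attr.split("=", 1)[1]  # "a100"
        else:
            parsed_constraints.append(attr)          # ["ssd"]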

fractale/transformer/common.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,3 @@
-import sys
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
@@ -32,6 +31,7 @@ class JobSpec:
     cpus_per_task: int = 1
     mem_per_task: Optional[str] = None
     gpus_per_task: int = 0
+    gpu_type: Optional[str] = None
 
     # Scheduling and Constraints
     wall_time: Optional[int] = None
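
With the new field, a caller can request a specific accelerator model alongside a GPU count. A minimal sketch of the relevant JobSpec subset (other fields omitted; defaults as shown in the diff):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class JobSpec:
        cpus_per_task: int = 1
        mem_per_task: Optional[str] = None
        gpus_per_task: int = 0
        gpu_type: Optional[str] = None  # e.g. "a100"; None means any GPU

    spec = JobSpec(gpus_per_task=2, gpu_type="a100")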

fractale/transformer/flux/transform.py

Lines changed: 12 additions & 11 deletions
@@ -1,3 +1,7 @@
+import re
+import shlex
+from typing import Optional
+
 from fractale.logger.generate import JobNamer
 from fractale.transformer.base import Script, TransformerBase
 from fractale.transformer.flux.validate import Validator
@@ -15,22 +19,15 @@ def priority_to_flux_priority(class_name):
     numerical priority value. This is the reverse of map_numeric_priority_to_class_name.
     """
     # Define the mapping from the string class back to a representative number.
-    mapping = {
-        "low": 15,
-        "normal": 16,
-        "high": 50,
-        "urgent": 100,
-    }
+    mapping = {"low": 15, "normal": 16, "high": 50, "urgent": 100}
+
     # If we don't get it, default to Flux's default
     return mapping.get(class_name, 16)
 
 
 class FluxTransformer(TransformerBase):
     """
-    A Flux Transformer is a very manual way to transform a subsystem into
-    a batch script. I am not even using jinja templates, I'm just
-    parsing the subsystems in a sort of manual way. This a filler,
-    and assuming that we will have an LLM that can replace this.
+    A Flux Transformer for converting a generic JobSpec into a Flux batch script.
     """
 
     def _parse(self, jobspec, return_unhandled=False):
@@ -97,6 +94,10 @@ def convert(self, spec):
         script.add("c", spec.cpus_per_task if spec.cpus_per_task > 1 else None)
         script.add("gpus-per-task", spec.gpus_per_task if spec.gpus_per_task > 0 else None)
 
+        # Add a constraint for the specific GPU type, if provided
+        # We could probably add gpu_type to requires if an admin configures it,
+        # but it's too risky.
+
         # Scheduling Directives
         if spec.exclusive_access:
             script.add_flag("exclusive")
@@ -106,7 +107,7 @@
             script.add("t", spec.wall_time)
 
         flux_prio = priority_to_flux_priority(spec.priority)
-        if flux_prio != 0:
+        if flux_prio != 16:
             script.add("urgency", flux_prio)
         script.newline()
 
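
The urgency change matters because priority_to_flux_priority falls back to 16, Flux's default, so the old != 0 test emitted an urgency directive for every job. A sketch of the corrected behavior; rendering the directive as "# flux:" is an assumption about how Script formats output:

    def priority_to_flux_priority(class_name):
        mapping = {"low": 15, "normal": 16, "high": 50, "urgent": 100}
        # If we don't get it, default to Flux's default
        return mapping.get(class_name, 16)

    for priority in ("normal", "urgent"):
        flux_prio = priority_to_flux_priority(priority)
        if flux_prio != 16:  # only non-default urgencies produce a directive
            print(f"# flux: --urgency={flux_prio}")  # printed for "urgent" only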

fractale/transformer/kubernetes/transform.py

Lines changed: 12 additions & 4 deletions
@@ -10,6 +10,7 @@
 
 # Assume GPUs are NVIDIA
 gpu_resource_name = "nvidia.com/gpu"
+gpu_product_label = "nvidia.com/gpu.product"
 
 
 def normalize_cpu_request(cpus: int) -> str:
@@ -37,7 +38,7 @@ def normalize_memory_request(mem_str):
     return mem_str
 
 
-def parse_memory(self, mem_str: str) -> str:
+def parse_memory(mem_str: str) -> str:
     """
     Converts K8s memory (e.g., 1Gi) to JobSpec format (e.g., 1G).
     """
@@ -53,7 +54,7 @@ def parse_memory(self, mem_str: str) -> str:
     return mem_str
 
 
-def parse_cpu(self, cpu_str: str) -> int:
+def parse_cpu(cpu_str: str) -> int:
     """
     Converts K8s CPU string to an integer. Assumes no millicores.
     """
@@ -125,12 +126,14 @@ def convert(self, spec):
         if spec.environment:
             container["env"] = [{"name": k, "value": v} for k, v in spec.environment.items()]
 
+        # This is the spec for the pod template
+        template_pod_spec = {"containers": [container], "restartPolicy": "Never"}
         pod_spec = {
             "apiVersion": "batch/v1",
             "kind": "Job",
             "metadata": {"name": job_name},
             "spec": {
-                "template": {"spec": {"containers": [container], "restartPolicy": "Never"}},
+                "template": {"spec": template_pod_spec},
                 "backoffLimit": 0,
             },
         }
@@ -158,7 +161,7 @@ def convert(self, spec):
         job_spec = {
             "parallelism": spec.num_nodes,
             "completions": spec.num_nodes,
-            "backoffLimit": 4,  # A sensible default
+            "backoffLimit": 4,
             "template": {"metadata": {"labels": {"job-name": spec.job_name}}, "spec": pod_spec},
         }
 
@@ -241,6 +244,11 @@ def parse(self, job_manifest):
         if cpu_val == 1:
             spec.cpus_per_task = 1
 
+        # GPU Type from Node Selector
+        node_selector = pod_spec.get("nodeSelector", {})
+        if gpu_product_label in node_selector:
+            spec.gpu_type = node_selector[gpu_product_label]
+
         # Scheduling
         if pod_spec.get("priorityClassName"):
            try:
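
The Kubernetes side expresses gpu_type through a node selector rather than a resource request; nvidia.com/gpu.product is the label NVIDIA GPU feature discovery typically applies, though its value strings vary by cluster. A sketch of the parse direction, with an assumed manifest fragment:

    gpu_product_label = "nvidia.com/gpu.product"
    pod_spec = {  # hypothetical pod template spec from a Job manifest
        "containers": [{"name": "task", "image": "busybox"}],
        "nodeSelector": {gpu_product_label: "NVIDIA-A100-SXM4-40GB"},
    }
    node_selector = pod_spec.get("nodeSelector", {})
    if gpu_product_label in node_selector:
        gpu_type = node_selector[gpu_product_label]  # "NVIDIA-A100-SXM4-40GB"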

fractale/transformer/lsf/transform.py

Lines changed: 19 additions & 3 deletions
@@ -160,8 +160,15 @@ def convert(self, spec):
 
         # Build the complex -R "select[...] span[...] rusage[...]" string
         r_parts = []
-        if spec.constraints:
-            r_parts.append(f'select[{":".join(spec.constraints)}]')
+
+        # Handle select criteria, including GPU type
+        select_criteria = list(spec.constraints)
+
+        # I'm not sure this would actually work
+        if spec.gpu_type:
+            select_criteria.append(spec.gpu_type)
+        if select_criteria:
+            r_parts.append(f'select[{":".join(select_criteria)}]')
 
         if spec.num_nodes > 1 and spec.num_tasks > 0:
             tasks_per_node = spec.num_tasks // spec.num_nodes
@@ -252,6 +259,9 @@ def _parse(self, content, return_unhandled=False):
         command_lines = []
         not_handled = set()
 
+        # Heuristic list of common GPU names to identify as gpu_type
+        known_gpu_types = {"a100", "v100", "h100", "a30", "a40", "mi250"}
+
         for line in content.splitlines():
             if not line.strip():
                 continue
@@ -314,7 +324,13 @@ def _parse(self, content, return_unhandled=False):
                 if spec.num_tasks > 0 and tasks_per_node > 0:
                     spec.num_nodes = spec.num_tasks // tasks_per_node
                 if select_match:
-                    spec.constraints.extend(select_match.group(1).split(":"))
+                    criteria = select_match.group(1).split(":")
+                    for criterion in criteria:
+                        # If a criterion is a known GPU type, set it and move on
+                        if criterion.lower() in known_gpu_types:
+                            spec.gpu_type = criterion
+                        else:
+                            spec.constraints.append(criterion)
             else:
                 not_handled.add(key)
                 continue
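
Because LSF's select[...] string carries bare tokens, the parser cannot tell a GPU model apart from any other constraint, hence the heuristic set of known names. A sketch of both directions with assumed values; note the commit itself flags the convert side as untested:

    known_gpu_types = {"a100", "v100", "h100", "a30", "a40", "mi250"}

    # Convert: fold the GPU type into the select criteria
    select_criteria = ["mem512", "v100"]               # constraints + gpu_type
    r_string = f'select[{":".join(select_criteria)}]'  # select[mem512:v100]

    # Parse: anything matching a known GPU name becomes gpu_type
    gpu_type, constraints = None, []
    for criterion in "mem512:v100".split(":"):
        if criterion.lower() in known_gpu_types:
            gpu_type = criterion                       # "v100"
        else:
            constraints.append(criterion)              # ["mem512"]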

fractale/transformer/moab/transform.py

Lines changed: 27 additions & 5 deletions
@@ -1,6 +1,6 @@
 import re
 import shlex
-from datetime import timedelta
+from datetime import datetime, timedelta
 
 import fractale.utils as utils
 from fractale.logger.generate import JobNamer
@@ -169,8 +169,19 @@ def convert(self, spec) -> str:
 
         # Resource Requests
         resource_parts = []
-        if spec.num_nodes and spec.cpus_per_task:
-            resource_parts.append(f"nodes={spec.num_nodes}:ppn={spec.cpus_per_task}")
+        node_spec = []
+        if spec.num_nodes:
+            node_spec.append(f"nodes={spec.num_nodes}")
+        if spec.cpus_per_task:
+            node_spec.append(f"ppn={spec.cpus_per_task}")
+        if spec.gpus_per_task > 0:
+            node_spec.append(f"gpus={spec.gpus_per_task}")
+        if spec.gpu_type:
+            # Add gpu type as a feature request
+            node_spec.append(spec.gpu_type)
+
+        if node_spec:
+            resource_parts.append(":".join(node_spec))
 
         if spec.generic_resources:
             resource_parts.append(f"gres={spec.generic_resources}")
@@ -307,15 +318,26 @@ def _parse(self, filename, return_unhandled=False):
                 for part in shlex.split(full_l_string):
 
                     # Split combined node:ppn requests first
-                    if "nodes" in part and ":" in part:
+                    if ":" in part:
+                        node_features = []
                         for subpart in part.split(":"):
                             if "=" not in subpart:
+                                # This is a feature request, like "gtx1080"
+                                node_features.append(subpart)
                                 continue
+
                             k, v = subpart.split("=", 1)
                             if k == "nodes":
                                 spec.num_nodes = int(v)
                             elif k == "ppn":
                                 spec.cpus_per_task = int(v)
+                            elif k == "gpus":
+                                spec.gpus_per_task = int(v)
+
+                        # Heuristic: If we found GPUs and other features, assume the first
+                        # other feature is the gpu_type.
+                        if spec.gpus_per_task > 0 and node_features:
+                            spec.gpu_type = node_features[0]
 
                     elif "=" in part:
                         k, v = part.split("=", 1)
@@ -329,7 +351,7 @@ def _parse(self, filename, return_unhandled=False):
                         spec.num_tasks = int(v)
                     elif k == "mem":
                         spec.mem_per_task = v
-                    elif k == "gres" or k == "gpus":
+                    elif k == "gres":
                         spec.generic_resources = v
                     elif k == "depend":
                         spec.depends_on = v
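
In TORQUE/Moab -l syntax, a bare token inside a nodes=... request is a node feature, so the GPU model rides along unlabeled and the parser has to guess; the commit assumes the first feature seen next to a gpus= count is the model. A sketch with assumed values:

    # Convert side builds: nodes=2:ppn=8:gpus=1:gtx1080
    part = ":".join(["nodes=2", "ppn=8", "gpus=1", "gtx1080"])

    # Parse side: labeled subparts set counts, bare ones collect as features
    num_nodes = cpus = gpus = 0
    node_features = []
    for subpart in part.split(":"):
        if "=" not in subpart:
            node_features.append(subpart)
            continue
        k, v = subpart.split("=", 1)
        if k == "nodes":
            num_nodes = int(v)
        elif k == "ppn":
            cpus = int(v)
        elif k == "gpus":
            gpus = int(v)
    gpu_type = node_features[0] if gpus > 0 and node_features else None  # "gtx1080"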

fractale/transformer/oar/transform.py

Lines changed: 6 additions & 0 deletions
@@ -182,6 +182,10 @@ def convert(self, spec):
             # This requests nodes that *each* have at least this many GPUs.
             l_parts.append(f"/gpunum={spec.gpus_per_task}")
 
+        # Add the specific GPU type as a resource property
+        if spec.gpu_type:
+            l_parts.append(f"/gpu_model='{spec.gpu_type}'")
+
         resource_str = "".join(l_parts)
 
         # Node constraints are added as properties to the resource string.
@@ -311,6 +315,8 @@ def _parse(self, content, return_unhandled=False):
                 spec.cpus_per_task = int(v)
             elif k == "gpunum":
                 spec.gpus_per_task = int(v)
+            elif k == "gpu_model":
+                spec.gpu_type = v.strip("'")
             else:
                 # Assume parts without '=' are constraints
                 spec.constraints.append(part.strip().strip("'"))

fractale/transformer/pbs/transform.py

Lines changed: 18 additions & 11 deletions
@@ -148,16 +148,21 @@ def convert(self, spec):
         # Resource Selection (-l)
         select_parts = []
         if spec.num_nodes > 0:
-            select_parts.append(f"select={spec.num_nodes}")
-
-        # mpiprocs is often used to specify total tasks, which works well with our spec
-        if spec.num_tasks > 1:
-            select_parts.append(f"mpiprocs={spec.num_tasks}")
+            # Build the select statement parts
+            node_spec = [f"select={spec.num_nodes}"]
+            if spec.cpus_per_task > 1:
+                node_spec.append(f"ncpus={spec.cpus_per_task}")
+            if spec.gpus_per_task > 0:
+                node_spec.append(f"ngpus={spec.gpus_per_task}")
+            # I am not clear difference between gpus and accelerators
+            # but this seems supported - would need to test
+            if spec.gpu_type:
+                node_spec.append(f"accelerator_type={spec.gpu_type}")
+            # mpiprocs is often used to specify total tasks, which works well with our spec
+            if spec.num_tasks > 1:
+                node_spec.append(f"mpiprocs={spec.num_tasks}")
 
-        if spec.cpus_per_task > 1:
-            select_parts.append(f"ncpus={spec.cpus_per_task}")
-        if spec.gpus_per_task > 0:
-            select_parts.append(f"ngpus={spec.gpus_per_task}")
+            select_parts.append(":".join(node_spec))
 
         # PBS memory format often includes units like gb or mb
         if spec.mem_per_task:
@@ -167,7 +172,7 @@ def convert(self, spec):
                 mem_val += "b"
             select_parts.append(f"mem={mem_val}")
 
-        resource_str = ":".join(select_parts)
+        resource_str = ",".join(select_parts)
 
         wt = seconds_to_pbs(spec.wall_time)
         if wt:
@@ -289,7 +294,7 @@ def _parse(self, content, return_unhandled=False):
             if k == "walltime":
                 spec.wall_time = pbs_time_to_seconds(v)
             elif k == "select":
-                # select=N:ncpus=C:mpiprocs=T...
+                # select=N:ncpus=C:mpiprocs=T:gpu_type=a100...
                 select_parts = v.split(":")
                 spec.num_nodes = int(select_parts[0])
                 for sp in select_parts[1:]:
@@ -298,6 +303,8 @@ def _parse(self, content, return_unhandled=False):
                         spec.cpus_per_task = int(sv)
                     elif sk == "ngpus":
                         spec.gpus_per_task = int(sv)
+                    elif sk == "gpu_type":
+                        spec.gpu_type = sv
                     elif sk == "mem":
                         spec.mem_per_task = sv.upper().replace("B", "")
                     elif sk == "mpiprocs":
