
Commit e95df71

hemildesai, ryantwolf, and github-advanced-security[bot] authored
Add SlurmRay launcher and transform API for launchers (#159)
* Add SlurmRay launcher and transform API for launchers
  Signed-off-by: Hemil Desai <[email protected]>

* Modify Slurm Launcher to allow arbitrary scripts (#163)
  * Alter launcher script
    Signed-off-by: Ryan Wolf <[email protected]>
  * Refactor to allow arbitrary template
    Signed-off-by: Ryan Wolf <[email protected]>
  * Add custom slurm ray launcher
    Signed-off-by: Ryan Wolf <[email protected]>
  * Remove duplicate script
    Signed-off-by: Ryan Wolf <[email protected]>
  * Fix command being run
    Signed-off-by: Ryan Wolf <[email protected]>
  * Remove curator specific references
    Signed-off-by: Ryan Wolf <[email protected]>
  * Rename template
    Signed-off-by: Ryan Wolf <[email protected]>
  ---------
  Signed-off-by: Ryan Wolf <[email protected]>
  Signed-off-by: Hemil Desai <[email protected]>

* fix spelling action
  Signed-off-by: Hemil Desai <[email protected]>

* fixes
  Signed-off-by: Hemil Desai <[email protected]>

* Potential fix for code scanning alert no. 240: Jinja2 templating with autoescape=False
  Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
  Signed-off-by: Hemil Desai <[email protected]>

* fix
  Signed-off-by: Hemil Desai <[email protected]>

---------

Signed-off-by: Hemil Desai <[email protected]>
Signed-off-by: Ryan Wolf <[email protected]>
Co-authored-by: Ryan Wolf <[email protected]>
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
1 parent 4d05653 commit e95df71

File tree: 13 files changed, +748 −87 lines
.github/workflows/config/typos.toml

Lines changed: 4 additions & 0 deletions

@@ -4,3 +4,7 @@ extend-exclude = [
     "test/",
 ]
 ignore-hidden = false
+
+
+[default.extend-words]
+typ = "typ"

.github/workflows/spelling.yml

Lines changed: 1 addition & 1 deletion

@@ -15,4 +15,4 @@ jobs:
       - uses: crate-ci/typos@master
         with:
           files: .
-          config: ./.github/workflows/config/typos.yml
+          config: ./.github/workflows/config/typos.toml

src/nemo_run/__init__.py

Lines changed: 3 additions & 2 deletions

@@ -19,12 +19,11 @@
 from nemo_run.core.execution.base import (
     Executor,
     ExecutorMacros,
-    FaultTolerance,
-    Torchrun,
     import_executor,
 )
 from nemo_run.core.execution.dgxcloud import DGXCloudExecutor
 from nemo_run.core.execution.docker import DockerExecutor
+from nemo_run.core.execution.launcher import FaultTolerance, SlurmRay, SlurmTemplate, Torchrun
 from nemo_run.core.execution.local import LocalExecutor
 from nemo_run.core.execution.skypilot import SkypilotExecutor
 from nemo_run.core.execution.slurm import SlurmExecutor
@@ -69,6 +68,8 @@
     "SlurmExecutor",
     "SSHTunnel",
     "Torchrun",
+    "SlurmRay",
+    "SlurmTemplate",
 ]

 try:

src/nemo_run/core/execution/base.py

Lines changed: 5 additions & 52 deletions

@@ -18,67 +18,17 @@
 import os
 from dataclasses import asdict, dataclass, field
 from string import Template
-from typing import Optional, Protocol, Type, Union, runtime_checkable
+from typing import Optional, Protocol, Union, runtime_checkable

 import fiddle as fdl
 from torchx.specs import Role
 from typing_extensions import Self

 from nemo_run.config import NEMORUN_HOME, ConfigurableMixin
+from nemo_run.core.execution.launcher import LAUNCHER_MAP, Launcher
 from nemo_run.core.packaging.base import Packager


-@dataclass(kw_only=True)
-class Launcher(ConfigurableMixin):
-    nsys_profile: bool = False
-    nsys_folder: str = "nsys_profile"
-    nsys_trace: list[str] = field(default_factory=lambda: ["nvtx", "cuda"])
-
-    def get_nsys_prefix(self, profile_dir: str) -> Optional[list[str]]:
-        """Make a command prefix for nsys profiling"""
-        if self.nsys_profile:
-            profile_out_path = os.path.join(profile_dir, self.nsys_folder)
-            args = [
-                "profile",
-                "-s",
-                "none",
-                "-t",
-                ",".join(self.nsys_trace),
-                "-o",
-                f"{profile_out_path}/profile_%p",
-                "--force-overwrite",
-                "true",
-                "--capture-range=cudaProfilerApi",
-                "--capture-range-end=stop",
-                "--cuda-graph-trace=node",
-            ]
-            return args
-
-
-@dataclass(kw_only=True)
-class Torchrun(Launcher):
-    rdzv_backend: str = "c10d"
-    rdzv_port: int = 29500
-
-
-@dataclass(kw_only=True)
-class FaultTolerance(Launcher):
-    cfg_path: str = ""
-    finished_flag_file: str = ""
-    job_results_file: str = ""
-    rdzv_backend: str = "c10d"
-    rdzv_port: int = 29500
-    workload_check_interval: Optional[float] = None
-    initial_rank_heartbeat_timeout: Optional[float] = None
-    rank_heartbeat_timeout: Optional[float] = None
-    rank_termination_signal: Optional[str] = None
-    log_level: Optional[str] = None
-    max_restarts: Optional[int] = None
-
-
-LAUNCHER_MAP: dict[str, Type[Launcher]] = {"torchrun": Torchrun, "ft": FaultTolerance}
-
-
 @dataclass(kw_only=True)
 class ExecutorMacros(ConfigurableMixin):
     """
@@ -215,6 +165,9 @@ def get_launcher_prefix(self) -> Optional[list[str]]:
             os.makedirs(os.path.join(self.job_dir, launcher.nsys_folder), exist_ok=True)
             return launcher.get_nsys_prefix(profile_dir=self.job_dir)

+    def supports_launcher_transform(self) -> bool:
+        return False
+
     def package_configs(self, *cfgs: tuple[str, str]) -> list[str]:
         filenames = []
         basepath = os.path.join(self.job_dir, "configs")
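For reference, the nsys prefix logic relocated by this hunk is exercised as in the minimal sketch below, assuming the launcher classes are imported from the new nemo_run.core.execution.launcher module shown in the next file. The profile directory path is made up for illustration.

from nemo_run.core.execution.launcher import Torchrun

# Enable nsys profiling on any Launcher subclass; the prefix is only
# produced when nsys_profile is True.
launcher = Torchrun(nsys_profile=True, nsys_trace=["nvtx", "cuda"])
prefix = launcher.get_nsys_prefix(profile_dir="/tmp/job")
# prefix -> ["profile", "-s", "none", "-t", "nvtx,cuda", "-o",
#            "/tmp/job/nsys_profile/profile_%p", "--force-overwrite", "true",
#            "--capture-range=cudaProfilerApi", "--capture-range-end=stop",
#            "--cuda-graph-trace=node"]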
src/nemo_run/core/execution/launcher.py

Lines changed: 184 additions & 0 deletions

@@ -0,0 +1,184 @@
+import os
+import pathlib
+from dataclasses import dataclass, field
+from typing import Optional, Type
+
+import jinja2
+
+from nemo_run.config import ConfigurableMixin, Script
+from nemo_run.core.execution.utils import fill_template
+
+
+@dataclass(kw_only=True)
+class Launcher(ConfigurableMixin):
+    nsys_profile: bool = False
+    nsys_folder: str = "nsys_profile"
+    nsys_trace: list[str] = field(default_factory=lambda: ["nvtx", "cuda"])
+
+    def get_nsys_prefix(self, profile_dir: str) -> Optional[list[str]]:
+        """Make a command prefix for nsys profiling"""
+        if self.nsys_profile:
+            profile_out_path = os.path.join(profile_dir, self.nsys_folder)
+            args = [
+                "profile",
+                "-s",
+                "none",
+                "-t",
+                ",".join(self.nsys_trace),
+                "-o",
+                f"{profile_out_path}/profile_%p",
+                "--force-overwrite",
+                "true",
+                "--capture-range=cudaProfilerApi",
+                "--capture-range-end=stop",
+                "--cuda-graph-trace=node",
+            ]
+            return args
+
+    def transform(self, cmd: list[str]) -> Optional[Script]: ...
+
+
+@dataclass(kw_only=True)
+class Torchrun(Launcher):
+    rdzv_backend: str = "c10d"
+    rdzv_port: int = 29500
+
+
+@dataclass(kw_only=True)
+class FaultTolerance(Launcher):
+    cfg_path: str = ""
+    finished_flag_file: str = ""
+    job_results_file: str = ""
+    rdzv_backend: str = "c10d"
+    rdzv_port: int = 29500
+    workload_check_interval: Optional[float] = None
+    initial_rank_heartbeat_timeout: Optional[float] = None
+    rank_heartbeat_timeout: Optional[float] = None
+    rank_termination_signal: Optional[str] = None
+    log_level: Optional[str] = None
+    max_restarts: Optional[int] = None
+
+
+@dataclass(kw_only=True)
+class SlurmTemplate(Launcher):
+    """
+    A generic launcher that uses Jinja2 templates to wrap commands.
+    The template can be provided either as inline content or as a path to a template file.
+    """
+
+    template_path: Optional[str] = None
+    template_inline: Optional[str] = None
+    template_vars: dict = field(default_factory=dict)
+
+    def __post_init__(self):
+        # Ensure at least one template source is provided
+        if not self.template_path and not self.template_inline:
+            raise ValueError("Either template_path or template_inline must be provided")
+
+    def get_template_content(self) -> str:
+        """
+        Get the template content either from the file or inline content.
+        """
+        if self.template_inline:
+            return self.template_inline
+
+        if self.template_path:
+            # Check if the path is absolute
+            path = pathlib.Path(self.template_path)
+            if path.is_absolute():
+                # Read the template from the absolute path
+                with open(path, "r") as f:
+                    return f.read()
+            else:
+                # Use the template from the templates directory
+                template_dir = os.path.join(os.path.dirname(__file__), "templates")
+                template_path = os.path.join(template_dir, self.template_path)
+                if os.path.exists(template_path):
+                    with open(template_path, "r") as f:
+                        return f.read()
+                else:
+                    raise FileNotFoundError(f'Template "{self.template_path}" does not exist.')
+
+        # This should not happen due to the check in __post_init__
+        raise ValueError("No template available")
+
+    def render_template(self, cmd: list[str]) -> str:
+        """
+        Render the template with the command and additional variables.
+        """
+        # If using a template file from the templates directory
+        if self.template_path and not os.path.isabs(self.template_path):
+            # Create variables dictionary with command and additional variables
+            vars_dict = {"command": " ".join(cmd), **self.template_vars}
+            # Use the project's template rendering utility
+            return fill_template(self.template_path, vars_dict)
+
+        # If using inline template or absolute path template
+        template_content = self.get_template_content()
+        env = jinja2.Environment(autoescape=jinja2.select_autoescape(["html", "xml"]))
+        template = env.from_string(template_content)
+
+        # Create variables dictionary with command and additional variables
+        vars_dict = {"command": " ".join(cmd), **self.template_vars}
+
+        # Render the template
+        return template.render(**vars_dict)
+
+    def transform(self, cmd: list[str]) -> Optional[Script]:
+        """
+        Transform the command using the template.
+        """
+        rendered_script = self.render_template(cmd)
+        return Script(inline=rendered_script)
+
+
+@dataclass(kw_only=True)
+class SlurmRay(SlurmTemplate):
+    """
+    Transforms a provided cmd into a Ray launcher bash script for SlurmExecutor.
+    The Ray launcher script sets up a Ray cluster on Slurm nodes, with the head node starting Ray head
+    and executing the provided command. Worker nodes start Ray and wait.
+    """
+
+    gcs_server_port: int = 6379
+    dashboard_port: int = 8265
+    object_manager_port: int = 8076
+    node_manager_port: int = 8077
+    dashboard_agent_port: int = 52365
+    dashboard_agent_grpc_port: int = 52366
+    metrics_port: int = 9002
+    display_nvidia_smi_output: bool = False
+    head_setup: Optional[str] = None
+    head_init_wait_time: int = 10
+    worker_init_wait_time: int = 60
+    env_vars: Optional[dict] = None
+
+    def __post_init__(self):
+        # Set the template path to the Ray template
+        self.template_path = "slurm_ray.sh.j2"
+        # Fill in the template variables
+        self.template_vars["gcs_server_port"] = self.gcs_server_port
+        self.template_vars["dashboard_port"] = self.dashboard_port
+        self.template_vars["object_manager_port"] = self.object_manager_port
+        self.template_vars["node_manager_port"] = self.node_manager_port
+        self.template_vars["dashboard_agent_port"] = self.dashboard_agent_port
+        self.template_vars["dashboard_agent_grpc_port"] = self.dashboard_agent_grpc_port
+        self.template_vars["metrics_port"] = self.metrics_port
+        self.template_vars["display_nvidia_smi_output"] = self.display_nvidia_smi_output
+        self.template_vars["head_setup"] = self.head_setup
+        self.template_vars["head_init_wait_time"] = self.head_init_wait_time
+        self.template_vars["worker_init_wait_time"] = self.worker_init_wait_time
+        if self.env_vars:
+            self.template_vars["env_vars"] = "\n".join(
+                [f'export {k}="{v}"' for k, v in self.env_vars.items()]
+            )
+        # Call parent's post_init
+        super().__post_init__()
+
+
+LAUNCHER_MAP: dict[str, Type[Launcher]] = {
+    "torchrun": Torchrun,
+    "ft": FaultTolerance,
+    "slurm_ray": SlurmRay,
+    "slurm_template": SlurmTemplate,
+}
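A short usage sketch of the transform API added in this file. The class names, fields, and Script wrapping come from the diff above; the inline Jinja2 template text and the command are made up for illustration.

from nemo_run.core.execution.launcher import SlurmTemplate

# Hypothetical inline template; {{ command }} is filled with the
# space-joined command passed to transform().
template = SlurmTemplate(
    template_inline="#!/bin/bash\nset -e\n{{ command }}\n",
    template_vars={},
)

script = template.transform(["python", "train.py", "--epochs", "1"])
# transform() renders the template and wraps it in a Script, so
# script.inline holds the generated bash with the command substituted.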

src/nemo_run/core/execution/skypilot.py

Lines changed: 1 addition & 2 deletions

@@ -26,9 +26,8 @@
 from nemo_run.core.execution.base import (
     Executor,
     ExecutorMacros,
-    FaultTolerance,
-    Torchrun,
 )
+from nemo_run.core.execution.launcher import FaultTolerance, Torchrun
 from nemo_run.core.packaging.base import Packager
 from nemo_run.core.packaging.git import GitArchivePackager

src/nemo_run/core/execution/slurm.py

Lines changed: 7 additions & 4 deletions

@@ -33,10 +33,8 @@
 from nemo_run.core.execution.base import (
     Executor,
     ExecutorMacros,
-    FaultTolerance,
-    Launcher,
-    Torchrun,
 )
+from nemo_run.core.execution.launcher import FaultTolerance, Launcher, SlurmTemplate, Torchrun
 from nemo_run.core.execution.utils import fill_template
 from nemo_run.core.frontend.console.api import CONSOLE
 from nemo_run.core.packaging.base import Packager
@@ -544,6 +542,9 @@ def get_launcher_prefix(self) -> Optional[list[str]]:
         if launcher.nsys_profile:
             return launcher.get_nsys_prefix(profile_dir=f"/{RUNDIR_NAME}")

+    def supports_launcher_transform(self) -> bool:
+        return True if isinstance(self.get_launcher(), SlurmTemplate) else False
+
     def package_configs(self, *cfgs: tuple[str, str]) -> list[str]:
         filenames = []
         basepath = os.path.join(self.job_dir, "configs")
@@ -825,7 +826,9 @@ def materialize(self) -> str:

         sbatch_flags = []
         if self.slurm_config.heterogeneous:
-            assert len(self.jobs) == len(self.slurm_config.resource_group)
+            assert (
+                len(self.jobs) == len(self.slurm_config.resource_group)
+            ), f"Number of jobs {len(self.jobs)} must match number of resource group requests {len(self.slurm_config.resource_group)}.\nIf you are just submitting a single job, make sure that heterogeneous=False in the executor."
             final_group_index = len(self.slurm_config.resource_group) - 1
             if self.slurm_config.het_group_indices:
                 final_group_index = self.slurm_config.het_group_indices.index(
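A hedged sketch of how the new supports_launcher_transform hook might be used with the SlurmRay launcher. The SlurmExecutor constructor arguments (account, partition, nodes, ntasks_per_node), the launcher attribute assignment, and the env_vars value are assumptions for illustration; only SlurmRay, SlurmTemplate, and supports_launcher_transform come from this diff.

from nemo_run.core.execution.launcher import SlurmRay
from nemo_run.core.execution.slurm import SlurmExecutor

# Assumed constructor arguments; check SlurmExecutor's actual signature
# for the fields your cluster needs.
executor = SlurmExecutor(account="my_account", partition="gpu", nodes=2, ntasks_per_node=8)

# Attach the Ray launcher (assuming the executor exposes a `launcher`
# field, as implied by get_launcher() in the diff above).
executor.launcher = SlurmRay(head_init_wait_time=10, env_vars={"RAY_DEDUP_LOGS": "0"})

# Only SlurmTemplate-based launchers (SlurmRay included) opt in to the
# command transform on this executor.
assert executor.supports_launcher_transform()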

0 commit comments