
Commit d20769a

Modify Slurm Launcher to allow arbitrary scripts (#163)
* Alter launcher script
* Refactor to allow arbitrary template
* Add custom slurm ray launcher
* Remove duplicate script
* Fix command being run
* Remove curator specific references
* Rename template

---------

Signed-off-by: Ryan Wolf <[email protected]>
1 parent 39b021c commit d20769a

File tree

2 files changed: +266 −56 lines changed

Lines changed: 111 additions & 56 deletions
@@ -1,8 +1,12 @@
 import os
+import pathlib
 from dataclasses import dataclass, field
 from typing import Optional, Type
 
+import jinja2
+
 from nemo_run.config import ConfigurableMixin, Script
+from nemo_run.core.execution.utils import fill_template
 
 
 @dataclass(kw_only=True)
@@ -56,73 +60,124 @@ class FaultTolerance(Launcher):
 
 
 @dataclass(kw_only=True)
-class SlurmRay(Launcher):
+class SlurmTemplate(Launcher):
     """
-    Transforms a provided cmd into a Ray launcher bash script for SlurmExecutor.
-    The Ray launcher script sets up a Ray cluster on Slurm nodes, with the head node starting Ray head
-    and executing the provided command. Worker nodes start Ray and wait.
+    A generic launcher that uses Jinja2 templates to wrap commands.
+    The template can be provided either as inline content or as a path to a template file.
     """
 
-    port: int = 6379
+    template_path: Optional[str] = None
+    template_inline: Optional[str] = None
+    template_vars: dict = field(default_factory=dict)
+
+    def __post_init__(self):
+        # Ensure at least one template source is provided
+        if not self.template_path and not self.template_inline:
+            raise ValueError("Either template_path or template_inline must be provided")
+
+    def get_template_content(self) -> str:
+        """
+        Get the template content either from the file or inline content.
+        """
+        if self.template_inline:
+            return self.template_inline
+
+        if self.template_path:
+            # Check if the path is absolute
+            path = pathlib.Path(self.template_path)
+            if path.is_absolute():
+                # Read the template from the absolute path
+                with open(path, "r") as f:
+                    return f.read()
+            else:
+                # Use the template from the templates directory
+                template_dir = os.path.join(os.path.dirname(__file__), "templates")
+                template_path = os.path.join(template_dir, self.template_path)
+                if os.path.exists(template_path):
+                    with open(template_path, "r") as f:
+                        return f.read()
+                else:
+                    raise FileNotFoundError(f'Template "{self.template_path}" does not exist.')
+
+        # This should not happen due to the check in __post_init__
+        raise ValueError("No template available")
+
+    def render_template(self, cmd: list[str]) -> str:
+        """
+        Render the template with the command and additional variables.
+        """
+        # If using a template file from the templates directory
+        if self.template_path and not os.path.isabs(self.template_path):
+            # Create variables dictionary with command and additional variables
+            vars_dict = {"command": " ".join(cmd), **self.template_vars}
+            # Use the project's template rendering utility
+            return fill_template(self.template_path, vars_dict)
+
+        # If using inline template or absolute path template
+        template_content = self.get_template_content()
+        template = jinja2.Template(template_content)
+
+        # Create variables dictionary with command and additional variables
+        vars_dict = {"command": " ".join(cmd), **self.template_vars}
+
+        # Render the template
+        return template.render(**vars_dict)
 
     def transform(self, cmd: list[str]) -> Optional[Script]:
         """
-        Transforms the provided cmd into a Ray launcher bash script for SlurmExecutor.
+        Transform the command using the template.
         """
-        cmd_to_run = " ".join(cmd)
-        # Build the Ray launcher bash script. Braces in shell variables are escaped as {{ and }}
-        ray_script = f"""
-# Check that a command was provided.
-if [ "$#" -lt 1 ]; then
-    echo "Usage: $0 <command>"
-    exit 1
-fi
-
-# Function to start the Ray head node.
-start_head() {{
-    echo "Starting Ray head node on ${{HEAD_IP}}"
-    ray start --head --node-ip-address=${{HEAD_IP}} --port={self.port}
-    export RAY_ADDRESS="${{HEAD_IP}}:{self.port}"
-}}
-
-# Function to start a Ray worker node.
-start_worker() {{
-    # Obtain the head node's hostname from the SLURM_NODELIST.
-    echo "Starting Ray worker node. Connecting to head ${{HEAD_IP}}"
-    ray start --address=${{HEAD_IP}}:{self.port}
-}}
-
-# If this is the head node, start the head; otherwise, start a worker.
-if [ -z "$SLURM_NODEID" ] || [ "$SLURM_NODEID" == "0" ]; then
-    start_head
-else
-    start_worker
-fi
-
-# Only the head node executes the command.
-if [ -z "$SLURM_NODEID" ] || [ "$SLURM_NODEID" == "0" ]; then
-    echo "Running command: {cmd_to_run}"
-    # Use eval so the given command is executed with its arguments.
-    eval "{cmd_to_run}"
-    echo "Command finished. Shutting down Ray on head node."
-    ray stop
-    # Optionally, you could touch a file to signal the worker nodes to shut down.
-fi
-
-# For worker nodes, simply wait so that Ray stays active.
-if [ -n "$SLURM_NODEID" ] && [ "$SLURM_NODEID" != "0" ]; then
-    echo "Worker node running. Waiting for the Ray head to finish."
-    while true; do
-        sleep 15
-    done
-fi
-"""
-        # Return a new Script object with the inline content
-        return Script(inline=ray_script)
+        rendered_script = self.render_template(cmd)
+        return Script(inline=rendered_script)
+
+
+@dataclass(kw_only=True)
+class SlurmRay(SlurmTemplate):
+    """
+    Transforms a provided cmd into a Ray launcher bash script for SlurmExecutor.
+    The Ray launcher script sets up a Ray cluster on Slurm nodes, with the head node starting Ray head
+    and executing the provided command. Worker nodes start Ray and wait.
+    """
+
+    gcs_server_port: int = 6379
+    dashboard_port: int = 8265
+    object_manager_port: int = 8076
+    node_manager_port: int = 8077
+    dashboard_agent_port: int = 52365
+    dashboard_agent_grpc_port: int = 52366
+    metrics_port: int = 9002
+    display_nvidia_smi_output: bool = False
+    head_setup: Optional[str] = None
+    head_init_wait_time: int = 10
+    worker_init_wait_time: int = 60
+    env_vars: Optional[dict] = None
+
+    def __post_init__(self):
+        # Set the template path to the Ray template
+        self.template_path = "slurm_ray.sh.j2"
+        # Fill in the template variables
+        self.template_vars["gcs_server_port"] = self.gcs_server_port
+        self.template_vars["dashboard_port"] = self.dashboard_port
+        self.template_vars["object_manager_port"] = self.object_manager_port
+        self.template_vars["node_manager_port"] = self.node_manager_port
+        self.template_vars["dashboard_agent_port"] = self.dashboard_agent_port
+        self.template_vars["dashboard_agent_grpc_port"] = self.dashboard_agent_grpc_port
+        self.template_vars["metrics_port"] = self.metrics_port
+        self.template_vars["display_nvidia_smi_output"] = self.display_nvidia_smi_output
+        self.template_vars["head_setup"] = self.head_setup
+        self.template_vars["head_init_wait_time"] = self.head_init_wait_time
+        self.template_vars["worker_init_wait_time"] = self.worker_init_wait_time
+        if self.env_vars:
+            self.template_vars["env_vars"] = "\n".join(
+                [f'export {k}="{v}"' for k, v in self.env_vars.items()]
+            )
+        # Call parent's post_init
+        super().__post_init__()
 
 
 LAUNCHER_MAP: dict[str, Type[Launcher]] = {
     "torchrun": Torchrun,
     "ft": FaultTolerance,
     "slurm_ray": SlurmRay,
+    "slurm_template": SlurmTemplate,
 }
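
For orientation, here is a minimal usage sketch (not part of the commit) showing how the two launchers above could be driven directly through transform(). The import path, the example command, and the prologue variable are assumptions for illustration, and it presumes the base Launcher fields all have defaults; in practice the launcher is attached to a SlurmExecutor rather than called by hand.

# Hypothetical usage sketch; module path is assumed, not named in the diff.
from nemo_run.core.execution.launcher import SlurmRay, SlurmTemplate

# Wrap a command with a custom inline Jinja2 template. {{ command }} is always
# injected by render_template(); {{ prologue }} comes from template_vars.
template_launcher = SlurmTemplate(
    template_inline="#!/bin/bash\nset -e\n{{ prologue }}\n{{ command }}\n",
    template_vars={"prologue": "module load cuda"},  # hypothetical setup line
)
script = template_launcher.transform(["python", "train.py", "--epochs", "10"])
print(script.inline)  # the rendered bash script returned as Script(inline=...)

# The Ray variant fills the bundled slurm_ray.sh.j2 template instead.
ray_launcher = SlurmRay(
    gcs_server_port=6379,
    env_vars={"RAY_DEDUP_LOGS": "0"},  # rendered as export statements
)
ray_script = ray_launcher.transform(["python", "train.py"])

Both classes are also registered in LAUNCHER_MAP under the "slurm_ray" and "slurm_template" keys, so they can be selected by name wherever the map is consulted.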
Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
+#!/bin/bash
+
+# Required environment variables
+REQUIRED_VARS=("SLURM_NNODES" "HEAD_NODE_ADDR")
+for var in "${REQUIRED_VARS[@]}"; do
+    if [ -z "${!var}" ]; then
+        echo "Error: $var is not set."
+        exit 1
+    fi
+done
+
+echo "Environment Variables:"
+echo "SLURM_NNODES=${SLURM_NNODES}"
+echo "HEAD_NODE_ADDR=${HEAD_NODE_ADDR}"
+{{ env_vars }}
+
+# Extract Ray ports from environment variables or hardcode them
+GCS_SERVER_PORT={{ gcs_server_port }}
+DASHBOARD_PORT={{ dashboard_port }}
+OBJECT_MANAGER_PORT={{ object_manager_port }}
+NODE_MANAGER_PORT={{ node_manager_port }}
+RAY_DASHBOARD_AGENT_PORT={{ dashboard_agent_port }}
+RAY_DASHBOARD_AGENT_GRPC_PORT={{ dashboard_agent_grpc_port }}
+METRICS_PORT={{ metrics_port }}
+
+get_ray_worker_count() {
+    local ray_status_output
+    ray_status_output=$(ray status 2>&1) # Capture both stdout and stderr
+
+    # Check if the output contains the expected "Active" workers section
+    if echo "$ray_status_output" | grep -q "Active:"; then
+        # Extract the number of active workers by counting lines containing "node_"
+        worker_count=$(echo "$ray_status_output" | awk '/Active:/,/Pending:/' | grep -c "node_")
+
+        # Ensure worker_count is valid, otherwise set it to 0
+        if [[ -z "$worker_count" || "$worker_count" -lt 0 ]]; then
+            worker_count=0
+        fi
+    else
+        # Handle the case where "ray status" doesn't return valid data yet
+        worker_count=-1
+    fi
+
+    echo "$worker_count"
+}
+
+display_nvidia_smi() {
+    echo "NVIDIA SMI for $SLURMD_NODENAME"
+    which nvidia-smi && nvidia-smi || echo "nvidia-smi not in container"
+}
+
+ray_pid=""
+
+# Display nvidia-smi output
+{% if display_nvidia_smi_output | default(false) %}
+display_nvidia_smi
+{% endif %}
+
+
+# Function to start the Ray head node.
+start_head() {
+    # Start Ray head node
+
+    echo "Starting Ray head node"
+    ray start --head \
+        --node-ip-address=$(hostname -i) \
+        --port=${GCS_SERVER_PORT} \
+        --object-manager-port=${OBJECT_MANAGER_PORT} \
+        --node-manager-port=${NODE_MANAGER_PORT} \
+        --system-config='{"local_fs_capacity_threshold": 0.90, "object_spilling_config": "{ \"type\": \"filesystem\", \"params\": {\"directory_path\": \"/tmp/ray_spill\", \"buffer_size\": 1000000 } }"}' \
+        --metrics-export-port=${METRICS_PORT} \
+        --dashboard-host 0.0.0.0 --include-dashboard 1 \
+        --disable-usage-stats \
+        --dashboard-agent-grpc-port=${RAY_DASHBOARD_AGENT_GRPC_PORT} \
+        --dashboard-agent-listen-port=${RAY_DASHBOARD_AGENT_PORT} | tee -a /tmp/ray.log
+    ray_pid=$!
+    echo "Ray head node started with PID $ray_pid"
+
+    ready_set=false
+    # Periodically check Ray status
+    while true; do
+        worker_count=$(get_ray_worker_count)
+        echo "Current workers ready: $worker_count"
+        if [[ "$worker_count" -eq -1 ]]; then
+            echo "Ray cluster status not available. Waiting for cluster."
+            sleep 5
+            continue
+        fi
+        if [[ "$worker_count" -eq 1 && "$ready_set" == "false" ]]; then
+            echo "Ray cluster is ready. Setting head node pod status to ready."
+            # TODO: enable once health server is ready
+            # curl -X POST http://localhost:8000/set-ready -H "Content-Type: application/json" -d '{"status": true}'
+            touch /tmp/is_ready
+
+            # Set ready_set to true after the curl request is sent
+            ready_set=true
+        fi
+
+        # Proceed only if the worker_count is a valid integer and >= expected_workers
+        if [[ "$worker_count" -ge "$SLURM_NNODES" ]]; then
+            echo "Enough workers connected. Proceeding to start the Python command."
+            break
+        fi
+
+        echo "Waiting for workers to connect..."
+        sleep {{ head_init_wait_time }}
+    done
+
+    {{ head_setup }}
+}
+
+# Function to start a Ray worker node.
+start_worker() {
+    sleep {{ worker_init_wait_time }}
+    set +x
+
+    # Start Ray worker node and connect to head
+    echo "Starting Ray worker node and connecting to head at ${HEAD_NODE_ADDR}:${GCS_SERVER_PORT}"
+    ray start --address="${HEAD_NODE_ADDR}:${GCS_SERVER_PORT}" \
+        --block \
+        --node-ip-address=$(hostname -i) \
+        --object-manager-port=${OBJECT_MANAGER_PORT} \
+        --node-manager-port=${NODE_MANAGER_PORT} \
+        --metrics-export-port=${METRICS_PORT} \
+        --dashboard-agent-grpc-port=${RAY_DASHBOARD_AGENT_GRPC_PORT} \
+        --dashboard-agent-listen-port=${RAY_DASHBOARD_AGENT_PORT} \
+        --disable-usage-stats
+
+
+    # Check if Ray worker node started successfully by reading the exit code
+    if [ $? -ne 0 ]; then
+        echo "Error: Ray worker node failed to start."
+        exit 1
+    fi
+
+    echo "Ray start --block ... exited"
+
+}
+
+# If this is the head node, start the head; otherwise, start a worker.
+if [ -z "$SLURM_NODEID" ] || [ "$SLURM_NODEID" == "0" ]; then
+    start_head
+else
+    start_worker
+fi
+
+# Only the head node executes the Python command.
+if [ -z "$SLURM_NODEID" ] || [ "$SLURM_NODEID" == "0" ]; then
+    echo "Running Python command: {{ command }}"
+    # Use eval so the given command is executed with its arguments.
+    eval "{{ command }}"
+    echo "Python script finished. Shutting down Ray on head node."
+    ray stop
+    sleep 30
+fi
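
To make the variable substitution above concrete, here is a small rendering sketch (not part of the commit). It mirrors what SlurmTemplate.render_template() does for inline or absolute-path templates by calling jinja2 directly on a miniature excerpt of the script; the snippet and the values are invented for illustration.

# Hypothetical rendering demo; the template snippet is a cut-down excerpt.
import jinja2

snippet = (
    "GCS_SERVER_PORT={{ gcs_server_port }}\n"
    "{% if display_nvidia_smi_output | default(false) %}display_nvidia_smi\n{% endif %}"
    'eval "{{ command }}"\n'
)
rendered = jinja2.Template(snippet).render(
    gcs_server_port=6379,
    display_nvidia_smi_output=False,  # the {% if %} block is skipped
    command="python train.py",        # what render_template() injects from cmd
)
print(rendered)
# GCS_SERVER_PORT=6379
# eval "python train.py"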
