
Commit d3be9ac

roclark and zoeyz101 authored
Add RayCluster support for DGX Cloud Lepton (#389)
* Add Ray support for DGX Cloud Lepton

  Add support for launching a RayCluster on DGX Cloud Lepton and submitting RayJobs to those clusters using the lepton SDK. This uses the new RayCluster feature on DGX Cloud Lepton to dynamically spin clusters up and down via the Python SDK, and jobs can be submitted directly to deployed clusters.

  Signed-Off-By: Robert Clark <[email protected]>

* Make name unique and add resource shape for Lepton RayCluster

  Signed-off-by: Zoey Zhang <[email protected]>

* Add head node reference in RayCluster, support defining secrets in RayCluster, and linting

  Signed-off-by: Zoey Zhang <[email protected]>

* Remove Slurm packager comments from RayCluster

  Removed the placeholder Slurm packager handling comments from the Lepton RayCluster code. For now, the "workdir" parameter should be used for transferring local data to the remote Ray cluster.

  Signed-Off-By: Robert Clark <[email protected]>

* Fix RayCluster head resource shape

  Fix issue to ensure the proper head node resource shape is used if it isn't explicitly given by the user.

  Signed-Off-By: Robert Clark <[email protected]>

* Update LeptonRay comments

  Updated the comments in the LeptonRayCluster and LeptonRayJob classes to accurately reflect the code.

  Signed-Off-By: Robert Clark <[email protected]>

* Fix RayJob logs streaming connection dropping

  The RayJob logs stream would sometimes time out and reset, causing a very long output of logs in the terminal as it continually resets.

  Signed-Off-By: Robert Clark <[email protected]>

* Make RayCluster head resource shape optional

  The head node resource shape for a LeptonRayCluster should be optional. If it isn't specified by the user, it should default to the same shape used for the worker nodes.

  Signed-Off-By: Robert Clark <[email protected]>

* Add doc for DGXC Lepton RayClusters

  Added an example to the Ray quick-start guide on how to use RayClusters and RayJobs with NeMo-Run on DGX Cloud Lepton.

  Signed-Off-By: Robert Clark <[email protected]>

* Update license date

  Signed-Off-By: Robert Clark <[email protected]>

* Fix Ray guide typo

  Signed-Off-By: Robert Clark <[email protected]>

* Make cluster readiness timeout a variable

  Allows users to specify how long to wait for a RayCluster to be created on DGX Cloud Lepton.

  Signed-Off-By: Robert Clark <[email protected]>

* Remove implicit returns

  Signed-Off-By: Robert Clark <[email protected]>

* Remove unused local variable

  Signed-Off-By: Robert Clark <[email protected]>

* Fix linting errors

  Signed-Off-By: Robert Clark <[email protected]>

* Fix formatting errors

  Signed-Off-By: Robert Clark <[email protected]>

* Move LeptonExecutor parameters to definition

  Move the RayCluster-specific settings to the LeptonExecutor class for a more seamless interface for launching and interacting with RayClusters on DGX Cloud Lepton.

  Signed-Off-By: Robert Clark <[email protected]>

* Updated leptonai package version

  Need a newer version of the leptonai SDK to support RayClusters.

  Signed-Off-By: Robert Clark <[email protected]>

* Add Lepton RayCluster tests

  Signed-Off-By: Robert Clark <[email protected]>

---------

Signed-off-by: Robert Clark <[email protected]>
Signed-off-by: Zoey Zhang <[email protected]>
Co-authored-by: Zoey Zhang <[email protected]>
1 parent e6c5a5e commit d3be9ac
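
At a glance, the lifecycle this commit enables on DGX Cloud Lepton is: configure a `LeptonExecutor`, bring up a `RayCluster`, submit a `RayJob` to it, stream its logs, and tear the cluster down. A condensed sketch of that flow (argument values are placeholders; the full example is in the docs/guides/ray.md diff below):

```python
from nemo_run.core.execution.lepton import LeptonExecutor
from nemo_run.run.ray.cluster import RayCluster
from nemo_run.run.ray.job import RayJob

# Placeholder executor config; see the quick-start in docs/guides/ray.md for a complete one.
executor = LeptonExecutor(
    resource_shape="gpu.8xh100",
    container_image="rayproject/ray:2.49.2-gpu",
    nodes=1,
    nprocs_per_node=8,
)

cluster = RayCluster(name="lepton-ray-cluster", executor=executor)
cluster.start(timeout=1800)  # wait up to 30 minutes for the cluster to come up
cluster.status(display=True)

job = RayJob(name="demo-job", executor=executor, cluster_name="lepton-ray-cluster")
job.start(command="python train.py", workdir="./project")  # workdir is synced to the cluster
job.logs(follow=True)

cluster.stop()  # tear down and free the Lepton resources
```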

File tree

8 files changed: +1776, -40 lines changed


docs/guides/ray.md

Lines changed: 76 additions & 4 deletions
@@ -25,17 +25,19 @@
 
 | Object | What it abstracts | Back-ends supported |
 |-------------|-------------------|---------------------|
-| `run.ray.cluster.RayCluster` | Lifecycle of a Ray **cluster** (create ⇒ wait ⇢ status ⇢ port-forward ⇢ delete). | `KubeRayExecutor`, `SlurmExecutor` |
+| `run.ray.cluster.RayCluster` | Lifecycle of a Ray **cluster** (create ⇒ wait ⇢ status ⇢ port-forward ⇢ delete). | `KubeRayExecutor`, `SlurmExecutor`, `LeptonExecutor` |
 | `run.ray.job.RayJob` | Lifecycle of a Ray **job** (submit ⇒ monitor ⇢ logs ⇢ cancel). | same |
 
-The two helpers share a uniform API; the chosen *Executor* decides whether we talk to the **KubeRay** operator (K8s) or a **Slurm** job under the hood.
+The two helpers share a uniform API; the chosen *Executor* decides whether we talk to the **KubeRay** operator (K8s), **DGX Cloud Lepton's RayCluster**, or a **Slurm** job under the hood.
 
 ```mermaid
 classDiagram
     RayCluster <|-- KubeRayCluster
     RayCluster <|-- SlurmRayCluster
+    RayCluster <|-- LeptonRayCluster
     RayJob <|-- KubeRayJob
     RayJob <|-- SlurmRayJob
+    RayJob <|-- LeptonRayJob
 ```
 
 ## 2. KubeRay quick-start
@@ -183,7 +185,77 @@ cluster.stop()
 * `executor.packager = run.GitArchivePackager()` if you prefer packaging a git tree instead of rsync.
 * `cluster.port_forward()` opens an SSH tunnel from *your laptop* to the Ray dashboard running on the head node.
 
-## 4. API reference cheat-sheet
+## 4. DGX Cloud Lepton RayCluster quick-start
+
+```python
+import os
+from pathlib import Path
+
+import nemo_run as run
+from nemo_run.core.execution.lepton import LeptonExecutor
+from nemo_run.run.ray.cluster import RayCluster
+from nemo_run.run.ray.job import RayJob
+
+# 1) Create a LeptonExecutor and tweak defaults
+mounts = [
+    {
+        "path": "/",
+        "mount_path": "/nemo-workspace",
+        "from": "node-nfs:lepton-shared-fs",
+    }
+]
+
+executor = LeptonExecutor(
+    resource_shape="gpu.8xh100",
+    container_image="rayproject/ray:2.49.2-gpu",
+    nemo_run_dir="/nemo-workspace/nemo-run",
+    head_resource_shape="cpu.large",
+    ray_version="2.49.2",
+    mounts=mounts,
+    node_group="my-node-group",
+    nodes=1,
+    nprocs_per_node=8,
+    env_vars={
+        "TORCH_HOME": "/nemo-workspace/.cache",
+    },
+    secret_vars=[
+        {"WANDB_API_KEY": "WANDB_API_KEY"},
+        {"HF_TOKEN": "HUGGING_FACE_HUB_TOKEN"},
+    ],
+    launcher="torchrun",
+    image_pull_secrets=[],
+    pre_launch_commands=[],
+)
+
+# 2) Bring up the RayCluster on DGX Cloud Lepton and show the status
+cluster = RayCluster(
+    name="lepton-ray-cluster",
+    executor=executor,
+)
+cluster.start(timeout=1800)
+cluster.status(display=True)
+
+# 3) Submit a RayJob that runs inside the created RayCluster
+job = RayJob(
+    name="demo-lepton-ray-job",
+    executor=executor,
+    cluster_name="lepton-ray-cluster",
+)
+job.start(
+    command="uv run python train.py --config cfgs/train.yaml cluster.num_nodes=2",
+    workdir="/path/to/project/",  # rsync'ed from local to the RayCluster
+)
+job.status(display=True)  # Display the RayJob status
+job.logs(follow=True)  # Tail the job logs as it runs
+
+# 4) Tear down the RayCluster and free up resources
+cluster.stop()
+```
+
+### Tips for DGX Cloud Lepton users
+* This assumes the [DGX Cloud Lepton CLI](https://docs.nvidia.com/dgx-cloud/lepton/reference/cli/get-started/) is installed and has been authenticated.
+
+## 5. API reference cheat-sheet
 
 ```python
 cluster = RayCluster(name, executor)
@@ -201,7 +273,7 @@ job.stop()
 
 All methods are synchronous and **return immediately** when their work is done; the helpers hide the messy details (kubectl, squeue, ssh, …).
 
-## 5. Rolling your own CLI
+## 6. Rolling your own CLI
 
 Because `RayCluster` and `RayJob` are plain Python, you can compose them inside **argparse**, **Typer**, **Click** – anything. Here is a minimal **argparse** script:
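
The last hunk only renumbers the "Rolling your own CLI" section. As a rough illustration of that idea with the new Lepton backend, here is a hypothetical argparse wrapper (a sketch, not the script from the guide; the executor fields are placeholders copied from the quick-start, and a real deployment may need more, e.g. mounts):

```python
import argparse

from nemo_run.core.execution.lepton import LeptonExecutor
from nemo_run.run.ray.cluster import RayCluster
from nemo_run.run.ray.job import RayJob


def main() -> None:
    parser = argparse.ArgumentParser(description="Manage a Ray cluster on DGX Cloud Lepton")
    parser.add_argument("action", choices=["up", "submit", "down"])
    parser.add_argument("--name", default="lepton-ray-cluster")
    parser.add_argument("--command", default="python train.py")
    parser.add_argument("--workdir", default=".")
    args = parser.parse_args()

    # Placeholder executor config; adjust shapes, node group, mounts, etc. to your environment.
    executor = LeptonExecutor(
        resource_shape="gpu.8xh100",
        container_image="rayproject/ray:2.49.2-gpu",
        node_group="my-node-group",
        nodes=1,
        nprocs_per_node=8,
    )

    if args.action == "up":
        RayCluster(name=args.name, executor=executor).start(timeout=1800)
    elif args.action == "submit":
        job = RayJob(name=f"{args.name}-job", executor=executor, cluster_name=args.name)
        job.start(command=args.command, workdir=args.workdir)
        job.logs(follow=True)
    else:  # "down"
        RayCluster(name=args.name, executor=executor).stop()


if __name__ == "__main__":
    main()
```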

nemo_run/core/execution/lepton.py

Lines changed: 2 additions & 0 deletions
@@ -81,6 +81,8 @@ class LeptonExecutor(Executor):
     ) # Image pull secrets for container registry authentication
     custom_spec: dict[str, Any] = field(default_factory=dict)
     pre_launch_commands: list[str] = field(default_factory=list) # Custom commands before launch
+    head_resource_shape: Optional[str] = "" # Only used for LeptonRayCluster
+    ray_version: Optional[str] = None # Only used for LeptonRayCluster
 
     def stop_job(self, job_id: str):
         """

nemo_run/run/ray/cluster.py

Lines changed: 3 additions & 0 deletions
@@ -17,8 +17,10 @@
 from typing import Optional, Type
 
 from nemo_run.core.execution.base import Executor
+from nemo_run.core.execution.lepton import LeptonExecutor
 from nemo_run.core.execution.slurm import SlurmExecutor
 from nemo_run.core.frontend.console.api import configure_logging
+from nemo_run.run.ray.lepton import LeptonRayCluster
 from nemo_run.run.ray.slurm import SlurmRayCluster
 
 # Import guard for Kubernetes dependencies
@@ -43,6 +45,7 @@ def __post_init__(self):
         configure_logging(level=self.log_level)
         backend_map: dict[Type[Executor], Type] = {
             SlurmExecutor: SlurmRayCluster,
+            LeptonExecutor: LeptonRayCluster,
         }
 
         if _KUBERAY_AVAILABLE and KubeRayExecutor is not None and KubeRayCluster is not None:
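
`RayCluster.__post_init__` resolves its backend by the executor's concrete class, so Lepton support is one more entry in `backend_map` (KubeRay is still registered conditionally behind the import guard). A self-contained toy sketch of that dispatch pattern, using stand-in classes rather than the real NeMo-Run ones:

```python
from dataclasses import dataclass


# Stand-ins for the real executor / backend classes (illustrative only).
class SlurmExecutor: ...
class LeptonExecutor: ...


class SlurmRayCluster:
    def __init__(self, name: str, executor: object) -> None:
        self.name, self.executor = name, executor


class LeptonRayCluster(SlurmRayCluster):
    """Toy Lepton backend with the same constructor signature."""


@dataclass
class RayClusterSketch:
    name: str
    executor: object

    def __post_init__(self) -> None:
        backend_map = {
            SlurmExecutor: SlurmRayCluster,
            LeptonExecutor: LeptonRayCluster,  # the mapping this commit adds
        }
        # Exact class lookup, mirroring the real __post_init__ dispatch.
        backend_cls = backend_map[self.executor.__class__]
        self.backend = backend_cls(name=self.name, executor=self.executor)


cluster = RayClusterSketch(name="demo", executor=LeptonExecutor())
assert isinstance(cluster.backend, LeptonRayCluster)
```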

nemo_run/run/ray/job.py

Lines changed: 11 additions & 2 deletions
@@ -17,8 +17,10 @@
 from typing import Any, Optional, Type
 
 from nemo_run.core.execution.base import Executor
+from nemo_run.core.execution.lepton import LeptonExecutor
 from nemo_run.core.execution.slurm import SlurmExecutor
 from nemo_run.core.frontend.console.api import configure_logging
+from nemo_run.run.ray.lepton import LeptonRayJob
 from nemo_run.run.ray.slurm import SlurmRayJob
 
 # Import guard for Kubernetes dependencies
@@ -41,10 +43,13 @@ class RayJob:
     executor: Executor
     pre_ray_start_commands: Optional[list[str]] = None
     log_level: str = "INFO"
+    cluster_name: Optional[str] = None # Used to connect to existing RayCluster
+    cluster_ready_timeout: Optional[int] = 1800 # Only used for LeptonRayJob
 
     def __post_init__(self) -> None:  # noqa: D401 – simple implementation
         configure_logging(level=self.log_level)
         backend_map: dict[Type[Executor], Type[Any]] = {
+            LeptonExecutor: LeptonRayJob,
             SlurmExecutor: SlurmRayJob,
         }
 
@@ -57,6 +62,10 @@ def __post_init__(self) -> None:  # noqa: D401 – simple implementation
         backend_cls = backend_map[self.executor.__class__]
         self.backend = backend_cls(name=self.name, executor=self.executor)
 
+        if isinstance(self.executor, LeptonExecutor):
+            self.backend.cluster_name = self.cluster_name
+            self.backend.cluster_ready_timeout = self.cluster_ready_timeout
+
     # ------------------------------------------------------------------
     # Public API
     # ------------------------------------------------------------------
@@ -84,8 +93,8 @@ def start(
             dryrun=dryrun,
         )
 
-    def stop(self) -> None:
-        self.backend.stop()  # type: ignore[attr-defined]
+    def stop(self, wait: bool = False) -> None:
+        self.backend.stop(wait=wait)  # type: ignore[attr-defined]
 
     def status(self, display: bool = True):
         return self.backend.status(display=display)  # type: ignore[attr-defined]
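
Taken together, the new fields let a `RayJob` attach to an already-running Lepton RayCluster and bound how long it waits for that cluster to become ready, while `stop()` can now optionally block until the backend reports the job has stopped. A short usage sketch (values are illustrative and assume the cluster from the quick-start is running):

```python
from nemo_run.core.execution.lepton import LeptonExecutor
from nemo_run.run.ray.job import RayJob

# Placeholder executor config; a real one typically also sets mounts, node_group, etc.
executor = LeptonExecutor(
    resource_shape="gpu.8xh100",
    container_image="rayproject/ray:2.49.2-gpu",
    nodes=1,
    nprocs_per_node=8,
)

job = RayJob(
    name="demo-lepton-ray-job",
    executor=executor,
    cluster_name="lepton-ray-cluster",  # attach to the existing RayCluster
    cluster_ready_timeout=600,          # wait up to 10 minutes for cluster readiness
)
job.start(command="python train.py", workdir="./project")
job.logs(follow=True)

# New in this commit: optionally block until the backend confirms the job is stopped.
job.stop(wait=True)
```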
