NVIDIA-NeMo
diff --git a/‎docs/source/conf.py‎ ‎docs/conf.py‎docs/source/conf.py renamed to docs/conf.py
Lines changed: 18 additions & 6 deletions b/‎docs/source/conf.py‎ ‎docs/conf.py‎docs/source/conf.py renamed to docs/conf.py
Lines changed: 18 additions & 6 deletions
diff --git a/‎docs/documentation.md‎
Lines changed: 47 additions & 0 deletions b/‎docs/documentation.md‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎docs/source/faqs.md‎ ‎docs/faqs.md‎docs/source/faqs.md renamed to docs/faqs.md b/‎docs/source/faqs.md‎ ‎docs/faqs.md‎docs/source/faqs.md renamed to docs/faqs.md
diff --git a/‎docs/source/guides/cli.md‎ ‎docs/guides/cli.md‎docs/source/guides/cli.md renamed to docs/guides/cli.md
Lines changed: 10 additions & 11 deletions b/‎docs/source/guides/cli.md‎ ‎docs/guides/cli.md‎docs/source/guides/cli.md renamed to docs/guides/cli.md
Lines changed: 10 additions & 11 deletions
diff --git a/‎docs/source/guides/configuration.md‎ ‎docs/guides/configuration.md‎docs/source/guides/configuration.md renamed to docs/guides/configuration.md
Lines changed: 4 additions & 1 deletion b/‎docs/source/guides/configuration.md‎ ‎docs/guides/configuration.md‎docs/source/guides/configuration.md renamed to docs/guides/configuration.md
Lines changed: 4 additions & 1 deletion
diff --git a/‎docs/source/guides/execution.md‎ ‎docs/guides/execution.md‎docs/source/guides/execution.md renamed to docs/guides/execution.md
Lines changed: 15 additions & 5 deletions b/‎docs/source/guides/execution.md‎ ‎docs/guides/execution.md‎docs/source/guides/execution.md renamed to docs/guides/execution.md
Lines changed: 15 additions & 5 deletions
diff --git a/‎docs/source/guides/index.md‎ ‎docs/guides/index.md‎docs/source/guides/index.md renamed to docs/guides/index.md
Lines changed: 6 additions & 6 deletions b/‎docs/source/guides/index.md‎ ‎docs/guides/index.md‎docs/source/guides/index.md renamed to docs/guides/index.md
Lines changed: 6 additions & 6 deletions
diff --git a/‎docs/source/guides/management.md‎ ‎docs/guides/management.md‎docs/source/guides/management.md renamed to docs/guides/management.md
Lines changed: 4 additions & 2 deletions b/‎docs/source/guides/management.md‎ ‎docs/guides/management.md‎docs/source/guides/management.md renamed to docs/guides/management.md
Lines changed: 4 additions & 2 deletions
diff --git a/‎docs/source/guides/ray.md‎ ‎docs/guides/ray.md‎docs/source/guides/ray.md renamed to docs/guides/ray.md b/‎docs/source/guides/ray.md‎ ‎docs/guides/ray.md‎docs/source/guides/ray.md renamed to docs/guides/ray.md
diff --git a/‎docs/source/guides/why-use-nemo-run.md‎ ‎docs/guides/why-use-nemo-run.md‎docs/source/guides/why-use-nemo-run.md renamed to docs/guides/why-use-nemo-run.md b/‎docs/source/guides/why-use-nemo-run.md‎ ‎docs/guides/why-use-nemo-run.md‎docs/source/guides/why-use-nemo-run.md renamed to docs/guides/why-use-nemo-run.md
@@ -33,18 +33,34 @@
     "sphinx.ext.githubpages",
     "sphinx.ext.napoleon",
     "sphinxcontrib.mermaid",
+    "sphinx_copybutton",
+    "sphinx_new_tab_link",
 ]
 
 templates_path = ["_templates"]
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "documentation.md"]
+
+# -- Options for MyST Parser (Markdown) --------------------------------------
+# MyST Parser settings
+myst_enable_extensions = [
+    "dollarmath",  # Enables $...$ inline math and $$...$$ display math
+    "amsmath",  # Enables direct parsing of top-level amsmath LaTeX environments (e.g. \begin{align})
+    "colon_fence",  # Enables code blocks using ::: delimiters instead of ```
+    "deflist",  # Supports definition lists with term: definition format
+    "fieldlist",  # Enables field lists for metadata like :author: Name
+    "tasklist",  # Adds support for GitHub-style task lists with [ ] and [x]
+]
+myst_heading_anchors = 5  # Generates anchor links for headings up to level 5
+myst_fence_as_directive = ["mermaid"]
+
 python_maximum_signature_line_length = 88
 
 # Autoapi settings
 autoapi_generate_api_docs = True
 autoapi_keep_files = False
 autoapi_add_toctree_entry = False
 autoapi_type = "python"
-autoapi_dirs = ["../../nemo_run"]
+autoapi_dirs = ["../nemo_run"]
 autoapi_file_pattern = "*.py"
 autoapi_root = "api"
 autoapi_options = [
@@ -58,10 +74,6 @@
 # Autodoc settings
 autodoc_typehints = "signature"
 
-# MyST settings
-myst_heading_anchors = 3
-myst_fence_as_directive = ["mermaid"]
-
 # Napoleon settings
 napoleon_google_docstring = True
 napoleon_numpy_docstring = True
 
@@ -0,0 +1,47 @@
+# Documentation Development
+
+- [Documentation Development](#documentation-development)
+  - [Build the Documentation](#build-the-documentation)
+  - [Checking for Broken Links](#checking-for-broken-links)
+  - [Live Building](#live-building)
+
+
+## Build the Documentation
+
+The following sections describe how to set up and build the NeMo Run documentation.
+
+Switch to the documentation source folder and generate HTML output.
+
+```sh
+cd docs/
+uv run --group docs sphinx-build . _build/html
+```
+
+* The resulting HTML files are generated in a `_build/html` folder that is created under the project `docs/` folder.
+* The generated Python API docs are placed in `api` under the `docs/` folder.
+
+## Checking for Broken Links
+
+To check for broken HTTP links in the docs, run this command:
+
+```sh
+cd docs/
+uv run --group docs sphinx-build --builder linkcheck . _build/linkcheck
+```
+
+It will output a JSON file at `_build/linkcheck/output.json` with links it found while building the
+docs. Records will have a status of `broken` if the link is not reachable. The `docs/conf.py` file is
+configured to ignore GitHub links because the CI test will often experience rate-limit errors.
+Comment out the `linkcheck_ignore` variable there to check all the links.
+
+## Live Building
+
+When writing documentation, it can be helpful to serve the documentation and have it update live while you edit.
+
+To do so, run:
+
+```sh
+cd docs/
+uv run --group docs sphinx-autobuild . _build/html --port 12345 --host 0.0.0.0
+```
+
+Open a web browser and go to `http://${HOST_WHERE_SPHINX_COMMAND_RUN}:12345` to view the output.
@@ -2,7 +2,7 @@
 
 NeMo Run CLI is a Python-based command-line tool designed to efficiently configure and execute machine learning experiments. It provides a type-safe, Python-centric alternative to argparse and Hydra, streamlining workflows from prototyping to scaling across diverse environments.
 
-## 1. Introduction
+## Introduction
 
 NeMo Run CLI simplifies experiment management by leveraging Python's capabilities:
 
@@ -65,15 +65,15 @@ def train():
 - **Typer**: General-purpose CLIs with good documentation that don't require nested configuration
 - **argparse**: Simple scripts with minimal configuration needs and standard library requirements
 
-## 2. Core Concepts
+## Core Concepts
 
 - **Entrypoints**: Python functions decorated with `@run.cli.entrypoint` serving as primary CLI commands.
 - **Factories**: Functions decorated with `@run.cli.factory` that configure complex objects (e.g., models, optimizers).
 - **Partials**: Reusable, partially configured functions enabling flexible experiment definitions.
 - **Experiments**: Groups of tasks executed sequentially or concurrently.
 - **RunContext**: Manages execution settings, including executor configurations.
 
-## 3. Getting Started
+## Getting Started
 
 ### Example 1: Basic Entrypoint
 
@@ -116,7 +116,7 @@ Output:
 Unknown argument 'epocks'. Did you mean 'epochs'?
 ```
 
-## 4. Advanced Configuration
+## Advanced Configuration
 
 ### Nested Configurations with Dataclasses
 
@@ -213,25 +213,25 @@ File contents:
 │ 2 target = "main.train_model"
 │ 3 batch_size = 32
 │ 4 epochs = 10
-│ 5 
+│ 5
 │ 6 [model]
 │ 7 target = "main.Model"
 │ 8 activation = "relu"
 │ 9 hidden_size = 256
 │ 10 num_layers = 5
-│ 11 
+│ 11
 │ 12 [optimizer]
 │ 13 target = "main.Optimizer"
 │ 14 betas = [ 0.9, 0.999,]
 │ 15 learning_rate = 0.001
 │ 16 weight_decay = 1e-5
-│ 17 
+│ 17
 ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
 Export complete. Skipping execution.
 
 ```
 
-## 5. Executors
+## Executors
 
 Executors determine where your code runs, such as local environments, Docker containers, or Slurm clusters.
 
@@ -283,7 +283,7 @@ def slurm_cluster() -> run.Executor:
         job_dir=BASE_DIR,
         container_image="nvcr.io/nvidia/nemo:dev",
         container_mounts=[
-            f"/home/{USER}:/home/{USER}", 
+            f"/home/{USER}:/home/{USER}",
             "/lustre:/lustre",
         ],
         time="4:00:00",
@@ -298,7 +298,7 @@ Execute lazily:
 python script.py --lazy model=alexnet epochs=5 run.executor=slurm_cluster run.executor.nodes=2
 ```
 
-## 6. Advanced CLI Features
+## Advanced CLI Features
 
 ### Dry Runs and Help Messages
 
@@ -393,4 +393,3 @@ The help output clearly shows:
 5. Registered factory functions for each complex argument type
 
 This makes it easy for users to discover what factory functions they can use to configure complex arguments like `model` and `optimizer`, along with information about where these factories are defined (module name and line number).
-
@@ -113,7 +113,10 @@ In our context, this is equivalent to:
      _target_: nemo.collections.llm.gpt.model.llama.Llama3Config8B
      seq_length: 16384
 ```
-> Note: we've used the [Hydra instantiation](https://hydra.cc/docs/advanced/instantiate_objects/overview/) syntax here.
+
+```{note}
+We've used the [Hydra instantiation](https://hydra.cc/docs/advanced/instantiate_objects/overview/) syntax here.
+```
 
 Python operations are performed on the config rather than directly on the class. For example:
 
 
@@ -14,9 +14,13 @@ A tuple of task and executor form an execution unit. A key goal of NeMo-Run is t
 Once an execution unit is created, the next step is to run it. The `run.run` function executes a single task, whereas `run.Experiment` offers more fine-grained control to define complex experiments. `run.run` wraps `run.Experiment` with a single task. `run.Experiment` is an API to launch and manage multiple tasks all using pure Python.
 The `run.Experiment` takes care of storing the run metadata, launching it on the specified cluster, and syncing the logs, etc. Additionally, `run.Experiment` also provides management tools to easily inspect and reproduce past experiments. The `run.Experiment` is inspired by [xmanager](https://github.com/google-deepmind/xmanager/tree/main) and uses [TorchX](https://pytorch.org/torchx/latest/) under the hood to handle execution.
 
-> **_NOTE:_** NeMo-Run assumes familiarity with Docker and uses a docker image as the environment for remote execution. This means you must provide a Docker image that includes all necessary dependencies and configurations when using a remote executor.
+```{note}
+NeMo-Run assumes familiarity with Docker and uses a docker image as the environment for remote execution. This means you must provide a Docker image that includes all necessary dependencies and configurations when using a remote executor.
+```
 
-> **_NOTE:_** All the experiment metadata is stored under `NEMORUN_HOME` env var on the machine where you launch the experiments. By default, the value for `NEMORUN_HOME` value is `~/.run`. Be sure to change this according to your needs.
+```{note}
+All the experiment metadata is stored under `NEMORUN_HOME` env var on the machine where you launch the experiments. By default, the value of `NEMORUN_HOME` is `~/.run`. Be sure to change this according to your needs.
+```
 
 ## Executors
 Executors are dataclasses that configure your remote executor and set up the packaging of your code. All supported executors inherit from the base class `run.Executor`, but have configuration parameters specific to their execution environment. There is an initial cost to understanding the specifics of your executor and setting it up, but this effort is easily amortized over time.
@@ -29,7 +33,9 @@ We support the following `launchers`:
 - `torchrun` or `run.Torchrun`: This will launch the task using `torchrun`. See the `Torchrun` class for configuration options. You can use it using `executor.launcher = "torchrun"` or `executor.launcher = Torchrun(...)`.
 - `ft` or `run.core.execution.FaultTolerance`: This will launch the task using NVIDIA's fault tolerant launcher. See the `FaultTolerance` class for configuration options. You can use it using `executor.launcher = "ft"` or `executor.launcher = FaultTolerance(...)`.
 
-> **_NOTE:_** Launcher may not work very well with `run.Script`. Please report any issues at https://github.com/NVIDIA-NeMo/Run/issues.
+```{attention}
+Launcher may not work very well with `run.Script`. Please report any issues at [https://github.com/NVIDIA-NeMo/Run/issues](https://github.com/NVIDIA-NeMo/Run/issues).
+```
 
 ### Packagers
 
@@ -65,7 +71,9 @@ Your working directory at the time of execution will look like:
 ```
 If you're executing a Python function, this working directory will automatically be included in your Python path.
 
-> **_NOTE:_** git archive doesn't package uncommitted changes. In the future, we may add support for including uncommitted changes while honoring `.gitignore`.
+```{note}
+`git archive` doesn't package uncommitted changes. In the future, we may add support for including uncommitted changes while honoring `.gitignore`.
+```
 
 `run.PatternPackager` is a packager that uses a pattern to package your code. It is useful for packaging code that is not under version control. For example, if you have a directory structure like this:
 ```
@@ -228,7 +236,9 @@ As demonstrated in the examples, defining executors in Python offers great flexi
 
 The `DGXCloudExecutor` integrates with a DGX Cloud cluster's Run:ai API to launch distributed jobs. It uses REST API calls to authenticate, identify the target project and cluster, and submit the job specification.
 
-> **_WARNING:_** Currently, the `DGXCloudExecutor` is only supported when launching experiments *from* a pod running on the DGX Cloud cluster itself. Furthermore, this launching pod must have access to a Persistent Volume Claim (PVC) where the experiment/job directories will be created, and this same PVC must also be configured to be mounted by the job being launched.
+```{warning}
+Currently, the `DGXCloudExecutor` is only supported when launching experiments *from* a pod running on the DGX Cloud cluster itself. Furthermore, this launching pod must have access to a Persistent Volume Claim (PVC) where the experiment/job directories will be created, and this same PVC must also be configured to be mounted by the job being launched.
+```
 
 Here's an example configuration:
 
 
@@ -1,7 +1,7 @@
-Guides
-=================
+# Guides
 
-```{toctree}
+
+:::{toctree}
 :maxdepth: 2
 :hidden:
 
@@ -11,7 +11,7 @@ execution
 management
 ray
 cli
-```
+:::
 
 Welcome to the NeMo-Run guides! This section provides comprehensive documentation on how to use NeMo-Run effectively for your machine learning experiments.
 
@@ -36,7 +36,7 @@ For more advanced usage:
 NeMo-Run is built around three core responsibilities:
 
 1. **Configuration** - Define your ML experiments using a flexible, Pythonic configuration system.
-2. **Execution** - Run your experiments seamlessly across local machines, Slurm clusters, cloud providers, and more.
-3. **Management** - Track, reproduce, and organize your experiments with built-in experiment management.
+1. **Execution** - Run your experiments seamlessly across local machines, Slurm clusters, cloud providers, and more.
+1. **Management** - Track, reproduce, and organize your experiments with built-in experiment management.
 
 Each guide dives deep into these concepts with practical examples and best practices. Choose a guide above to get started!
@@ -12,7 +12,9 @@ exp = Experiment("My Experiment")
 
 When executed, it will automatically generate a unique experiment ID for you, which represents one unique run of the experiment.
 
-> [!NOTE] > `Experiment` is a context manager and `Experiment.add` and `Experiment.run` methods can currently only be used after entering the context manager.
+```{note}
+`Experiment` is a context manager and `Experiment.add` and `Experiment.run` methods can currently only be used after entering the context manager.
+```
 
 ## Add Tasks
 
@@ -73,7 +75,7 @@ You can check the status of an experiment using the `status` method:
 exp.status()
 ```
 
-This method will display information about the status of each task in the experiment. The following is a sample output from the status of experiment in [hello_scripts.py](../../../examples/hello-world/hello_scripts.py):
+This method will display information about the status of each task in the experiment. The following is a sample output from the status of the experiment in [hello_scripts.py](../../examples/hello-world/hello_scripts.py):
 
 ```bash
 Experiment Status for experiment_with_scripts_1730761155