# GRPO

We provide a reference GRPO configuration for math benchmarks using the [OpenMathInstruct-2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2) dataset.

You can read about the details of the GRPO implementation in the [GRPO guide](../../guides/grpo.md).

## GRPO Single Node

To run GRPO on a single GPU for `Qwen/Qwen2.5-1.5B`:

```sh
# Run the GRPO math example using the default ~1B model (Qwen/Qwen2.5-1.5B)
uv run python examples/run_grpo_math.py
```

By default, this uses the configuration in `examples/configs/grpo_math_1B.yaml`. You can customize parameters with command-line overrides. For example, to run on 8 GPUs:

```sh
# Run the GRPO math example on 8 GPUs
uv run python examples/run_grpo_math.py \
    cluster.gpus_per_node=8
```

You can override any of the parameters listed in the YAML configuration file. For example:

```sh
uv run python examples/run_grpo_math.py \
    policy.model_name="meta-llama/Llama-3.2-1B-Instruct" \
    checkpointing.checkpoint_dir="results/llama1b_math" \
    logger.wandb_enabled=True \
    logger.wandb.name="grpo-llama1b_math" \
    logger.num_val_samples_to_print=10
```
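
Any key present in the YAML file can be overridden this way. To see the full set of available keys, inspect the config file directly:

```sh
# Print the default single-node config to see every overridable key
cat examples/configs/grpo_math_1B.yaml
```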

The default configuration uses the DTensor training backend. We also provide `examples/configs/grpo_math_1B_megatron.yaml`, which is set up to use the Megatron backend out of the box.

To train using this config on a single GPU:

```sh
# Run the GRPO math example on 1 GPU using the Megatron backend
uv run python examples/run_grpo_math.py \
    --config examples/configs/grpo_math_1B_megatron.yaml
```

For additional details on supported backends and how to configure the training backend to suit your setup, refer to the [Training Backends documentation](../../design-docs/training-backends.md).
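
The `--config` flag composes with the same dotted overrides shown above. As a sketch that combines only options already shown on this page, you could scale the Megatron config to all 8 GPUs of a node:

```sh
# Megatron backend config, scaled to 8 GPUs via a command-line override
uv run python examples/run_grpo_math.py \
    --config examples/configs/grpo_math_1B_megatron.yaml \
    cluster.gpus_per_node=8
```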

## GRPO Multi-Node

```sh
# Run from the root of the NeMo RL repo
NUM_ACTOR_NODES=2

# grpo_math_8B uses the Llama-3.1-8B-Instruct model
COMMAND="uv run ./examples/run_grpo_math.py --config examples/configs/grpo_math_8B.yaml cluster.num_nodes=${NUM_ACTOR_NODES} checkpointing.checkpoint_dir='results/llama8b_2nodes' logger.wandb_enabled=True logger.wandb.name='grpo-llama8b_math'" \
CONTAINER=YOUR_CONTAINER \
MOUNTS="$PWD:$PWD" \
sbatch \
  --nodes=${NUM_ACTOR_NODES} \
  --account=YOUR_ACCOUNT \
  --job-name=YOUR_JOBNAME \
  --partition=YOUR_PARTITION \
  --time=4:0:0 \
  --gres=gpu:8 \
  ray.sub
```

The required `CONTAINER` can be built by following the instructions in the [Docker documentation](../../docker.md).
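
As a rough sketch only (the authoritative build steps, targets, and Dockerfile location are in that documentation; the image tag below is a placeholder):

```sh
# Build the image from the repo root, then point the submission script at it
docker build -t nemo-rl:latest .
CONTAINER=nemo-rl:latest
```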

## GRPO Qwen2.5-32B

This section outlines how to run GRPO for Qwen2.5-32B with a 16k sequence length.

```sh
# Run from the root of the NeMo RL repo
NUM_ACTOR_NODES=32

# Download Qwen2.5-32B before the job starts to avoid spending time downloading during the training loop
HF_HOME=/path/to/hf_home huggingface-cli download Qwen/Qwen2.5-32B

# Ensure HF_HOME is included in your MOUNTS
HF_HOME=/path/to/hf_home \
COMMAND="uv run ./examples/run_grpo_math.py --config examples/configs/grpo_math_8B.yaml policy.model_name='Qwen/Qwen2.5-32B' policy.generation.vllm_cfg.tensor_parallel_size=4 policy.max_total_sequence_length=16384 cluster.num_nodes=${NUM_ACTOR_NODES} policy.dtensor_cfg.enabled=True policy.dtensor_cfg.tensor_parallel_size=8 policy.dtensor_cfg.sequence_parallel=True policy.dtensor_cfg.activation_checkpointing=True checkpointing.checkpoint_dir='results/qwen2.5-32b' logger.wandb_enabled=True logger.wandb.name='qwen2.5-32b'" \
CONTAINER=YOUR_CONTAINER \
MOUNTS="$PWD:$PWD" \
sbatch \
  --nodes=${NUM_ACTOR_NODES} \
  --account=YOUR_ACCOUNT \
  --job-name=YOUR_JOBNAME \
  --partition=YOUR_PARTITION \
  --time=4:0:0 \
  --gres=gpu:8 \
  ray.sub
```
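
Before submitting, you can confirm the weights actually landed in the shared cache with the standard Hugging Face CLI:

```sh
# Scan the cache to verify Qwen/Qwen2.5-32B is fully downloaded
HF_HOME=/path/to/hf_home huggingface-cli scan-cache
```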

## GRPO Multi-Turn

We also support multi-turn generation and training (tool use, games, and so on). As a reference example, the following trains a model to play a sliding puzzle game:

```sh
uv run python examples/run_grpo_sliding_puzzle.py
```
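
This script is driven by a YAML config like the math example, so the same style of dotted overrides should apply. A sketch reusing only override keys shown earlier on this page:

```sh
# Scale the sliding puzzle run to 8 GPUs and enable Weights & Biases logging
uv run python examples/run_grpo_sliding_puzzle.py \
    cluster.gpus_per_node=8 \
    logger.wandb_enabled=True
```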