NVIDIA-NeMo
diff --git a/‎.github/workflows/release-docs.yml‎
Lines changed: 13 additions & 11 deletions b/‎.github/workflows/release-docs.yml‎
Lines changed: 13 additions & 11 deletions
diff --git a/‎docker/Dockerfile.ci‎
Lines changed: 2 additions & 1 deletion b/‎docker/Dockerfile.ci‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/models/llm/nemotron3.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/models/llm/nemotron3.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/models/nemotron_3/README.md‎
Lines changed: 93 additions & 0 deletions b/‎examples/models/nemotron_3/README.md‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎examples/models/nemotron_3/conversion.sh‎
Lines changed: 42 additions & 0 deletions b/‎examples/models/nemotron_3/conversion.sh‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎examples/models/nemotron_3/slurm_peft.sh‎
Lines changed: 181 additions & 0 deletions b/‎examples/models/nemotron_3/slurm_peft.sh‎
Lines changed: 181 additions & 0 deletions
@@ -20,19 +20,20 @@ on:
         required: true
         type: boolean
         default: true
-      version-number:
-        description: Version number to release this as (use `latest` for main branch)
-        required: true
+      publish-as-latest:
+        description: Publish as Latest stable version.
+        required: false
+        type: boolean
+        default: true
+      docs-version-override:
+        description: Docs version if commit is not tagged
+        required: false
         type: string
+        default: ""
       notify-emails:
         description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
-        required: true
-        type: string
-      aws-region:
-        description: AWS region
         required: false
         type: string
-        default: us-east-1
 
 jobs:
   build-docs:
@@ -45,7 +46,7 @@ jobs:
       - uses: actions/checkout@v6
         with:
           repository: NVIDIA-NeMo/FW-CI-templates
-          ref: v0.67.2
+          ref: v0.72.0
           path: FW-CI-templates
 
       - uses: ./FW-CI-templates/.github/actions/publish-docs
@@ -59,10 +60,11 @@ jobs:
           artifacts-name: docs-html
           artifacts-path: _build/html
           emails-csv: ${{ inputs.notify-emails && format('{0},{1}', vars.docs_release_emails, inputs.notify-emails) || vars.docs_release_emails }}
-          overwrite-latest-on-tag: false
+          overwrite-latest-on-tag: ${{ inputs.publish-as-latest }}
+          docs-version-override: ${{ inputs.docs-version-override }}
           run-on-version-tag-only: ${{ github.ref_name != 'main' }}
           request-name: megatron-bridge-publish-docs-${{ github.run_id }}
-          aws-region: ${{ inputs.aws-region }}
+          aws-region: ${{ vars.DOCS_AWS_REGION }}
           aws-role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
 
@@ -32,8 +32,9 @@ RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
 
 COPY pyproject.toml uv.lock /opt/Megatron-Bridge/
 COPY src/megatron/bridge/__init__.py src/megatron/bridge/package_info.py /opt/Megatron-Bridge/src/megatron/bridge/
-COPY 3rdparty/Megatron-LM/pyproject.toml /opt/Megatron-Bridge/3rdparty/Megatron-LM/
+COPY 3rdparty/Megatron-LM/pyproject.toml 3rdparty/Megatron-LM/setup.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/
 COPY 3rdparty/Megatron-LM/megatron/core/__init__.py 3rdparty/Megatron-LM/megatron/core/package_info.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
+COPY 3rdparty/Megatron-LM/megatron/core/datasets/Makefile 3rdparty/Megatron-LM/megatron/core/datasets/helpers.cpp /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/datasets/
 
 # Build arg to skip --locked when testing with different MCore versions
 ARG MCORE_TRIGGERED_TESTING=false
 
@@ -66,7 +66,7 @@ checkpoint.pretrained_checkpoint=/path/to/output/megatron/ckpt
 ```
 
 Notes:
-- Default parallelism TP=1, EP=8, PP=1, CP=1. It is recommended to run this recipe on at least 2 H100 nodes (16 GPUs).
+- Default parallelism TP=1, EP=8, PP=1, CP=1. Running this recipe requires at least 2 H100 nodes (16 GPUs).
 - By default, the [SQuAD](https://huggingface.co/datasets/rajpurkar/squad) dataset is used. To use a customized dataset, see this [tutorial](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/tutorials/recipes/llama#quickstart).
 - Fine-tuning requires a pretrained Megatron checkpoint, which can be obtained by following the "Import HF → Megatron" section above.
 
 
@@ -0,0 +1,93 @@
+# Nemotron 3 Examples
+
+This directory contains example scripts for Nemotron 3 language models.
+
+For model introduction and architecture details, see the [Nemotron 3 documentation](../../../docs/models/llm/nemotron3.md).
+
+## Workspace Configuration
+
+All scripts use a `WORKSPACE` environment variable to define the base directory for checkpoints and results. By default, this is set to `/workspace`. You can override it:
+
+```bash
+export WORKSPACE=/your/custom/path
+```
+
+Directory structure:
+- `${WORKSPACE}/models/` - Converted checkpoints
+- `${WORKSPACE}/results/` - Training outputs and experiment results
+
+## Checkpoint Conversion
+
+See the [conversion.sh](conversion.sh) script for checkpoint conversion examples.
+
+### Import HF → Megatron
+
+To import the HF model to your desired Megatron path:
+
+```bash
+python examples/conversion/convert_checkpoints.py import \
+    --hf-model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \
+    --megatron-path ${WORKSPACE}/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \
+    --trust-remote-code
+```
+
+### Export Megatron → HF
+
+```bash
+python examples/conversion/convert_checkpoints.py export \
+    --hf-model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \
+    --megatron-path ${WORKSPACE}/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/iter_0000000 \
+    --hf-path ${WORKSPACE}/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16-hf-export
+```
+
+### Round-trip Validation
+
+Multi-GPU round-trip validation between formats:
+
+```bash
+python -m torch.distributed.run --nproc_per_node=8 \
+    examples/conversion/hf_megatron_roundtrip_multi_gpu.py \
+    --hf-model-id nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \
+    --megatron-load-path ${WORKSPACE}/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/iter_0000000 \
+    --tp 2 --pp 2 \
+    --trust-remote-code
+```
+
+## Training Recipes
+
+- See: [bridge.recipes.nemotronh](../../../src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py)
+- Available recipes:
+  - `nemotron_3_nano_pretrain_config`: Pretraining configuration
+  - `nemotron_3_nano_finetune_config`: Finetuning configuration with PEFT support
+
+Before training, ensure the following are configured:
+1. **Container Image**: Set `CONTAINER_IMAGE` in the SLURM scripts to your container path
+2. **Container Mounts**: (optional) Set `CONTAINER_MOUNTS` for data and workspace directories
+3. **Environment Variables**:
+   - `HF_TOKEN`: to download models from HF Hub (if required)
+   - `HF_HOME`: (optional) to avoid re-downloading models and datasets
+   - `WANDB_API_KEY`: (optional) to enable WandB logging
+
+All training scripts use SLURM for containerized multi-node training.
+
+### Pretrain
+
+See the [slurm_pretrain.sh](slurm_pretrain.sh) script for pretraining with configurable model parallelisms.
+
+W&B report coming soon.
+
+### Supervised Fine-Tuning (SFT)
+
+See the [slurm_sft.sh](slurm_sft.sh) script for full parameter fine-tuning.
+
+W&B report coming soon.
+
+### Parameter-Efficient Fine-Tuning (PEFT) with LoRA
+
+See the [slurm_peft.sh](slurm_peft.sh) script for LoRA fine-tuning.
+
+W&B report coming soon.
+
+## Evaluation
+
+Coming soon.
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -xeuo pipefail
+
+# Workspace directory for checkpoints and results
+WORKSPACE=${WORKSPACE:-/workspace}
+
+MODEL_NAME=NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
+HF_MODEL_ID=nvidia/$MODEL_NAME
+
+# Import HF → Megatron
+uv run python examples/conversion/convert_checkpoints.py import \
+    --hf-model $HF_MODEL_ID \
+    --megatron-path ${WORKSPACE}/models/$MODEL_NAME \
+    --trust-remote-code
+
+# Export Megatron → HF
+uv run python examples/conversion/convert_checkpoints.py export \
+    --hf-model $HF_MODEL_ID \
+    --megatron-path ${WORKSPACE}/models/$MODEL_NAME/iter_0000000 \
+    --hf-path ${WORKSPACE}/models/$MODEL_NAME-hf-export
+
+# Round-trip validation
+uv run python -m torch.distributed.run --nproc_per_node=8 \
+    examples/conversion/hf_megatron_roundtrip_multi_gpu.py \
+    --hf-model-id $HF_MODEL_ID \
+    --megatron-load-path ${WORKSPACE}/models/$MODEL_NAME/iter_0000000 \
+    --tp 2 --ep 8 \
+    --trust-remote-code
@@ -0,0 +1,181 @@
+#!/bin/bash
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ==============================================================================
+# Nemotron 3 Nano Parameter-Efficient Fine-Tuning (PEFT) with LoRA
+#
+# Nemotron 3 Nano is a 30B parameter model with A3B (Active 3 Billion) architecture
+# LoRA/DoRA significantly reduces memory requirements
+# Supports multiple parallelism configs: each "TP,PP,EP,CP,SP" runs sequentially.
+#
+# Usage:
+#   1. Modify the #SBATCH directives below for your cluster
+#   2. Set CONTAINER_IMAGE to your container path
+#   3. Set PARALLELISM_CONFIGS (TP,PP,EP,CP,SP per entry; CP = context parallel size, 1 = disabled)
+#   4. Create the log directory used by --output/--error, then submit: mkdir -p logs && sbatch slurm_peft.sh
+# ==============================================================================
+
+#SBATCH --job-name=nemotron3-lora
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8
+#SBATCH --time=08:00:00
+#SBATCH --partition=gpu
+#SBATCH --account=my_account
+#SBATCH --output=logs/nemotron3_lora_%j.out
+#SBATCH --error=logs/nemotron3_lora_%j.err
+#SBATCH --exclusive
+
+# ==============================================================================
+# CONFIGURATION
+# ==============================================================================
+
+# Workspace directory for checkpoints and results
+WORKSPACE=${WORKSPACE:-/workspace}
+
+# Model and training configurations
+PRETRAINED_CHECKPOINT=${WORKSPACE}/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
+MODEL_NAME=nemotron_3_nano
+DATASET_NAME=squad
+SEQ_LENGTH=2048
+TRAIN_ITERS=50
+GLOBAL_BATCH_SIZE=8
+MICRO_BATCH_SIZE=1
+EVAL_ITERS=10
+LR_WARMUP_ITERS=5
+LOG_INTERVAL=1
+WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
+GLOBAL_BATCH_SIZE=16
+
+# Parallelism configs: "TP,PP,EP,CP,SP" per entry
+PARALLELISM_CONFIGS=("4,1,8,1,True" "2,2,8,1,True" "2,1,8,2,True")
+
+# Container image (required)
+CONTAINER_IMAGE=""
+# CONTAINER_IMAGE="/path/to/container.sqsh"
+
+# Container mounts (optional, space-separated)
+CONTAINER_MOUNTS=""
+# CONTAINER_MOUNTS="/data:/data /workspace:/workspace"
+
+# ==============================================================================
+# Environment Setup
+# ==============================================================================
+
+# NCCL optimizations for large-scale training
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export NCCL_NVLS_ENABLE=0
+
+# UV cache on shared filesystem (recommended for multi-node setups)
+# Pre-sync once before submitting jobs: UV_CACHE_DIR=/path/to/cache uv sync
+# export UV_CACHE_DIR="/path/to/shared/uv_cache"
+
+# HuggingFace cache directory (recommended for shared filesystem)
+# export HF_HOME="/path/to/shared/HF_HOME"
+
+# Authentication tokens (set these for your environment)
+# export HF_TOKEN="hf_your_token_here"
+# export WANDB_API_KEY="your_wandb_key_here"
+
+# ==============================================================================
+# Job Execution
+# ==============================================================================
+
+echo "======================================"
+echo "Nemotron 3 Nano LoRA Fine-Tuning Job"
+echo "======================================"
+echo "Job ID: $SLURM_JOB_ID"
+echo "Nodes: $SLURM_JOB_NUM_NODES"
+echo "GPUs per node: $SLURM_GPUS_PER_NODE"
+echo "Model: $MODEL_NAME"
+echo "Parallelism configs: ${PARALLELISM_CONFIGS[*]}"
+echo "PEFT: LoRA"
+echo "======================================"
+
+# Create logs directory if it doesn't exist
+mkdir -p logs
+
+# Require container image
+if [ -z "$CONTAINER_IMAGE" ]; then
+    echo "ERROR: CONTAINER_IMAGE must be set. Please specify a valid container image."
+    exit 1
+fi
+
+# Build srun command (shared across configs)
+SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE"
+if [ -n "$CONTAINER_MOUNTS" ]; then
+    SRUN_CMD="$SRUN_CMD --container-mounts=$CONTAINER_MOUNTS"
+fi
+echo "SRUN base: $SRUN_CMD"
+echo "======================================"
+
+# Run each parallelism config in sequence
+CONFIG_INDEX=0
+for CONFIG in "${PARALLELISM_CONFIGS[@]}"; do
+    IFS=',' read -r TP PP EP CP SP <<< "$CONFIG"
+    CONFIG_INDEX=$((CONFIG_INDEX + 1))
+    echo ""
+    echo "======================================"
+    echo "Config $CONFIG_INDEX/${#PARALLELISM_CONFIGS[@]}: TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP"
+    echo "======================================"
+
+    # Build CLI overrides for this config
+    CLI_OVERRIDES=" \
+        checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
+        train.train_iters=$TRAIN_ITERS \
+        train.global_batch_size=$GLOBAL_BATCH_SIZE \
+        train.micro_batch_size=$MICRO_BATCH_SIZE \
+        train.eval_iters=$EVAL_ITERS \
+        scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
+        checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_lora_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \
+        logger.log_interval=$LOG_INTERVAL \
+        logger.wandb_project=$WANDB_PROJECT \
+        logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \
+        model.tensor_model_parallel_size=$TP \
+        model.pipeline_model_parallel_size=$PP \
+        model.expert_model_parallel_size=$EP \
+        model.sequence_parallel=$SP \
+        model.context_parallel_size=$CP \
+        model.calculate_per_token_loss=True \
+        train.global_batch_size=$GLOBAL_BATCH_SIZE \
+        dataset.packed_sequence_specs.pad_seq_to_mult=$((CP * 2)) \
+        dataset.packed_sequence_specs.packed_sequence_size=$SEQ_LENGTH \
+        dataset.seq_length=$SEQ_LENGTH \
+        model.seq_length=$SEQ_LENGTH
+    "
+
+    CMD="python /opt/Megatron-Bridge/scripts/training/run_recipe.py"
+    CMD="$CMD --recipe ${MODEL_NAME}_finetune_config"
+    CMD="$CMD --peft_scheme lora"
+    # Collapse newlines so bash -c receives a single command
+    CMD="$CMD $(echo "$CLI_OVERRIDES" | tr '\n' ' ' | sed 's/  \+/ /g')"
+
+    echo "Executing command..."
+    echo $CMD
+    echo "======================================"
+
+    
+
+    $SRUN_CMD bash -c "$CMD"
+    RUN_EXIT=$?
+    if [ $RUN_EXIT -ne 0 ]; then
+        echo "ERROR: Config TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP failed with exit code $RUN_EXIT"
+        exit $RUN_EXIT
+    fi
+done
+
+echo "======================================"
+echo "Job completed (all ${#PARALLELISM_CONFIGS[@]} configs)"
+echo "======================================"