Skip to content

Commit e6e40ec

Browse files
authored
Merge branch 'main' into romeyn/parquet-sequence-pack
2 parents b79f75f + c192333 commit e6e40ec

File tree

21 files changed

+1268
-46
lines changed

21 files changed

+1268
-46
lines changed

.github/workflows/release-docs.yml

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,20 @@ on:
2020
required: true
2121
type: boolean
2222
default: true
23-
version-number:
24-
description: Version number to release this as (use `latest` for main branch)
25-
required: true
23+
publish-as-latest:
24+
description: Publish as Latest stable version.
25+
required: false
26+
type: boolean
27+
default: true
28+
docs-version-override:
29+
description: Docs version if commit is not tagged
30+
required: false
2631
type: string
32+
default: ""
2733
notify-emails:
2834
description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
29-
required: true
30-
type: string
31-
aws-region:
32-
description: AWS region
3335
required: false
3436
type: string
35-
default: us-east-1
3637

3738
jobs:
3839
build-docs:
@@ -45,7 +46,7 @@ jobs:
4546
- uses: actions/checkout@v6
4647
with:
4748
repository: NVIDIA-NeMo/FW-CI-templates
48-
ref: v0.67.2
49+
ref: v0.72.0
4950
path: FW-CI-templates
5051

5152
- uses: ./FW-CI-templates/.github/actions/publish-docs
@@ -59,10 +60,11 @@ jobs:
5960
artifacts-name: docs-html
6061
artifacts-path: _build/html
6162
emails-csv: ${{ inputs.notify-emails && format('{0},{1}', vars.docs_release_emails, inputs.notify-emails) || vars.docs_release_emails }}
62-
overwrite-latest-on-tag: false
63+
overwrite-latest-on-tag: ${{ inputs.publish-as-latest }}
64+
docs-version-override: ${{ inputs.docs-version-override }}
6365
run-on-version-tag-only: ${{ github.ref_name != 'main' }}
6466
request-name: megatron-bridge-publish-docs-${{ github.run_id }}
65-
aws-region: ${{ inputs.aws-region }}
67+
aws-region: ${{ vars.DOCS_AWS_REGION }}
6668
aws-role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
6769
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
6870
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

docker/Dockerfile.ci

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
3232

3333
COPY pyproject.toml uv.lock /opt/Megatron-Bridge/
3434
COPY src/megatron/bridge/__init__.py src/megatron/bridge/package_info.py /opt/Megatron-Bridge/src/megatron/bridge/
35-
COPY 3rdparty/Megatron-LM/pyproject.toml /opt/Megatron-Bridge/3rdparty/Megatron-LM/
35+
COPY 3rdparty/Megatron-LM/pyproject.toml 3rdparty/Megatron-LM/setup.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/
3636
COPY 3rdparty/Megatron-LM/megatron/core/__init__.py 3rdparty/Megatron-LM/megatron/core/package_info.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
37+
COPY 3rdparty/Megatron-LM/megatron/core/datasets/Makefile 3rdparty/Megatron-LM/megatron/core/datasets/helpers.cpp /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/datasets/
3738

3839
# Build arg to skip --locked when testing with different MCore versions
3940
ARG MCORE_TRIGGERED_TESTING=false

docs/models/llm/nemotron3.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ checkpoint.pretrained_checkpoint=/path/to/output/megatron/ckpt
6666
```
6767

6868
Notes:
69-
- Default parallelism TP=1, EP=8, PP=1, CP=1. It is recommended to run this recipe on at least 2 H100 nodes (16 GPUs).
69+
- Default parallelism TP=1, EP=8, PP=1, CP=1. Running this recipe requires at least 2 H100 nodes (16 GPUs).
7070
- By default, the [SQuAD](https://huggingface.co/datasets/rajpurkar/squad) dataset is used. To use a customized dataset, see this [tutorial](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/tutorials/recipes/llama#quickstart)
7171
- Fine-tuning requires a pretrained megatron checkpoint, which can be obtained in "Import HF → Megatron" section above
7272

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Nemotron 3 Examples
2+
3+
This directory contains example scripts for Nemotron 3 language models.
4+
5+
For model introduction and architecture details, see the Nemotron 3 documentation.
6+
7+
## Workspace Configuration
8+
9+
All scripts use a `WORKSPACE` environment variable to define the base directory for checkpoints and results. By default, this is set to `/workspace`. You can override it:
10+
11+
```bash
12+
export WORKSPACE=/your/custom/path
13+
```
14+
15+
Directory structure:
16+
- `${WORKSPACE}/models/` - Converted checkpoints
17+
- `${WORKSPACE}/results/` - Training outputs and experiment results
18+
19+
## Checkpoint Conversion
20+
21+
See the [conversion.sh](conversion.sh) script for checkpoint conversion examples.
22+
23+
### Import HF → Megatron
24+
25+
To import the HF model to your desired Megatron path:
26+
27+
```bash
28+
python examples/conversion/convert_checkpoints.py import \
29+
--hf-model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \
30+
--megatron-path ${WORKSPACE}/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \
31+
--trust-remote-code
32+
```
33+
34+
### Export Megatron → HF
35+
36+
```bash
37+
python examples/conversion/convert_checkpoints.py export \
38+
--hf-model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \
39+
--megatron-path ${WORKSPACE}/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/iter_0000000 \
40+
--hf-path ${WORKSPACE}/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16-hf-export
41+
```
42+
43+
### Round-trip Validation
44+
45+
Multi-GPU round-trip validation between formats:
46+
47+
```bash
48+
python -m torch.distributed.run --nproc_per_node=8 \
49+
examples/conversion/hf_megatron_roundtrip_multi_gpu.py \
50+
--hf-model-id nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \
51+
--megatron-load-path ${WORKSPACE}/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/iter_0000000 \
52+
--tp 2 --pp 2 \
53+
--trust-remote-code
54+
```
55+
56+
## Training Recipes
57+
58+
- See: [bridge.recipes.nemotronh](../../../src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py)
59+
- Available recipes:
60+
- `nemotron_3_nano_pretrain_config`: Pretraining configuration
61+
- `nemotron_3_nano_finetune_config`: Finetuning configuration with PEFT support
62+
63+
Before training, ensure the following are configured:
64+
1. **Container Image**: Set `CONTAINER_IMAGE` in the SLURM scripts to your container path
65+
2. **Container Mounts**: (optional) Set `CONTAINER_MOUNTS` for data and workspace directories
66+
3. **Environment Variables**:
67+
- `HF_TOKEN`: to download models from HF Hub (if required)
68+
- `HF_HOME`: (optional) to avoid re-downloading models and datasets
69+
- `WANDB_API_KEY`: (optional) to enable WandB logging
70+
71+
All training scripts use SLURM for containerized multi-node training.
72+
73+
### Pretrain
74+
75+
See the [slurm_pretrain.sh](slurm_pretrain.sh) script for pretraining with configurable model parallelisms.
76+
77+
W&B report coming soon.
78+
79+
### Supervised Fine-Tuning (SFT)
80+
81+
See the [slurm_sft.sh](slurm_sft.sh) script for full parameter fine-tuning.
82+
83+
W&B report coming soon.
84+
85+
### Parameter-Efficient Fine-Tuning (PEFT) with LoRA
86+
87+
See the [slurm_peft.sh](slurm_peft.sh) script for LoRA fine-tuning.
88+
89+
W&B report coming soon.
90+
91+
## Evaluation
92+
93+
Coming soon.
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/usr/bin/env bash
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
set -xeuo pipefail
17+
18+
# Workspace directory for checkpoints and results
19+
WORKSPACE=${WORKSPACE:-/workspace}
20+
21+
MODEL_NAME=NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
22+
HF_MODEL_ID=nvidia/$MODEL_NAME
23+
24+
# Import HF → Megatron
25+
uv run python examples/conversion/convert_checkpoints.py import \
26+
--hf-model $HF_MODEL_ID \
27+
--megatron-path ${WORKSPACE}/models/$MODEL_NAME \
28+
--trust-remote-code
29+
30+
# Export Megatron → HF
31+
uv run python examples/conversion/convert_checkpoints.py export \
32+
--hf-model $HF_MODEL_ID \
33+
--megatron-path ${WORKSPACE}/models/$MODEL_NAME/iter_0000000 \
34+
--hf-path ${WORKSPACE}/models/$MODEL_NAME-hf-export
35+
36+
# Round-trip validation
37+
uv run python -m torch.distributed.run --nproc_per_node=8 \
38+
examples/conversion/hf_megatron_roundtrip_multi_gpu.py \
39+
--hf-model-id $HF_MODEL_ID \
40+
--megatron-load-path ${WORKSPACE}/models/$MODEL_NAME/iter_0000000 \
41+
--tp 2 --ep 8 \
42+
--trust-remote-code
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
#!/bin/bash
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# ==============================================================================
17+
# Nemotron 3 Nano Parameter-Efficient Fine-Tuning (PEFT) with LoRA
18+
#
19+
# Nemotron 3 Nano is a 30B parameter model with A3B (Active 3 Billion) architecture
20+
# LoRA/DoRA significantly reduces memory requirements
21+
# Supports multiple parallelism configs: each "TP,PP,EP,CP,SP" runs sequentially.
22+
#
23+
# Usage:
24+
# 1. Modify the #SBATCH directives below for your cluster
25+
# 2. Set CONTAINER_IMAGE to your container path
26+
# 3. Set PARALLELISM_CONFIGS (TP,PP,EP,CP,SP per entry; CP = context parallel size, 1 = disabled)
27+
# 4. Submit: sbatch slurm_peft.sh
28+
# ==============================================================================
29+
30+
#SBATCH --job-name=nemotron3-lora
31+
#SBATCH --nodes=2
32+
#SBATCH --ntasks-per-node=8
33+
#SBATCH --gpus-per-node=8
34+
#SBATCH --time=08:00:00
35+
#SBATCH --partition=gpu
36+
#SBATCH --account=my_account
37+
#SBATCH --output=logs/nemotron3_lora_%j.out
38+
#SBATCH --error=logs/nemotron3_lora_%j.err
39+
#SBATCH --exclusive
40+
41+
# ==============================================================================
42+
# CONFIGURATION
43+
# ==============================================================================
44+
45+
# Workspace directory for checkpoints and results
46+
WORKSPACE=${WORKSPACE:-/workspace}
47+
48+
# Model and training configurations
49+
PRETRAINED_CHECKPOINT=${WORKSPACE}/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
50+
MODEL_NAME=nemotron_3_nano
51+
DATASET_NAME=squad
52+
SEQ_LENGTH=2048
53+
TRAIN_ITERS=50
54+
GLOBAL_BATCH_SIZE=8
55+
MICRO_BATCH_SIZE=1
56+
EVAL_ITERS=10
57+
LR_WARMUP_ITERS=5
58+
LOG_INTERVAL=1
59+
WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
60+
GLOBAL_BATCH_SIZE=16
61+
62+
# Parallelism configs: "TP,PP,EP,CP,SP" per entry
63+
PARALLELISM_CONFIGS=("4,1,8,1,True" "2,2,8,1,True" "2,1,8,2,True")
64+
65+
# Container image (required)
66+
CONTAINER_IMAGE=""
67+
# CONTAINER_IMAGE="/path/to/container.sqsh"
68+
69+
# Container mounts (optional, space-separated)
70+
CONTAINER_MOUNTS=""
71+
# CONTAINER_MOUNTS="/data:/data /workspace:/workspace"
72+
73+
# ==============================================================================
74+
# Environment Setup
75+
# ==============================================================================
76+
77+
# NCCL optimizations for large-scale training
78+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
79+
export NCCL_NVLS_ENABLE=0
80+
81+
# UV cache on shared filesystem (recommended for multi-node setups)
82+
# Pre-sync once before submitting jobs: UV_CACHE_DIR=/path/to/cache uv sync
83+
# export UV_CACHE_DIR="/path/to/shared/uv_cache"
84+
85+
# HuggingFace cache directory (recommended for shared filesystem)
86+
# export HF_HOME="/path/to/shared/HF_HOME"
87+
88+
# Authentication tokens (set these for your environment)
89+
# export HF_TOKEN="hf_your_token_here"
90+
# export WANDB_API_KEY="your_wandb_key_here"
91+
92+
# ==============================================================================
93+
# Job Execution
94+
# ==============================================================================
95+
96+
echo "======================================"
97+
echo "Nemotron 3 Nano LoRA Fine-Tuning Job"
98+
echo "======================================"
99+
echo "Job ID: $SLURM_JOB_ID"
100+
echo "Nodes: $SLURM_JOB_NUM_NODES"
101+
echo "GPUs per node: $SLURM_GPUS_PER_NODE"
102+
echo "Model: $MODEL_NAME"
103+
echo "Parallelism configs: ${PARALLELISM_CONFIGS[*]}"
104+
echo "PEFT: LoRA"
105+
echo "======================================"
106+
107+
# Create logs directory if it doesn't exist
108+
mkdir -p logs
109+
110+
# Require container image
111+
if [ -z "$CONTAINER_IMAGE" ]; then
112+
echo "ERROR: CONTAINER_IMAGE must be set. Please specify a valid container image."
113+
exit 1
114+
fi
115+
116+
# Build srun command (shared across configs)
117+
SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE"
118+
if [ -n "$CONTAINER_MOUNTS" ]; then
119+
SRUN_CMD="$SRUN_CMD --container-mounts=$CONTAINER_MOUNTS"
120+
fi
121+
echo "SRUN base: $SRUN_CMD"
122+
echo "======================================"
123+
124+
# Run each parallelism config in sequence
125+
CONFIG_INDEX=0
126+
for CONFIG in "${PARALLELISM_CONFIGS[@]}"; do
127+
IFS=',' read -r TP PP EP CP SP <<< "$CONFIG"
128+
CONFIG_INDEX=$((CONFIG_INDEX + 1))
129+
echo ""
130+
echo "======================================"
131+
echo "Config $CONFIG_INDEX/${#PARALLELISM_CONFIGS[@]}: TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP"
132+
echo "======================================"
133+
134+
# Build CLI overrides for this config
135+
CLI_OVERRIDES=" \
136+
checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
137+
train.train_iters=$TRAIN_ITERS \
138+
train.global_batch_size=$GLOBAL_BATCH_SIZE \
139+
train.micro_batch_size=$MICRO_BATCH_SIZE \
140+
train.eval_iters=$EVAL_ITERS \
141+
scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
142+
checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_lora_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \
143+
logger.log_interval=$LOG_INTERVAL \
144+
logger.wandb_project=$WANDB_PROJECT \
145+
logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_lora_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \
146+
model.tensor_model_parallel_size=$TP \
147+
model.pipeline_model_parallel_size=$PP \
148+
model.expert_model_parallel_size=$EP \
149+
model.sequence_parallel=$SP \
150+
model.context_parallel_size=$CP \
151+
model.calculate_per_token_loss=True \
152+
train.global_batch_size=$GLOBAL_BATCH_SIZE \
153+
dataset.packed_sequence_specs.pad_seq_to_mult=$((CP * 2)) \
154+
dataset.packed_sequence_specs.packed_sequence_size=$SEQ_LENGTH \
155+
dataset.seq_length=$SEQ_LENGTH \
156+
model.seq_length=$SEQ_LENGTH
157+
"
158+
159+
CMD="python /opt/Megatron-Bridge/scripts/training/run_recipe.py"
160+
CMD="$CMD --recipe ${MODEL_NAME}_finetune_config"
161+
CMD="$CMD --peft_scheme lora"
162+
# Collapse newlines so bash -c receives a single command
163+
CMD="$CMD $(echo "$CLI_OVERRIDES" | tr '\n' ' ' | sed 's/ \+/ /g')"
164+
165+
echo "Executing command..."
166+
echo $CMD
167+
echo "======================================"
168+
169+
170+
171+
$SRUN_CMD bash -c "$CMD"
172+
RUN_EXIT=$?
173+
if [ $RUN_EXIT -ne 0 ]; then
174+
echo "ERROR: Config TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP failed with exit code $RUN_EXIT"
175+
exit $RUN_EXIT
176+
fi
177+
done
178+
179+
echo "======================================"
180+
echo "Job completed (all ${#PARALLELISM_CONFIGS[@]} configs)"
181+
echo "======================================"

0 commit comments

Comments
 (0)