
Commit 9d21653

Author: Huy Vu2
Parents: 1e1d374 + 7a50d2e

Merge remote-tracking branch 'origin/main' into huvu/vlm_generation_fix

File tree

19 files changed: +765 −29 lines

.github/workflows/cicd-main.yml

Lines changed: 1 addition & 0 deletions

```diff
@@ -395,6 +395,7 @@ jobs:
           - script: L2_Launch_models_qwen
           - script: L2_Launch_models_qwen_quantization
           - script: L2_Launch_models_qwen_vl
+          - script: L2_Launch_recipes_gpt_oss
           - script: L2_Launch_recipes_llama_1b
           - script: L2_Launch_recipes_llama_3b
           - script: L2_Launch_recipes_llama_distill
```

docs/bridge-guide.md

Lines changed: 20 additions & 7 deletions

````diff
@@ -199,20 +199,33 @@ model = bridge.to_megatron_model() # Uses default settings
 ```
 
 ### 3. Leverage the Parameter Streaming API
-You can stream converted weights from Megatron to HF without saving to disk. You can also use config-only loading for architecture exploration without loading weights:
+You can stream converted weights from Megatron to HF without saving to disk:
 
 ```python
 # ✅ Use streaming for large models
 for name, weight in bridge.export_hf_weights(model, cpu=True):
     process_weight(name, weight)
+```
+
+### 4. Use `from_hf_pretrained` for Export Workflows
+
+When exporting Megatron checkpoints back to 🤗 Hugging Face format, always use `from_hf_pretrained()` instead of `from_hf_config()`. The `from_hf_config()` method does not load the tokenizer and other artifacts required for saving a complete 🤗 Hugging Face checkpoint:
 
-# ✅ Use config-only loading for architecture exploration
-config = AutoConfig.from_pretrained("meta-llama/Llama-3-8B")
-bridge = AutoBridge.from_hf_config(config)
-transformer_config = bridge.transformer_config
-print(f"Hidden size: {transformer_config.hidden_size}")
+```python
+from megatron.bridge import AutoBridge
+
+# ✅ Correct: Use from_hf_pretrained for export workflows
+bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B")
+bridge.export_ckpt("./megatron_checkpoints/llama32_1b", "./hf_exports/llama32_1b")
+
+# ❌ Avoid: from_hf_config lacks artifacts needed for saving
+# config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B")
+# bridge = AutoBridge.from_hf_config(config)  # Missing tokenizer, etc.
+# bridge.export_ckpt(...)  # Will fail!
 ```
 
+The `from_hf_config()` method is only suitable for architecture exploration and introspection (e.g., inspecting `transformer_config`), not for checkpoint conversion workflows.
+
 For more examples and advanced usage patterns, see the `examples/conversion/` directory in the repository.
 
 ## Convenience Workflows (Commands)
@@ -229,7 +242,7 @@ python -c "from megatron.bridge import AutoBridge; AutoBridge.import_ckpt('meta-
 ### Megatron → HF export (one call)
 
 ```bash
-python -c "from megatron.bridge import AutoBridge; from transformers import AutoConfig; cfg=AutoConfig.from_pretrained('meta-llama/Llama-3.2-1B'); b=AutoBridge.from_hf_config(cfg); b.export_ckpt('./megatron_checkpoints/llama32_1b','./hf_exports/llama32_1b')"
+python -c "from megatron.bridge import AutoBridge; b=AutoBridge.from_hf_pretrained('meta-llama/Llama-3.2-1B'); b.export_ckpt('./megatron_checkpoints/llama32_1b','./hf_exports/llama32_1b')"
 ```
 
 ### Create Megatron models and run locally
````
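
Note on the bridge-guide change: the config-only pattern removed from section 3 remains valid for introspection, which the new section 4 text points out. A minimal sketch of that remaining use case, reusing only the `AutoConfig` / `AutoBridge.from_hf_config` calls shown in the removed lines:

```python
from transformers import AutoConfig

from megatron.bridge import AutoBridge

# Config-only loading: inspect the mapped Megatron architecture
# without downloading or materializing any weights.
config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B")
bridge = AutoBridge.from_hf_config(config)

transformer_config = bridge.transformer_config
print(f"Hidden size: {transformer_config.hidden_size}")
```

As the new guidance says, stop there: without the tokenizer and other artifacts, calling `export_ckpt()` on such a bridge will fail.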

scripts/performance/configs/deepseek/deepseek_llm_pretrain.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -60,7 +60,7 @@ def deepseek_v3_pretrain_config_gb300(
         pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
         virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
         moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
-        layout=None,
+        layout=base_cfg.pp_layout,
     )
     set_deepseek_v3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
```

scripts/performance/configs/deepseek/deepseek_workload_base_configs.py

Lines changed: 14 additions & 2 deletions

```diff
@@ -54,7 +54,16 @@
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace(
+    BASE_DEEPSEEK_V3_CONFIG,
+    micro_batch_size=2,
+    pipeline_model_parallel_size=2,
+    virtual_pipeline_model_parallel_size=8,
+    pp_layout="Et*4|(t*4|)*14tmL",
+    expert_model_parallel_size=32,
+    cuda_graph_scope=[],
+    recompute_modules=["mla_up_proj"],
+)
 
 
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 = replace(
@@ -133,7 +142,10 @@
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = replace(
+    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1,
+    global_batch_size=4096,
+)
 
 
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 = replace(
```
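
Note: the V1/V2 variants above are derived with `dataclasses.replace`, which copies a dataclass instance while overriding selected fields. A self-contained sketch of the idiom (the config class and field values here are illustrative, not the real workload configs):

```python
from dataclasses import dataclass, replace
from typing import Optional

@dataclass(frozen=True)
class PretrainConfig:
    micro_batch_size: int = 1
    global_batch_size: int = 8192
    pipeline_model_parallel_size: int = 4
    pp_layout: Optional[str] = None

BASE = PretrainConfig()

# Derive a variant by overriding only the fields that differ.
NVFP4_V1 = replace(BASE, micro_batch_size=2, pipeline_model_parallel_size=2)

# Variants can chain: V2 tweaks V1 further, as in the second hunk above.
NVFP4_V2 = replace(NVFP4_V1, global_batch_size=4096)

assert BASE.micro_batch_size == 1       # the base config is untouched
assert NVFP4_V2.micro_batch_size == 2   # inherited from NVFP4_V1
```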

scripts/performance/perf_plugins.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -446,6 +446,10 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
             self.train_task,
         )
 
+        # Set NVFP4-specific environment variables
+        if self.compute_dtype == "nvfp4":
+            executor.env_vars["NVTE_USE_FAST_MATH"] = "1"
+
 
 @dataclass
 class PyTorchProfilerPluginScriptArgs:
```
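
Note: the hook reduces to a dtype-gated mutation of the executor's environment. A sketch with a hypothetical stand-in for `run.Executor` (the diff only shows that the real executor exposes an `env_vars` mapping):

```python
from dataclasses import dataclass, field
from typing import Dict

@dataclass
class FakeExecutor:
    """Illustrative stand-in for run.Executor."""
    env_vars: Dict[str, str] = field(default_factory=dict)

def apply_nvfp4_env(executor: FakeExecutor, compute_dtype: str) -> None:
    # Only NVFP4 runs opt into the NVTE fast-math path.
    if compute_dtype == "nvfp4":
        executor.env_vars["NVTE_USE_FAST_MATH"] = "1"

ex = FakeExecutor()
apply_nvfp4_env(ex, "nvfp4")
assert ex.env_vars == {"NVTE_USE_FAST_MATH": "1"}
```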

scripts/performance/utils/executors.py

Lines changed: 6 additions & 3 deletions

```diff
@@ -38,13 +38,13 @@
 
 PERF_ENV_VARS = {
     "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
-    "TRANSFORMERS_OFFLINE": "1",  # Enable online downloads from HuggingFace
+    "TRANSFORMERS_OFFLINE": "1",  # Disable online downloads from HuggingFace
     "TOKENIZERS_PARALLELISM": "False",  # Restrict warning message prints
     "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
     "NVTE_NORM_FWD_USE_CUDNN": "1",
     "NVTE_NORM_BWD_USE_CUDNN": "1",
     "TORCH_NCCL_HIGH_PRIORITY": "1",
-    "HF_HUB_OFFLINE": "0",
+    "HF_HUB_OFFLINE": "1",
 }
 
 
@@ -86,6 +86,7 @@ def slurm_executor(
     srun_args = custom_srun_args.copy() + [
         "--mpi=pmix",
         "--no-container-mount-home",
+        "--container-writable",
     ]
 
     if log_dir is not None:
@@ -107,7 +108,9 @@
         PERF_ENV_VARS["NEMO_HOME"] = nemo_home
         mounts.extend([f"{nemo_home}:{nemo_home}"])
     if hf_token is not None:
-        PERF_ENV_VARS.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"})
+        PERF_ENV_VARS["HF_TOKEN"] = hf_token
+        PERF_ENV_VARS["TRANSFORMERS_OFFLINE"] = "0"
+        PERF_ENV_VARS["HF_HUB_OFFLINE"] = "0"
 
     PERF_ENV_VARS.update(custom_env_vars)
     mounts.extend(custom_mounts)
```
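
Note: the net effect of these hunks is an offline-by-default policy: both `TRANSFORMERS_OFFLINE=1` and `HF_HUB_OFFLINE=1` block Hugging Face network access, and both are flipped to `"0"` together only when a token is supplied. A sketch of that resolution logic in isolation (`resolve_hf_env` is a hypothetical helper; only the env var names come from the file):

```python
from typing import Dict, Optional

# Offline by default, mirroring the relevant PERF_ENV_VARS entries.
OFFLINE_DEFAULTS: Dict[str, str] = {
    "TRANSFORMERS_OFFLINE": "1",  # transformers: no downloads
    "HF_HUB_OFFLINE": "1",        # huggingface_hub: no downloads
}

def resolve_hf_env(hf_token: Optional[str]) -> Dict[str, str]:
    """Flip both offline switches together when a token permits downloads."""
    env = dict(OFFLINE_DEFAULTS)
    if hf_token is not None:
        env["HF_TOKEN"] = hf_token
        env["TRANSFORMERS_OFFLINE"] = "0"
        env["HF_HUB_OFFLINE"] = "0"
    return env

assert resolve_hf_env(None)["HF_HUB_OFFLINE"] == "1"
assert resolve_hf_env("hf_example_token")["TRANSFORMERS_OFFLINE"] == "0"
```

Keeping the two flags in lockstep fixes the earlier mismatch, where `HF_HUB_OFFLINE` defaulted to `"0"` while `TRANSFORMERS_OFFLINE` was `"1"`.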

scripts/performance/utils/overrides.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -374,7 +374,7 @@ def set_post_overrides(
     dp = int(num_gpus / (tp * pp * cp))
     logger.info(f"DP: {dp}; TP: {tp}; PP: {pp}; CP: {cp}; VP: {vp}")
     ## NOTE: overlap_param_gather_with_optimizer_step causes NaN grad norm for fp8_mx. Disabling it until the issue is resolved.
-    if dp > 1 and pp > 1 and vp > 1 and compute_dtype != "fp8_mx":
+    if dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4"):
         recipe.optimizer.overlap_param_gather_with_optimizer_step = True
     if hasattr(recipe, "comm_overlap") and isinstance(recipe.comm_overlap, CommOverlapConfig):
         recipe.comm_overlap.overlap_param_gather_with_optimizer_step = True
```
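
Note: the guard now treats `nvfp4` like `fp8_mx`, skipping the overlap optimization for both. The condition in isolation (a simplified mirror, not the real recipe objects):

```python
# Precisions where overlap_param_gather_with_optimizer_step is known to
# produce NaN grad norms, per the NOTE in the diff above.
OVERLAP_UNSAFE_DTYPES = ("fp8_mx", "nvfp4")

def should_overlap_param_gather(dp: int, pp: int, vp: int, compute_dtype: str) -> bool:
    return dp > 1 and pp > 1 and vp > 1 and compute_dtype not in OVERLAP_UNSAFE_DTYPES

assert should_overlap_param_gather(2, 2, 2, "bf16") is True
assert should_overlap_param_gather(2, 2, 2, "nvfp4") is False
```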

scripts/performance/utils/utils.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -62,6 +62,9 @@ class WorkloadBaseConfig:
     moe_a2a_overlap: Optional[bool] = False
     peft: Optional[str] = None
 
+    # Pipeline parallelism layout
+    pp_layout: Optional[str] = None
+
     @property
     def sequence_parallel(self) -> bool:
         """Get the sequence parallel flag."""
```
