
Commit 9d21653

Author: Huy Vu2
Parents: 1e1d374 + 7a50d2e

Merge remote-tracking branch 'origin/main' into huvu/vlm_generation_fix

File tree

19 files changed: +765 −29 lines

.github/workflows/cicd-main.yml

Lines changed: 1 addition & 0 deletions

```diff
@@ -395,6 +395,7 @@ jobs:
           - script: L2_Launch_models_qwen
           - script: L2_Launch_models_qwen_quantization
           - script: L2_Launch_models_qwen_vl
+          - script: L2_Launch_recipes_gpt_oss
           - script: L2_Launch_recipes_llama_1b
           - script: L2_Launch_recipes_llama_3b
           - script: L2_Launch_recipes_llama_distill
```

docs/bridge-guide.md

Lines changed: 20 additions & 7 deletions

````diff
@@ -199,20 +199,33 @@ model = bridge.to_megatron_model() # Uses default settings
 ```
 
 ### 3. Leverage the Parameter Streaming API
-You can stream converted weights from Megatron to HF without saving to disk. You can also use config-only loading for architecture exploration without loading weights:
+You can stream converted weights from Megatron to HF without saving to disk:
 
 ```python
 # ✅ Use streaming for large models
 for name, weight in bridge.export_hf_weights(model, cpu=True):
     process_weight(name, weight)
+```
+
+### 4. Use `from_hf_pretrained` for Export Workflows
+
+When exporting Megatron checkpoints back to 🤗 Hugging Face format, always use `from_hf_pretrained()` instead of `from_hf_config()`. The `from_hf_config()` method does not load the tokenizer and other artifacts required for saving a complete 🤗 Hugging Face checkpoint:
 
-# ✅ Use config-only loading for architecture exploration
-config = AutoConfig.from_pretrained("meta-llama/Llama-3-8B")
-bridge = AutoBridge.from_hf_config(config)
-transformer_config = bridge.transformer_config
-print(f"Hidden size: {transformer_config.hidden_size}")
+```python
+from megatron.bridge import AutoBridge
+
+# ✅ Correct: Use from_hf_pretrained for export workflows
+bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B")
+bridge.export_ckpt("./megatron_checkpoints/llama32_1b", "./hf_exports/llama32_1b")
+
+# ❌ Avoid: from_hf_config lacks artifacts needed for saving
+# config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B")
+# bridge = AutoBridge.from_hf_config(config)  # Missing tokenizer, etc.
+# bridge.export_ckpt(...)  # Will fail!
 ```
 
+The `from_hf_config()` method is only suitable for architecture exploration and introspection (e.g., inspecting `transformer_config`), not for checkpoint conversion workflows.
+
 For more examples and advanced usage patterns, see the `examples/conversion/` directory in the repository.
 
 ## Convenience Workflows (Commands)
@@ -229,7 +242,7 @@ python -c "from megatron.bridge import AutoBridge; AutoBridge.import_ckpt('meta-
 ### Megatron → HF export (one call)
 
 ```bash
-python -c "from megatron.bridge import AutoBridge; from transformers import AutoConfig; cfg=AutoConfig.from_pretrained('meta-llama/Llama-3.2-1B'); b=AutoBridge.from_hf_config(cfg); b.export_ckpt('./megatron_checkpoints/llama32_1b','./hf_exports/llama32_1b')"
+python -c "from megatron.bridge import AutoBridge; b=AutoBridge.from_hf_pretrained('meta-llama/Llama-3.2-1B'); b.export_ckpt('./megatron_checkpoints/llama32_1b','./hf_exports/llama32_1b')"
 ```
 
 ### Create Megatron models and run locally
````
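
Note on the bridge-guide change: the config-only pattern removed from section 3 remains valid for introspection, which the new section 4 text points out. A minimal sketch of that remaining use case, reusing only the `AutoConfig` / `AutoBridge.from_hf_config` calls shown in the removed lines:

```python
from transformers import AutoConfig

from megatron.bridge import AutoBridge

# Config-only loading: inspect the mapped Megatron architecture
# without downloading or materializing any weights.
config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B")
bridge = AutoBridge.from_hf_config(config)

transformer_config = bridge.transformer_config
print(f"Hidden size: {transformer_config.hidden_size}")
```

As the new guidance says, stop there: without the tokenizer and other artifacts, calling `export_ckpt()` on such a bridge will fail.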

scripts/performance/configs/deepseek/deepseek_llm_pretrain.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -60,7 +60,7 @@ def deepseek_v3_pretrain_config_gb300(
         pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
         virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
         moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
-        layout=None,
+        layout=base_cfg.pp_layout,
     )
     set_deepseek_v3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
```

scripts/performance/configs/deepseek/deepseek_workload_base_configs.py

Lines changed: 14 additions & 2 deletions

```diff
@@ -54,7 +54,16 @@
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace(
+    BASE_DEEPSEEK_V3_CONFIG,
+    micro_batch_size=2,
+    pipeline_model_parallel_size=2,
+    virtual_pipeline_model_parallel_size=8,
+    pp_layout="Et*4|(t*4|)*14tmL",
+    expert_model_parallel_size=32,
+    cuda_graph_scope=[],
+    recompute_modules=["mla_up_proj"],
+)
 
 
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 = replace(
@@ -133,7 +142,10 @@
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = replace(
+    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1,
+    global_batch_size=4096,
+)
 
 
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 = replace(
```
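
Note: the V1/V2 variants above are derived with `dataclasses.replace`, which copies a dataclass instance while overriding selected fields. A self-contained sketch of the idiom (the config class and field values here are illustrative, not the real workload configs):

```python
from dataclasses import dataclass, replace
from typing import Optional

@dataclass(frozen=True)
class PretrainConfig:
    micro_batch_size: int = 1
    global_batch_size: int = 8192
    pipeline_model_parallel_size: int = 4
    pp_layout: Optional[str] = None

BASE = PretrainConfig()

# Derive a variant by overriding only the fields that differ.
NVFP4_V1 = replace(BASE, micro_batch_size=2, pipeline_model_parallel_size=2)

# Variants can chain: V2 tweaks V1 further, as in the second hunk above.
NVFP4_V2 = replace(NVFP4_V1, global_batch_size=4096)

assert BASE.micro_batch_size == 1       # the base config is untouched
assert NVFP4_V2.micro_batch_size == 2   # inherited from NVFP4_V1
```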

scripts/performance/perf_plugins.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -446,6 +446,10 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
             self.train_task,
         )
 
+        # Set NVFP4-specific environment variables
+        if self.compute_dtype == "nvfp4":
+            executor.env_vars["NVTE_USE_FAST_MATH"] = "1"
+
 
 @dataclass
 class PyTorchProfilerPluginScriptArgs:
```
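
Note: the hook reduces to a dtype-gated mutation of the executor's environment. A sketch with a hypothetical stand-in for `run.Executor` (the diff only shows that the real executor exposes an `env_vars` mapping):

```python
from dataclasses import dataclass, field
from typing import Dict

@dataclass
class FakeExecutor:
    """Illustrative stand-in for run.Executor."""
    env_vars: Dict[str, str] = field(default_factory=dict)

def apply_nvfp4_env(executor: FakeExecutor, compute_dtype: str) -> None:
    # Only NVFP4 runs opt into the NVTE fast-math path.
    if compute_dtype == "nvfp4":
        executor.env_vars["NVTE_USE_FAST_MATH"] = "1"

ex = FakeExecutor()
apply_nvfp4_env(ex, "nvfp4")
assert ex.env_vars == {"NVTE_USE_FAST_MATH": "1"}
```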

scripts/performance/utils/executors.py

Lines changed: 6 additions & 3 deletions

```diff
@@ -38,13 +38,13 @@
 
 PERF_ENV_VARS = {
     "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
-    "TRANSFORMERS_OFFLINE": "1",  # Enable online downloads from HuggingFace
+    "TRANSFORMERS_OFFLINE": "1",  # Disable online downloads from HuggingFace
     "TOKENIZERS_PARALLELISM": "False",  # Restrict warning message prints
     "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
     "NVTE_NORM_FWD_USE_CUDNN": "1",
     "NVTE_NORM_BWD_USE_CUDNN": "1",
     "TORCH_NCCL_HIGH_PRIORITY": "1",
-    "HF_HUB_OFFLINE": "0",
+    "HF_HUB_OFFLINE": "1",
 }
 
 
@@ -86,6 +86,7 @@ def slurm_executor(
     srun_args = custom_srun_args.copy() + [
         "--mpi=pmix",
         "--no-container-mount-home",
+        "--container-writable",
     ]
 
     if log_dir is not None:
@@ -107,7 +108,9 @@
         PERF_ENV_VARS["NEMO_HOME"] = nemo_home
         mounts.extend([f"{nemo_home}:{nemo_home}"])
     if hf_token is not None:
-        PERF_ENV_VARS.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"})
+        PERF_ENV_VARS["HF_TOKEN"] = hf_token
+        PERF_ENV_VARS["TRANSFORMERS_OFFLINE"] = "0"
+        PERF_ENV_VARS["HF_HUB_OFFLINE"] = "0"
 
     PERF_ENV_VARS.update(custom_env_vars)
     mounts.extend(custom_mounts)
```
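
Note: the net effect of these hunks is an offline-by-default policy: both `TRANSFORMERS_OFFLINE=1` and `HF_HUB_OFFLINE=1` block Hugging Face network access, and both are flipped to `"0"` together only when a token is supplied. A sketch of that resolution logic in isolation (`resolve_hf_env` is a hypothetical helper; only the env var names come from the file):

```python
from typing import Dict, Optional

# Offline by default, mirroring the relevant PERF_ENV_VARS entries.
OFFLINE_DEFAULTS: Dict[str, str] = {
    "TRANSFORMERS_OFFLINE": "1",  # transformers: no downloads
    "HF_HUB_OFFLINE": "1",        # huggingface_hub: no downloads
}

def resolve_hf_env(hf_token: Optional[str]) -> Dict[str, str]:
    """Flip both offline switches together when a token permits downloads."""
    env = dict(OFFLINE_DEFAULTS)
    if hf_token is not None:
        env["HF_TOKEN"] = hf_token
        env["TRANSFORMERS_OFFLINE"] = "0"
        env["HF_HUB_OFFLINE"] = "0"
    return env

assert resolve_hf_env(None)["HF_HUB_OFFLINE"] == "1"
assert resolve_hf_env("hf_example_token")["TRANSFORMERS_OFFLINE"] == "0"
```

Keeping the two flags in lockstep fixes the earlier mismatch, where `HF_HUB_OFFLINE` defaulted to `"0"` while `TRANSFORMERS_OFFLINE` was `"1"`.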

scripts/performance/utils/overrides.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -374,7 +374,7 @@ def set_post_overrides(
     dp = int(num_gpus / (tp * pp * cp))
     logger.info(f"DP: {dp}; TP: {tp}; PP: {pp}; CP: {cp}; VP: {vp}")
     ## NOTE: overlap_param_gather_with_optimizer_step causes NaN grad norm for fp8_mx. Disabling it until the issue is resolved.
-    if dp > 1 and pp > 1 and vp > 1 and compute_dtype != "fp8_mx":
+    if dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4"):
         recipe.optimizer.overlap_param_gather_with_optimizer_step = True
     if hasattr(recipe, "comm_overlap") and isinstance(recipe.comm_overlap, CommOverlapConfig):
         recipe.comm_overlap.overlap_param_gather_with_optimizer_step = True
```
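
Note: the guard now treats `nvfp4` like `fp8_mx`, skipping the overlap optimization for both. The condition in isolation (a simplified mirror, not the real recipe objects):

```python
# Precisions where overlap_param_gather_with_optimizer_step is known to
# produce NaN grad norms, per the NOTE in the diff above.
OVERLAP_UNSAFE_DTYPES = ("fp8_mx", "nvfp4")

def should_overlap_param_gather(dp: int, pp: int, vp: int, compute_dtype: str) -> bool:
    return dp > 1 and pp > 1 and vp > 1 and compute_dtype not in OVERLAP_UNSAFE_DTYPES

assert should_overlap_param_gather(2, 2, 2, "bf16") is True
assert should_overlap_param_gather(2, 2, 2, "nvfp4") is False
```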

scripts/performance/utils/utils.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -62,6 +62,9 @@ class WorkloadBaseConfig:
     moe_a2a_overlap: Optional[bool] = False
     peft: Optional[str] = None
 
+    # Pipeline parallelism layout
+    pp_layout: Optional[str] = None
+
     @property
     def sequence_parallel(self) -> bool:
         """Get the sequence parallel flag."""
```
