
Commit 89d9b2a

Merge branch 'main' into rmukundan/llama3_lora_tp_overlap_packed_seq

2 parents: 0b928cd + 7dd2007

8 files changed: +32 lines added, -12 lines removed

.github/workflows/dependabot.yml

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 name: Dependabot
 on:
-  schedule:
-    - cron: "0 8 * * *"
+  # schedule:
+  #   - cron: "0 8 * * *"
   workflow_dispatch: # Allow manual triggering

 permissions:

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ override-dependencies = [
 ]

 [tool.uv.sources]
-megatron-core = { path = "3rdparty/Megatron-LM/" }
+megatron-core = { path = "3rdparty/Megatron-LM/", editable = true }
 nvidia-modelopt = { git = "https://github.com/NVIDIA/TensorRT-Model-Optimizer.git", rev = "0a4f0a8b933121f7af080261a0a5a7717f2c5d49" }
 nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "v0.4.1" } # Requires a source install to compile cupti for cuda13
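With editable = true, uv installs megatron-core from the local 3rdparty/Megatron-LM checkout in editable mode, so edits to that source tree take effect without reinstalling. A minimal way to confirm the resolution, assuming the path named in pyproject.toml (illustrative only, not part of the change):

# Illustrative check: verify that megatron.core resolves to the local
# editable checkout rather than a copy in site-packages.
import megatron.core
print(megatron.core.__file__)  # expected to point under 3rdparty/Megatron-LM/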

scripts/performance/argument_parser.py

Lines changed: 4 additions & 3 deletions
@@ -422,9 +422,10 @@ def parse_cli_args():
     slurm_args.add_argument(
         "-cb",
         "--custom_bash_cmds",
-        type=list_of_strings,
-        help="Comma separated string of bash commands",
-        default=[],
+        nargs="*",
+        action="append",
+        help="List of bash commands to execute before the main command",
+        default=None,
     )
     slurm_args.add_argument(
         "--gres",

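The combination of nargs="*" and action="append" means each -cb occurrence collects its own list of tokens, and repeated flags accumulate into a list of lists. A minimal sketch of the resulting parse behaviour, standalone and with made-up sample commands:

# Sketch of how argparse handles the new flag; illustrative values only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-cb",
    "--custom_bash_cmds",
    nargs="*",        # each -cb consumes the tokens that follow it
    action="append",  # every -cb occurrence appends one inner list
    default=None,
)

args = parser.parse_args(["-cb", "export", "FOO=1", "-cb", "nvidia-smi"])
print(args.custom_bash_cmds)  # [['export', 'FOO=1'], ['nvidia-smi']]
# When -cb is never passed, custom_bash_cmds stays None, matching default=None.

This List[List[str]] shape is what the updated type hints in setup_experiment.py and executors.py below reflect.
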
scripts/performance/configs/llama/llama31_workload_base_configs.py

Lines changed: 1 addition & 0 deletions
@@ -246,6 +246,7 @@
     LLAMA31_405B_PRETRAIN_CONFIG_GB300_NVFP4_V1,
     num_gpus=256,
     global_batch_size=1536,
+    cuda_graph_impl="none",
 )
scripts/performance/setup_experiment.py

Lines changed: 1 addition & 1 deletion
@@ -206,7 +206,7 @@ def main(
     custom_mounts: List[str],
     custom_env_vars: Dict[str, str],
     custom_srun_args: List[str],
-    custom_bash_cmds: List[str],
+    custom_bash_cmds: List[List[str]],
     nccl_ub: bool,
     pretrained_checkpoint: Optional[str],
     num_gpus: int,

scripts/performance/utils/executors.py

Lines changed: 2 additions & 2 deletions
@@ -64,7 +64,7 @@ def slurm_executor(
     nemo_home: str = DEFAULT_NEMO_HOME,
     wandb_key: str = None,
     network: str = None,
-    custom_bash_cmds: List[str] = None,
+    custom_bash_cmds: List[List[str]] = None,
     additional_slurm_params: Dict[str, Any] = None,
     gres: Optional[str] = None,
 ) -> run.SlurmExecutor:
@@ -79,7 +79,7 @@ def slurm_executor(
     #SBATCH --nodelist=node001,node002
     #SBATCH --constraint=gpu
     """
-    custom_bash_cmds = [] if custom_bash_cmds is None else custom_bash_cmds
+    custom_bash_cmds = [] if custom_bash_cmds is None else [" ".join(cmd) for cmd in custom_bash_cmds]
     mounts = []
     # Explicitly request GPU resources to ensure proper allocation
     # Without --gres=gpu:N, some clusters only allocate 1 GPU regardless of ntasks_per_node
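The join restores the flat list of command strings the executor uses downstream: each inner token list produced by one -cb occurrence is collapsed back into a single bash command. A small illustration with made-up commands:

# Illustrative only: how the comprehension above normalizes the parsed value.
custom_bash_cmds = [["export", "FOO=1"], ["nvidia-smi"]]
custom_bash_cmds = [] if custom_bash_cmds is None else [" ".join(cmd) for cmd in custom_bash_cmds]
print(custom_bash_cmds)  # ['export FOO=1', 'nvidia-smi']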

src/megatron/bridge/training/initialize.py

Lines changed: 19 additions & 1 deletion
@@ -14,13 +14,15 @@

 import datetime
 import os
+import time
 import warnings
 from typing import Callable, Optional

 import torch
 import torch.distributed
 import torch.nn.functional as F
 from megatron.core import parallel_state, tensor_parallel
+from megatron.core.datasets.utils import compile_helpers
 from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train
 from megatron.core.fusions.fused_bias_gelu import bias_gelu
 from megatron.core.fusions.fused_bias_swiglu import bias_swiglu
@@ -115,7 +117,7 @@ def initialize_megatron(
     init_rerun_state(rerun_state_machine_config)

     # torch.distributed initialization
-    return torch_dist_init(
+    result = torch_dist_init(
         model_config=model_config,
         dist_config=dist_config,
         rng_config=rng_config,
@@ -128,6 +130,22 @@ def initialize_megatron(
         use_inprocess_restart=use_inprocess_restart,
     )

+    # Compile dataset helpers after distributed initialization
+    if torch.distributed.is_initialized():
+        if get_rank_safe() == 0:
+            start_time = time.time()
+            print("> compiling dataset index builder ...")
+            compile_helpers()
+            print(
+                ">>> done with dataset index builder. Compilation time: {:.3f} seconds".format(
+                    time.time() - start_time
+                ),
+                flush=True,
+            )
+        torch.distributed.barrier()
+
+    return result
+

 def torch_dist_init(
     model_config: GPTModelProvider | T5ModelProvider,
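The new block follows a common rank-0-then-barrier pattern: only rank 0 compiles the dataset index helpers, and every other rank waits at the barrier so nothing uses the compiled artifact before it exists. A minimal, self-contained sketch of the pattern, where build_once is a hypothetical stand-in for compile_helpers and not the repository's code:

# Sketch of the rank-0 build + barrier pattern; illustrative only.
import time
import torch.distributed as dist


def build_once():
    # Hypothetical one-time build step shared via the filesystem,
    # standing in for megatron.core's compile_helpers().
    time.sleep(1.0)


def build_on_rank_zero():
    if not dist.is_initialized():
        build_once()  # single-process fallback
        return
    if dist.get_rank() == 0:
        start = time.time()
        build_once()
        print(f"build finished in {time.time() - start:.3f}s", flush=True)
    # All ranks meet here: rank 0 after building, the others immediately,
    # so no rank proceeds until the artifact is ready.
    dist.barrier()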

uv.lock

Lines changed: 2 additions & 2 deletions
Generated lockfile; diff not rendered.
