
Commit 89d9b2a

Merge branch 'main' into rmukundan/llama3_lora_tp_overlap_packed_seq

2 parents: 0b928cd + 7dd2007

8 files changed: +32 lines added, -12 lines removed

.github/workflows/dependabot.yml

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 name: Dependabot
 on:
-  schedule:
-    - cron: "0 8 * * *"
+  # schedule:
+  #   - cron: "0 8 * * *"
   workflow_dispatch: # Allow manual triggering

 permissions:

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ override-dependencies = [
 ]

 [tool.uv.sources]
-megatron-core = { path = "3rdparty/Megatron-LM/" }
+megatron-core = { path = "3rdparty/Megatron-LM/", editable = true }
 nvidia-modelopt = { git = "https://github.com/NVIDIA/TensorRT-Model-Optimizer.git", rev = "0a4f0a8b933121f7af080261a0a5a7717f2c5d49" }
 nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "v0.4.1" } # Requires a source install to compile cupti for cuda13
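With editable = true, uv installs megatron-core from the local 3rdparty/Megatron-LM checkout in editable mode, so edits to that source tree take effect without reinstalling. A minimal way to confirm the resolution, assuming the path named in pyproject.toml (illustrative only, not part of the change):

# Illustrative check: verify that megatron.core resolves to the local
# editable checkout rather than a copy in site-packages.
import megatron.core
print(megatron.core.__file__)  # expected to point under 3rdparty/Megatron-LM/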

scripts/performance/argument_parser.py

Lines changed: 4 additions & 3 deletions
@@ -422,9 +422,10 @@ def parse_cli_args():
     slurm_args.add_argument(
         "-cb",
         "--custom_bash_cmds",
-        type=list_of_strings,
-        help="Comma separated string of bash commands",
-        default=[],
+        nargs="*",
+        action="append",
+        help="List of bash commands to execute before the main command",
+        default=None,
     )
     slurm_args.add_argument(
         "--gres",

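The combination of nargs="*" and action="append" means each -cb occurrence collects its own list of tokens, and repeated flags accumulate into a list of lists. A minimal sketch of the resulting parse behaviour, standalone and with made-up sample commands:

# Sketch of how argparse handles the new flag; illustrative values only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-cb",
    "--custom_bash_cmds",
    nargs="*",        # each -cb consumes the tokens that follow it
    action="append",  # every -cb occurrence appends one inner list
    default=None,
)

args = parser.parse_args(["-cb", "export", "FOO=1", "-cb", "nvidia-smi"])
print(args.custom_bash_cmds)  # [['export', 'FOO=1'], ['nvidia-smi']]
# When -cb is never passed, custom_bash_cmds stays None, matching default=None.

This List[List[str]] shape is what the updated type hints in setup_experiment.py and executors.py below reflect.
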
scripts/performance/configs/llama/llama31_workload_base_configs.py

Lines changed: 1 addition & 0 deletions
@@ -246,6 +246,7 @@
     LLAMA31_405B_PRETRAIN_CONFIG_GB300_NVFP4_V1,
     num_gpus=256,
     global_batch_size=1536,
+    cuda_graph_impl="none",
 )
scripts/performance/setup_experiment.py

Lines changed: 1 addition & 1 deletion
@@ -206,7 +206,7 @@ def main(
     custom_mounts: List[str],
     custom_env_vars: Dict[str, str],
     custom_srun_args: List[str],
-    custom_bash_cmds: List[str],
+    custom_bash_cmds: List[List[str]],
     nccl_ub: bool,
     pretrained_checkpoint: Optional[str],
     num_gpus: int,

scripts/performance/utils/executors.py

Lines changed: 2 additions & 2 deletions
@@ -64,7 +64,7 @@ def slurm_executor(
     nemo_home: str = DEFAULT_NEMO_HOME,
     wandb_key: str = None,
     network: str = None,
-    custom_bash_cmds: List[str] = None,
+    custom_bash_cmds: List[List[str]] = None,
     additional_slurm_params: Dict[str, Any] = None,
     gres: Optional[str] = None,
 ) -> run.SlurmExecutor:
@@ -79,7 +79,7 @@ def slurm_executor(
     #SBATCH --nodelist=node001,node002
     #SBATCH --constraint=gpu
     """
-    custom_bash_cmds = [] if custom_bash_cmds is None else custom_bash_cmds
+    custom_bash_cmds = [] if custom_bash_cmds is None else [" ".join(cmd) for cmd in custom_bash_cmds]
     mounts = []
     # Explicitly request GPU resources to ensure proper allocation
     # Without --gres=gpu:N, some clusters only allocate 1 GPU regardless of ntasks_per_node
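The join restores the flat list of command strings the executor uses downstream: each inner token list produced by one -cb occurrence is collapsed back into a single bash command. A small illustration with made-up commands:

# Illustrative only: how the comprehension above normalizes the parsed value.
custom_bash_cmds = [["export", "FOO=1"], ["nvidia-smi"]]
custom_bash_cmds = [] if custom_bash_cmds is None else [" ".join(cmd) for cmd in custom_bash_cmds]
print(custom_bash_cmds)  # ['export FOO=1', 'nvidia-smi']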

src/megatron/bridge/training/initialize.py

Lines changed: 19 additions & 1 deletion
@@ -14,13 +14,15 @@

 import datetime
 import os
+import time
 import warnings
 from typing import Callable, Optional

 import torch
 import torch.distributed
 import torch.nn.functional as F
 from megatron.core import parallel_state, tensor_parallel
+from megatron.core.datasets.utils import compile_helpers
 from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train
 from megatron.core.fusions.fused_bias_gelu import bias_gelu
 from megatron.core.fusions.fused_bias_swiglu import bias_swiglu
@@ -115,7 +117,7 @@ def initialize_megatron(
     init_rerun_state(rerun_state_machine_config)

     # torch.distributed initialization
-    return torch_dist_init(
+    result = torch_dist_init(
         model_config=model_config,
         dist_config=dist_config,
         rng_config=rng_config,
@@ -128,6 +130,22 @@ def initialize_megatron(
         use_inprocess_restart=use_inprocess_restart,
     )

+    # Compile dataset helpers after distributed initialization
+    if torch.distributed.is_initialized():
+        if get_rank_safe() == 0:
+            start_time = time.time()
+            print("> compiling dataset index builder ...")
+            compile_helpers()
+            print(
+                ">>> done with dataset index builder. Compilation time: {:.3f} seconds".format(
+                    time.time() - start_time
+                ),
+                flush=True,
+            )
+        torch.distributed.barrier()
+
+    return result
+

 def torch_dist_init(
     model_config: GPTModelProvider | T5ModelProvider,
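The new block follows a common rank-0-then-barrier pattern: only rank 0 compiles the dataset index helpers, and every other rank waits at the barrier so nothing uses the compiled artifact before it exists. A minimal, self-contained sketch of the pattern, where build_once is a hypothetical stand-in for compile_helpers and not the repository's code:

# Sketch of the rank-0 build + barrier pattern; illustrative only.
import time
import torch.distributed as dist


def build_once():
    # Hypothetical one-time build step shared via the filesystem,
    # standing in for megatron.core's compile_helpers().
    time.sleep(1.0)


def build_on_rank_zero():
    if not dist.is_initialized():
        build_once()  # single-process fallback
        return
    if dist.get_rank() == 0:
        start = time.time()
        build_once()
        print(f"build finished in {time.time() - start:.3f}s", flush=True)
    # All ranks meet here: rank 0 after building, the others immediately,
    # so no rank proceeds until the artifact is ready.
    dist.barrier()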

uv.lock

Lines changed: 2 additions & 2 deletions
Generated lockfile; diff not rendered.
