Skip to content

Commit b79f75f

Browse files
authored
Merge branch 'main' into romeyn/parquet-sequence-pack
2 parents 1297971 + 24cd876 commit b79f75f

File tree

7 files changed

+321
-105
lines changed

7 files changed

+321
-105
lines changed

3rdparty/Megatron-LM

Submodule Megatron-LM updated 58 files

examples/evaluation/deploy.sh

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,4 @@ python \
1313
--host 0.0.0.0 \
1414
--port 8000 \
1515
--num_gpus "$NUM_GPUS" \
16-
--num_replicas "$NUM_REPLICAS" \
17-
--tensor_model_parallel_size 1 \
18-
--pipeline_model_parallel_size 1 \
19-
--context_parallel_size 1
16+
--num_replicas "$NUM_REPLICAS"

examples/evaluation/launch_evaluation_pipeline.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,16 @@ def main(args):
111111
executor=executor,
112112
)
113113
job.start(
114-
command=f"bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh {args.megatron_checkpoint} {args.num_replicas} {args.num_gpus} | tee -a deploy.log & sleep 120; bash /opt/Megatron-Bridge/examples/evaluation/eval.sh {args.output_dir} {args.parallelism} | tee -a eval.log",
114+
command=f"""
115+
bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh \
116+
{args.megatron_checkpoint} \
117+
{args.num_replicas} \
118+
{args.num_gpus}| tee -a deploy.log & \
119+
sleep 120; \
120+
bash /opt/Megatron-Bridge/examples/evaluation/eval.sh \
121+
{args.output_dir} \
122+
{args.parallelism} | tee -a eval.log
123+
""",
115124
workdir=None,
116125
)
117126

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ dependencies = [
8989
"timm",
9090
"open-clip-torch>=3.2.0",
9191
"mlflow>=3.5.0",
92+
"torch>=2.6.0",
9293
]
9394

9495

scripts/performance/configs/qwen/qwen3_llm_pretrain.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def qwen3_235b_a22b_pretrain_config_gb300(
7878
cfg.mixed_precision = precision_config
7979
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
8080
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
81+
cfg.model.moe_token_dispatcher_type = "flex"
8182

8283
set_qwen3_common_configs(cfg)
8384
set_workload_base_configs(cfg, base_cfg)
@@ -103,6 +104,7 @@ def qwen3_235b_a22b_pretrain_config_gb200(
103104
cfg.mixed_precision = precision_config
104105
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
105106
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
107+
cfg.model.moe_token_dispatcher_type = "flex"
106108

107109
set_qwen3_common_configs(cfg)
108110
set_workload_base_configs(cfg, base_cfg)
@@ -206,6 +208,7 @@ def qwen3_30b_a3b_pretrain_config_gb300(
206208
cfg.mixed_precision = precision_config
207209
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
208210
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
211+
cfg.model.moe_token_dispatcher_type = "flex"
209212

210213
set_qwen3_common_configs(cfg)
211214
set_workload_base_configs(cfg, base_cfg)
@@ -231,6 +234,7 @@ def qwen3_30b_a3b_pretrain_config_gb200(
231234
cfg.mixed_precision = precision_config
232235
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
233236
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
237+
cfg.model.moe_token_dispatcher_type = "flex"
234238

235239
set_qwen3_common_configs(cfg)
236240
set_workload_base_configs(cfg, base_cfg)
@@ -256,6 +260,7 @@ def qwen3_30b_a3b_pretrain_config_b300(
256260
cfg.mixed_precision = precision_config
257261
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
258262
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
263+
cfg.model.moe_token_dispatcher_type = "flex"
259264

260265
set_qwen3_common_configs(cfg)
261266
set_workload_base_configs(cfg, base_cfg)

scripts/performance/setup_experiment.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ def main(
403403
error_msg = None
404404
n_attempts = 0
405405
exp_name = (
406-
exp_name[:37] if dgxc_cluster is not None else exp_name
406+
exp_name[:33] if dgxc_cluster is not None else exp_name
407407
) # Some k8s clusters have a limit on the length of the experiment name.
408408
wandb_run_id = None
409409
while n_attempts <= max_retries:

0 commit comments

Comments
 (0)