Merge branch 'main' into romeyn/parquet-sequence-pack

marcromeyn · web-flow · commit b79f75ff5841 · 2026-02-17T18:06:45.000+01:00
diff --git a/3rdparty/Megatron-LM b/3rdparty/Megatron-LM
@@ -1 +1 @@
-Subproject commit 76a9f472a58fa054120031a8533404cd799b3f01
+Subproject commit 8f1c2f8ae53b4e3f32c0ae7f397d8b38a675eaa2
diff --git a/examples/evaluation/deploy.sh b/examples/evaluation/deploy.sh
@@ -13,7 +13,4 @@ python \
   --host 0.0.0.0 \
   --port 8000 \
   --num_gpus "$NUM_GPUS" \
-  --num_replicas "$NUM_REPLICAS" \
-  --tensor_model_parallel_size 1 \
-  --pipeline_model_parallel_size 1 \
-  --context_parallel_size 1 
+  --num_replicas "$NUM_REPLICAS"
diff --git a/examples/evaluation/launch_evaluation_pipeline.py b/examples/evaluation/launch_evaluation_pipeline.py
@@ -111,7 +111,16 @@ def main(args):
         executor=executor,
     )
     job.start(
-        command=f"bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh {args.megatron_checkpoint} {args.num_replicas} {args.num_gpus} | tee -a deploy.log & sleep 120; bash /opt/Megatron-Bridge/examples/evaluation/eval.sh {args.output_dir} {args.parallelism} | tee -a eval.log",
+        command=f"""
+        bash /opt/Megatron-Bridge/examples/evaluation/deploy.sh \
+            {args.megatron_checkpoint} \
+            {args.num_replicas} \
+            {args.num_gpus}| tee -a deploy.log & \
+        sleep 120; \
+        bash /opt/Megatron-Bridge/examples/evaluation/eval.sh \
+            {args.output_dir} \
+            {args.parallelism} | tee -a eval.log
+        """,
         workdir=None,
     )
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -89,6 +89,7 @@ dependencies = [
     "timm",
     "open-clip-torch>=3.2.0",
     "mlflow>=3.5.0",
+    "torch>=2.6.0",
 ]
 
 
diff --git a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py
@@ -78,6 +78,7 @@ def qwen3_235b_a22b_pretrain_config_gb300(
     cfg.mixed_precision = precision_config
     cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
     cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
+    cfg.model.moe_token_dispatcher_type = "flex"
 
     set_qwen3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
@@ -103,6 +104,7 @@ def qwen3_235b_a22b_pretrain_config_gb200(
     cfg.mixed_precision = precision_config
     cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
     cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
+    cfg.model.moe_token_dispatcher_type = "flex"
 
     set_qwen3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
@@ -206,6 +208,7 @@ def qwen3_30b_a3b_pretrain_config_gb300(
     cfg.mixed_precision = precision_config
     cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
     cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
+    cfg.model.moe_token_dispatcher_type = "flex"
 
     set_qwen3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
@@ -231,6 +234,7 @@ def qwen3_30b_a3b_pretrain_config_gb200(
     cfg.mixed_precision = precision_config
     cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
     cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
+    cfg.model.moe_token_dispatcher_type = "flex"
 
     set_qwen3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
@@ -256,6 +260,7 @@ def qwen3_30b_a3b_pretrain_config_b300(
     cfg.mixed_precision = precision_config
     cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
     cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
+    cfg.model.moe_token_dispatcher_type = "flex"
 
     set_qwen3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py
@@ -403,7 +403,7 @@ def main(
     error_msg = None
     n_attempts = 0
     exp_name = (
-        exp_name[:37] if dgxc_cluster is not None else exp_name
+        exp_name[:33] if dgxc_cluster is not None else exp_name
     )  # Some k8s clusters have a limit on the length of the experiment name.
     wandb_run_id = None
     while n_attempts <= max_retries:
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -89,6 +89,7 @@ dependencies = [`
`89`	`89`	`"timm",`
`90`	`90`	`"open-clip-torch>=3.2.0",`
`91`	`91`	`"mlflow>=3.5.0",`
	`92`	`+ "torch>=2.6.0",`
`92`	`93`	`]`
`93`	`94`
`94`	`95`