Set parallel config for megatron bridge (#1184)

zhuzilin · web-flow · commit 7dd5f6331a81 · 2025-12-22T19:22:44.000+08:00
diff --git a/docker/patch/latest/megatron.patch b/docker/patch/latest/megatron.patch
@@ -384,16 +384,15 @@ index a8f4abfcd..f33f6f05e 100755
  
          if self.config.recompute_method == 'uniform':
 diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
-index e2705bd9f..83a947c00 100644
+index e2705bd9f..a0aa109b5 100644
 --- a/megatron/core/transformer/transformer_config.py
 +++ b/megatron/core/transformer/transformer_config.py
-@@ -210,6 +210,10 @@ class TransformerConfig(ModelParallelConfig):
+@@ -210,6 +210,9 @@ class TransformerConfig(ModelParallelConfig):
      attention_output_gate: bool = False
      """Whether to apply output gate to the attention layers."""
  
 +    post_self_attn_layernorm: bool = False
 +    post_mlp_layernorm: bool = False
-+    use_gated_attention: bool = False
 +
      test_mode: bool = False
      """Whether to run real-time tests."""
@@ -469,21 +468,20 @@ index 3ea405770..5a42001b9 100644
              # discard the output of the pre-mlp layernorm and register the recompute
              # as a gradient hook of mlp_output_with_bias[0]
 diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
-index b267c8a81..def4ce809 100644
+index b267c8a81..83736acdc 100644
 --- a/megatron/training/arguments.py
 +++ b/megatron/training/arguments.py
-@@ -1398,6 +1398,10 @@ def core_transformer_config_from_args(args, config_class=None):
+@@ -1398,6 +1398,9 @@ def core_transformer_config_from_args(args, config_class=None):
  
      kw_args['inference_sampling_seed'] = args.seed
  
 +    kw_args['post_self_attn_layernorm'] = args.post_self_attn_layernorm
 +    kw_args['post_mlp_layernorm'] = args.post_mlp_layernorm
-+    kw_args['use_gated_attention'] = args.use_gated_attention
 +
      # handle quantization config
      # NOTE: Kitchen arguments are only added to the namespace when
      # Kitchen library is available.
-@@ -1764,6 +1768,12 @@ def _add_network_size_args(parser):
+@@ -1764,6 +1767,12 @@ def _add_network_size_args(parser):
                         action='store_true',
                         help='If set, use original BERT residula connection '
                         'ordering.')
diff --git a/examples/geo3k_vlm/run_geo3k_vlm.sh b/examples/geo3k_vlm/run_geo3k_vlm.sh
@@ -80,6 +80,8 @@ fi
 # Common args
 CKPT_ARGS=(
    --hf-checkpoint /root/models/${MODEL_NAME}
+   # vl model has rotary base 5000000
+   --rotary-base 5000000
 )
 
 ROLLOUT_ARGS=(
diff --git a/slime/backends/megatron_utils/__init__.py b/slime/backends/megatron_utils/__init__.py
@@ -21,14 +21,21 @@ def new_init(self, *args, **kwargs):
     logging.warning("deep_ep is not installed, some functionalities may be limited.")
 
 try:
-    from megatron.bridge.models.qwen_vl.modelling_qwen3_vl.text_model import Qwen3VLTextRotaryEmbedding
+    from megatron.bridge.models.qwen_vl.modelling_qwen3_vl.text_model import (
+        Qwen3VLMoETextRotaryEmbedding,
+        Qwen3VLTextRotaryEmbedding,
+    )
 
-    _original_forward = Qwen3VLTextRotaryEmbedding.forward
+    def patch_rotary_embedding(cls):
+        _original_forward = cls.forward
 
-    def _patched_forward(self, *args, packed_seq_params=None, **kwargs):
-        return _original_forward(self, *args, **kwargs)
+        def _patched_forward(self, *args, packed_seq_params=None, **kwargs):
+            return _original_forward(self, *args, **kwargs)
 
-    Qwen3VLTextRotaryEmbedding.forward = _patched_forward
+        cls.forward = _patched_forward
+
+    patch_rotary_embedding(Qwen3VLTextRotaryEmbedding)
+    patch_rotary_embedding(Qwen3VLMoETextRotaryEmbedding)
 except ImportError:
     pass
 
diff --git a/slime/backends/megatron_utils/model_provider.py b/slime/backends/megatron_utils/model_provider.py
@@ -58,6 +58,12 @@ def get_model_provider_func(
 
         bridge = AutoBridge.from_hf_pretrained(args.hf_checkpoint, trust_remote_code=True)
         provider = bridge.to_megatron_provider(load_weights=False)
+        # TODO: the parallel sizes and sequence_parallel flag below are copied from args by hand; we should not have to set them manually.
+        provider.tensor_model_parallel_size = args.tensor_model_parallel_size
+        provider.pipeline_model_parallel_size = args.pipeline_model_parallel_size
+        provider.expert_model_parallel_size = args.expert_model_parallel_size
+        provider.expert_tensor_parallel_size = args.expert_tensor_parallel_size
+        provider.sequence_parallel = args.sequence_parallel
         provider.finalize()
         return provider.provide
 
diff --git a/slime/utils/arguments.py b/slime/utils/arguments.py
@@ -1590,6 +1590,10 @@ def equal(x, y):
 
     errors = []
 
+    # multimodal models nest the language-model config under `text_config`, so compare against that
+    if hasattr(hf_config, "text_config"):
+        hf_config = hf_config.text_config
+
     for hf_config_name, megatron_config_name, compare_fn in [
         ("hidden_size", "hidden_size", equal),
         ("num_attention_heads", "num_attention_heads", equal),
diff --git a/tools/convert_hf_to_torch_dist.py b/tools/convert_hf_to_torch_dist.py
@@ -110,7 +110,7 @@ def main():
 
     # Load model
     hf_model_path = args.hf_checkpoint
-    bridge = AutoBridge.from_pretrained(hf_model_path, trust_remote_code=True)
+    bridge = AutoBridge.from_hf_pretrained(hf_model_path, trust_remote_code=True)
     bridge.load_weights(model, hf_model_path, memory_efficient=True)
     print(f"Model loaded: {hf_model_path}")
 

Original file line number	Diff line number	Diff line change
`@@ -80,6 +80,8 @@ fi`
`80`	`80`	`# Common args`
`81`	`81`	`CKPT_ARGS=(`
`82`	`82`	`--hf-checkpoint /root/models/${MODEL_NAME}`
	`83`	`+ # vl model has rotary base 5000000`
	`84`	`+ --rotary-base 5000000`
`83`	`85`	`)`
`84`	`86`
`85`	`87`	`ROLLOUT_ARGS=(`