
Commit 54ddad3

[bugfix] fix megatron multimodal modules_to_save (#5876)
1 parent 3eb09e8 commit 54ddad3

File tree

7 files changed: +9 -7 lines changed

examples/eval/vlm/eval.sh

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 CUDA_VISIBLE_DEVICES=0 \
 MAX_PIXELS=1003520 \
 swift eval \
-    --model Qwen/Qwen2-VL-2B-Instruct \
+    --model Qwen/Qwen2.5-VL-3B-Instruct \
     --infer_backend vllm \
     --eval_limit 100 \
     --eval_dataset realWorldQA \

examples/infer/lmdeploy/batch_ddp.sh

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-# test env: lmdeploy 0.9.2
+# test env: lmdeploy 0.9.2.post1
 NPROC_PER_NODE=4 \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 swift infer \

examples/train/moe/qwen3_moe.sh

Lines changed: 4 additions & 0 deletions

@@ -1,5 +1,9 @@
 # If you don't want to train the router, set:
 # `--target_modules q_proj k_proj v_proj o_proj gate_proj up_proj down_proj`
+
+# Note: If you need to use DeepSpeed ZeRO-2/ZeRO-3 but encounter hangs
+# try using transformers==4.51.3
+
 CUDA_VISIBLE_DEVICES=0 \
 swift sft \
     --model Qwen/Qwen3-30B-A3B-Instruct-2507 \
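The added comment pins a known-good transformers version for DeepSpeed ZeRO-2/ZeRO-3 runs. A minimal pre-flight sketch of that advice in Python, assuming you want to fail fast before launch; `check_transformers` is a hypothetical helper, not part of ms-swift:

# Hypothetical pre-flight check: abort before launching a DeepSpeed
# ZeRO-2/ZeRO-3 run if the installed transformers version is not the
# one the script comment above suggests.
from importlib.metadata import PackageNotFoundError, version

def check_transformers(required: str = '4.51.3') -> None:
    try:
        installed = version('transformers')
    except PackageNotFoundError:
        raise RuntimeError('transformers is not installed')
    if installed != required:
        raise RuntimeError(
            f'transformers=={installed} found; the script comment suggests '
            f'transformers=={required} to avoid DeepSpeed ZeRO-2/ZeRO-3 hangs.')

check_transformers()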

examples/train/packing/qwen2_5_vl.sh

Lines changed: 0 additions & 2 deletions

@@ -4,8 +4,6 @@
 # For local datasets, it is recommended to use streaming: `--streaming true` (save memory)
 # You can also use padding_free to avoid the space/time cost caused by multi-modal packing:
 # https://github.com/modelscope/ms-swift/blob/main/examples/train/padding_free/sft.sh
-pip install "transformers==4.51.*"
-
 NPROC_PER_NODE=4 \
 MAX_PIXELS=1003520 \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \

examples/train/packing/streaming.sh

Lines changed: 0 additions & 2 deletions

@@ -1,8 +1,6 @@
 # 4 * 36GB
 # A demo using the Hugging Face dataset
 # The first model weights will be saved around step 70.
-pip install "transformers==4.51.*"
-
 NPROC_PER_NODE=4 \
 MAX_PIXELS=1003520 \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \

swift/megatron/argument/train_args.py

Lines changed: 2 additions & 0 deletions

@@ -75,3 +75,5 @@ def __post_init__(self):
         if self.load is None and self.no_initialization:
             raise ValueError('You did not pass `--load`, so you need to set `--no_initialization false` '
                              'to allow the model to initialize weights properly.')
+        if self.cached_dataset and self.context_parallel_size > 1:
+            raise ValueError('`cached_dataset` does not support context parallelism.')
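The new guard follows the fail-fast `__post_init__` validation pattern already used in this file: reject unsupported argument combinations at construction time, before any expensive Megatron setup runs. A minimal standalone sketch of the pattern; `TrainArgsSketch` is illustrative only and mirrors just the two fields involved, not the real ms-swift arguments class:

# Illustrative sketch (not the actual ms-swift class): a dataclass that
# validates an unsupported argument combination as soon as it is built.
from dataclasses import dataclass
from typing import Optional

@dataclass
class TrainArgsSketch:
    cached_dataset: Optional[str] = None
    context_parallel_size: int = 1

    def __post_init__(self):
        # Reject the unsupported combination up front.
        if self.cached_dataset and self.context_parallel_size > 1:
            raise ValueError('`cached_dataset` does not support context parallelism.')

TrainArgsSketch(cached_dataset='train.cache', context_parallel_size=2)  # raises ValueError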

swift/megatron/init.py

Lines changed: 1 addition & 1 deletion

@@ -397,7 +397,7 @@ def sharded_state_dict(
         metadata: Optional[dict] = None,
     ) -> ShardedStateDict:
         sharded_state_dict = tuners_sharded_state_dict(self, prefix, sharded_offsets, metadata)
-        if prefix == 'output_layer.':
+        if prefix in {'output_layer.', 'language_model.output_layer.'}:
             for k in list(sharded_state_dict.keys()):
                 if '_extra_state' in k:
                     # Old GPT checkpoints only stored the output layer weight key. So we remove the
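This is the core of the bugfix: in multimodal Megatron models the GPT stack is wrapped in a `language_model` submodule, so the output layer arrives with the prefix `language_model.output_layer.` and the old equality check never matched it, leaving `_extra_state` keys unfiltered. A simplified sketch of the before/after behavior, assuming a plain dict stands in for the sharded state dict; `strip_extra_state` is illustrative, not the actual ms-swift code:

# Simplified illustration of the prefix check: drop `_extra_state` entries
# whenever the output layer is being sharded, whether or not the model
# wraps it in a `language_model.` submodule.
OUTPUT_LAYER_PREFIXES = {'output_layer.', 'language_model.output_layer.'}

def strip_extra_state(sharded_state_dict: dict, prefix: str) -> dict:
    if prefix in OUTPUT_LAYER_PREFIXES:
        return {k: v for k, v in sharded_state_dict.items() if '_extra_state' not in k}
    return sharded_state_dict

# Before the fix, `prefix == 'output_layer.'` silently skipped the
# multimodal case below:
print(strip_extra_state({'language_model.output_layer._extra_state': None},
                        'language_model.output_layer.'))  # -> {}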
