
Commit 54ddad3

[bugfix] fix megatron multimodal modules_to_save (#5876)
1 parent 3eb09e8 commit 54ddad3

File tree

7 files changed: +9 -7 lines changed

examples/eval/vlm/eval.sh

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 CUDA_VISIBLE_DEVICES=0 \
 MAX_PIXELS=1003520 \
 swift eval \
-    --model Qwen/Qwen2-VL-2B-Instruct \
+    --model Qwen/Qwen2.5-VL-3B-Instruct \
     --infer_backend vllm \
     --eval_limit 100 \
     --eval_dataset realWorldQA \

examples/infer/lmdeploy/batch_ddp.sh

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-# test env: lmdeploy 0.9.2
+# test env: lmdeploy 0.9.2.post1
 NPROC_PER_NODE=4 \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 swift infer \

examples/train/moe/qwen3_moe.sh

Lines changed: 4 additions & 0 deletions

@@ -1,5 +1,9 @@
 # If you don't want to train the router, set:
 # `--target_modules q_proj k_proj v_proj o_proj gate_proj up_proj down_proj`
+
+# Note: If you need to use DeepSpeed ZeRO-2/ZeRO-3 but encounter hangs
+# try using transformers==4.51.3
+
 CUDA_VISIBLE_DEVICES=0 \
 swift sft \
     --model Qwen/Qwen3-30B-A3B-Instruct-2507 \
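The added comment pins a known-good transformers version for DeepSpeed ZeRO-2/ZeRO-3 runs. A minimal pre-flight sketch of that advice in Python, assuming you want to fail fast before launch; `check_transformers` is a hypothetical helper, not part of ms-swift:

# Hypothetical pre-flight check: abort before launching a DeepSpeed
# ZeRO-2/ZeRO-3 run if the installed transformers version is not the
# one the script comment above suggests.
from importlib.metadata import PackageNotFoundError, version

def check_transformers(required: str = '4.51.3') -> None:
    try:
        installed = version('transformers')
    except PackageNotFoundError:
        raise RuntimeError('transformers is not installed')
    if installed != required:
        raise RuntimeError(
            f'transformers=={installed} found; the script comment suggests '
            f'transformers=={required} to avoid DeepSpeed ZeRO-2/ZeRO-3 hangs.')

check_transformers()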

examples/train/packing/qwen2_5_vl.sh

Lines changed: 0 additions & 2 deletions

@@ -4,8 +4,6 @@
 # For local datasets, it is recommended to use streaming: `--streaming true` (save memory)
 # You can also use padding_free to avoid the space/time cost caused by multi-modal packing:
 # https://github.com/modelscope/ms-swift/blob/main/examples/train/padding_free/sft.sh
-pip install "transformers==4.51.*"
-
 NPROC_PER_NODE=4 \
 MAX_PIXELS=1003520 \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \

examples/train/packing/streaming.sh

Lines changed: 0 additions & 2 deletions

@@ -1,8 +1,6 @@
 # 4 * 36GB
 # A demo using the Hugging Face dataset
 # The first model weights will be saved around step 70.
-pip install "transformers==4.51.*"
-
 NPROC_PER_NODE=4 \
 MAX_PIXELS=1003520 \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \

swift/megatron/argument/train_args.py

Lines changed: 2 additions & 0 deletions

@@ -75,3 +75,5 @@ def __post_init__(self):
         if self.load is None and self.no_initialization:
             raise ValueError('You did not pass `--load`, so you need to set `--no_initialization false` '
                              'to allow the model to initialize weights properly.')
+        if self.cached_dataset and self.context_parallel_size > 1:
+            raise ValueError('`cached_dataset` does not support context parallelism.')
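The new guard follows the fail-fast `__post_init__` validation pattern already used in this file: reject unsupported argument combinations at construction time, before any expensive Megatron setup runs. A minimal standalone sketch of the pattern; `TrainArgsSketch` is illustrative only and mirrors just the two fields involved, not the real ms-swift arguments class:

# Illustrative sketch (not the actual ms-swift class): a dataclass that
# validates an unsupported argument combination as soon as it is built.
from dataclasses import dataclass
from typing import Optional

@dataclass
class TrainArgsSketch:
    cached_dataset: Optional[str] = None
    context_parallel_size: int = 1

    def __post_init__(self):
        # Reject the unsupported combination up front.
        if self.cached_dataset and self.context_parallel_size > 1:
            raise ValueError('`cached_dataset` does not support context parallelism.')

TrainArgsSketch(cached_dataset='train.cache', context_parallel_size=2)  # raises ValueError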

swift/megatron/init.py

Lines changed: 1 addition & 1 deletion

@@ -397,7 +397,7 @@ def sharded_state_dict(
         metadata: Optional[dict] = None,
     ) -> ShardedStateDict:
         sharded_state_dict = tuners_sharded_state_dict(self, prefix, sharded_offsets, metadata)
-        if prefix == 'output_layer.':
+        if prefix in {'output_layer.', 'language_model.output_layer.'}:
             for k in list(sharded_state_dict.keys()):
                 if '_extra_state' in k:
                     # Old GPT checkpoints only stored the output layer weight key. So we remove the
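This is the core of the bugfix: in multimodal Megatron models the GPT stack is wrapped in a `language_model` submodule, so the output layer arrives with the prefix `language_model.output_layer.` and the old equality check never matched it, leaving `_extra_state` keys unfiltered. A simplified sketch of the before/after behavior, assuming a plain dict stands in for the sharded state dict; `strip_extra_state` is illustrative, not the actual ms-swift code:

# Simplified illustration of the prefix check: drop `_extra_state` entries
# whenever the output layer is being sharded, whether or not the model
# wraps it in a `language_model.` submodule.
OUTPUT_LAYER_PREFIXES = {'output_layer.', 'language_model.output_layer.'}

def strip_extra_state(sharded_state_dict: dict, prefix: str) -> dict:
    if prefix in OUTPUT_LAYER_PREFIXES:
        return {k: v for k, v in sharded_state_dict.items() if '_extra_state' not in k}
    return sharded_state_dict

# Before the fix, `prefix == 'output_layer.'` silently skipped the
# multimodal case below:
print(strip_extra_state({'language_model.output_layer._extra_state': None},
                        'language_model.output_layer.'))  # -> {}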
