Skip to content

Commit 09974ef

Browse files
authored
[compat] compat transformers 5.2.0 (#8075)
1 parent 6f71621 commit 09974ef

File tree

8 files changed

+15
-6
lines changed

8 files changed

+15
-6
lines changed

examples/models/qwen3_next/transformers.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ swift sft \
1313
--lora_rank 8 \
1414
--lora_alpha 32 \
1515
--target_modules all-linear \
16+
--experts_impl grouped_mm \
1617
--router_aux_loss_coef 1e-3 \
1718
--gradient_accumulation_steps 2 \
1819
--eval_steps 50 \

examples/models/qwen3_vl/transformers.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ swift sft \
1919
--lora_alpha 32 \
2020
--target_modules all-linear \
2121
--router_aux_loss_coef 1e-3 \
22+
--experts_impl grouped_mm \
2223
--freeze_vit true \
2324
--freeze_aligner true \
2425
--gradient_accumulation_steps 4 \

examples/models/qwen3_vl/zero3.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ swift sft \
1919
--lora_alpha 32 \
2020
--target_modules all-linear \
2121
--router_aux_loss_coef 1e-3 \
22+
--experts_impl grouped_mm \
2223
--freeze_vit true \
2324
--freeze_aligner true \
2425
--gradient_accumulation_steps 1 \

examples/train/moe/qwen3_moe.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ swift sft \
1919
--lora_rank 8 \
2020
--lora_alpha 32 \
2121
--router_aux_loss_coef 1e-3 \
22+
--experts_impl grouped_mm \
2223
--gradient_accumulation_steps 16 \
2324
--eval_steps 50 \
2425
--save_steps 50 \

swift/model/models/qwen.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -950,14 +950,10 @@ def _compat_qwen3_vl_mixed_data(model, processor, is_moe: bool = False):
950950
if hasattr(model, 'origin_forward'):
951951
return
952952
from transformers.models.qwen3_vl.modeling_qwen3_vl import (Cache, Qwen3VLModelOutputWithPast, TransformersKwargs,
953-
Unpack, check_model_inputs)
953+
Unpack)
954954
from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeModelOutputWithPast
955955
output_cls = Qwen3VLMoeModelOutputWithPast if is_moe else Qwen3VLModelOutputWithPast
956956

957-
if version.parse(transformers.__version__) >= version.parse('4.57.2'):
958-
check_model_inputs = check_model_inputs()
959-
960-
@check_model_inputs
961957
def forward(
962958
self,
963959
input_ids: torch.LongTensor = None,

swift/model/register.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ def __init__(
183183
self.attn_impl = attn_impl
184184
self.attn_impl_keys = None
185185
experts_impl = experts_impl or kwargs.get('experts_implementation')
186+
if experts_impl is not None and not transformers_5:
187+
raise ValueError('experts_impl is only supported in "transformers>=5.0".')
186188
self.experts_impl = experts_impl
187189
self.rope_scaling = rope_scaling
188190
self.max_model_len = max_model_len

swift/trainers/arguments.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ class TrainArgumentsMixin:
5757
acc_strategy (Literal['token', 'seq']): The strategy for calculating accuracy during training and validation.
5858
Can be 'token' for token-level accuracy or 'seq' for sequence-level accuracy. Defaults to 'token'.
5959
train_dataloader_shuffle (bool): Whether to shuffle the training data. Defaults to True.
60+
group_by_length (bool): Whether to group samples with approximately the same length together in the
61+
training dataset (with a random factor).
6062
max_epochs (Optional[int]): The total number of training epochs to perform. Overrides `num_train_epochs`.
6163
Defaults to None.
6264
aligner_lr (Optional[float]): A specific learning rate for the aligner part of the model. Defaults to None.
@@ -148,6 +150,7 @@ class TrainArgumentsMixin:
148150
check_model: bool = True
149151
acc_strategy: Literal['token', 'seq'] = 'token'
150152
train_dataloader_shuffle: bool = True
153+
group_by_length: bool = False
151154
max_epochs: Optional[int] = None
152155
aligner_lr: Optional[float] = None
153156
vit_lr: Optional[float] = None

swift/trainers/mixin.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -964,7 +964,11 @@ def _maybe_log_save_evaluate(self, tr_loss, *args, **kwargs):
964964
self.control.should_log = False
965965

966966
# all_gather + mean() to get average loss over all processes
967-
tr_loss_scalar = self._nested_gather(tr_loss).mean().item()
967+
if version.parse(transformers.__version__) >= version.parse('5.2.0'):
968+
from transformers.trainer_pt_utils import nested_gather
969+
tr_loss_scalar = nested_gather(tr_loss, self.args.parallel_mode).mean().item()
970+
else:
971+
tr_loss_scalar = self._nested_gather(tr_loss).mean().item()
968972
loss = tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged)
969973
logs: Dict[str, float] = {'loss': loss} # loss first
970974
if version.parse(transformers.__version__) >= version.parse('4.38'):

0 commit comments

Comments (0)