Skip to content

Commit 09974ef

Browse files
authored
[compat] compat transformers 5.2.0 (#8075)
1 parent 6f71621 commit 09974ef

File tree

8 files changed

+15
-6
lines changed

8 files changed

+15
-6
lines changed

examples/models/qwen3_next/transformers.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ swift sft \
1313
--lora_rank 8 \
1414
--lora_alpha 32 \
1515
--target_modules all-linear \
16+
--experts_impl grouped_mm \
1617
--router_aux_loss_coef 1e-3 \
1718
--gradient_accumulation_steps 2 \
1819
--eval_steps 50 \

examples/models/qwen3_vl/transformers.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ swift sft \
1919
--lora_alpha 32 \
2020
--target_modules all-linear \
2121
--router_aux_loss_coef 1e-3 \
22+
--experts_impl grouped_mm \
2223
--freeze_vit true \
2324
--freeze_aligner true \
2425
--gradient_accumulation_steps 4 \

examples/models/qwen3_vl/zero3.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ swift sft \
1919
--lora_alpha 32 \
2020
--target_modules all-linear \
2121
--router_aux_loss_coef 1e-3 \
22+
--experts_impl grouped_mm \
2223
--freeze_vit true \
2324
--freeze_aligner true \
2425
--gradient_accumulation_steps 1 \

examples/train/moe/qwen3_moe.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ swift sft \
1919
--lora_rank 8 \
2020
--lora_alpha 32 \
2121
--router_aux_loss_coef 1e-3 \
22+
--experts_impl grouped_mm \
2223
--gradient_accumulation_steps 16 \
2324
--eval_steps 50 \
2425
--save_steps 50 \

swift/model/models/qwen.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -950,14 +950,10 @@ def _compat_qwen3_vl_mixed_data(model, processor, is_moe: bool = False):
950950
if hasattr(model, 'origin_forward'):
951951
return
952952
from transformers.models.qwen3_vl.modeling_qwen3_vl import (Cache, Qwen3VLModelOutputWithPast, TransformersKwargs,
953-
Unpack, check_model_inputs)
953+
Unpack)
954954
from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeModelOutputWithPast
955955
output_cls = Qwen3VLMoeModelOutputWithPast if is_moe else Qwen3VLModelOutputWithPast
956956

957-
if version.parse(transformers.__version__) >= version.parse('4.57.2'):
958-
check_model_inputs = check_model_inputs()
959-
960-
@check_model_inputs
961957
def forward(
962958
self,
963959
input_ids: torch.LongTensor = None,

swift/model/register.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ def __init__(
183183
self.attn_impl = attn_impl
184184
self.attn_impl_keys = None
185185
experts_impl = experts_impl or kwargs.get('experts_implementation')
186+
if experts_impl is not None and not transformers_5:
187+
raise ValueError('experts_impl is only supported in "transformers>=5.0".')
186188
self.experts_impl = experts_impl
187189
self.rope_scaling = rope_scaling
188190
self.max_model_len = max_model_len

swift/trainers/arguments.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ class TrainArgumentsMixin:
5757
acc_strategy (Literal['token', 'seq']): The strategy for calculating accuracy during training and validation.
5858
Can be 'token' for token-level accuracy or 'seq' for sequence-level accuracy. Defaults to 'token'.
5959
train_dataloader_shuffle (bool): Whether to shuffle the training data. Defaults to True.
60+
group_by_length (bool): Whether to group samples with approximately the same length together in the
61+
training dataset (with a random factor).
6062
max_epochs (Optional[int]): The total number of training epochs to perform. Overrides `num_train_epochs`.
6163
Defaults to None.
6264
aligner_lr (Optional[float]): A specific learning rate for the aligner part of the model. Defaults to None.
@@ -148,6 +150,7 @@ class TrainArgumentsMixin:
148150
check_model: bool = True
149151
acc_strategy: Literal['token', 'seq'] = 'token'
150152
train_dataloader_shuffle: bool = True
153+
group_by_length: bool = False
151154
max_epochs: Optional[int] = None
152155
aligner_lr: Optional[float] = None
153156
vit_lr: Optional[float] = None

swift/trainers/mixin.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -964,7 +964,11 @@ def _maybe_log_save_evaluate(self, tr_loss, *args, **kwargs):
964964
self.control.should_log = False
965965

966966
# all_gather + mean() to get average loss over all processes
967-
tr_loss_scalar = self._nested_gather(tr_loss).mean().item()
967+
if version.parse(transformers.__version__) >= version.parse('5.2.0'):
968+
from transformers.trainer_pt_utils import nested_gather
969+
tr_loss_scalar = nested_gather(tr_loss, self.args.parallel_mode).mean().item()
970+
else:
971+
tr_loss_scalar = self._nested_gather(tr_loss).mean().item()
968972
loss = tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged)
969973
logs: Dict[str, float] = {'loss': loss} # loss first
970974
if version.parse(transformers.__version__) >= version.parse('4.38'):

0 commit comments

Comments (0)