Merged
2 changes: 1 addition & 1 deletion docs/source/Instruction/Command-line-parameters.md
@@ -800,7 +800,7 @@ qwen2_5_omni, in addition to the model-specific parameters of qwen2_5_vl and qwen2_audio, also
- Tip: ms-swift fine-tunes only the thinker part; setting this to False is recommended to reduce GPU memory usage (only the thinker part of the model structure is created).

### qwen3_vl
The parameter meanings match those in the `qwen_vl_utils>=0.0.14` library; see [here](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24). The library's global default values can be overridden by passing the following environment variables.
The parameter meanings match those in the `qwen_vl_utils>=0.0.14` library; see [here](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24). The library's global default values can be overridden by passing the following environment variables. (Environment variables used by `qwen2_5_vl`, such as `MAX_PIXELS` and `VIDEO_MAX_PIXELS`, are also supported and converted automatically.)

- SPATIAL_MERGE_SIZE: defaults to 2.
- IMAGE_MIN_TOKEN_NUM: defaults to `4`, the minimum number of image tokens per image.
3 changes: 2 additions & 1 deletion docs/source_en/Instruction/Command-line-parameters.md
@@ -825,7 +825,8 @@ qwen2_5_omni not only includes the model-specific parameters of qwen2_5_vl and q


### qwen3_vl
The parameter meanings are the same as in the `qwen_vl_utils>=0.0.14` library; see https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24. By passing the following environment variables you can override the library's global default values:
The parameter meanings are the same as in the `qwen_vl_utils>=0.0.14` library; see https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24. By passing the following environment variables you can override the library's global default values. (Environment variables used by `qwen2_5_vl`, such as `MAX_PIXELS` and `VIDEO_MAX_PIXELS`, are also supported and converted automatically.)


- SPATIAL_MERGE_SIZE: default 2.
- IMAGE_MIN_TOKEN_NUM: default `4`, denotes the minimum number of image tokens per image.
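For illustration, a minimal sketch of how these overrides would be applied; the values are made up, and the variables must be set before the model and processor are created:

```python
import os

# Hypothetical values; any variable from the list above works the same way.
os.environ['SPATIAL_MERGE_SIZE'] = '2'
os.environ['IMAGE_MIN_TOKEN_NUM'] = '16'
# qwen2_5_vl-style names are also accepted and converted automatically:
os.environ['MAX_PIXELS'] = str(1280 * 28 * 28)
```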
5 changes: 1 addition & 4 deletions examples/infer/demo_embedding.py
@@ -4,10 +4,7 @@

if __name__ == '__main__':
    engine = PtEngine(
        'Qwen/Qwen3-Embedding-4B',
        task_type='embedding',
        torch_dtype=torch.float16,
        attn_implementation='flash_attention_2')
        'Qwen/Qwen3-Embedding-4B', task_type='embedding', torch_dtype=torch.float16, attn_impl='flash_attention_2')
Reviewer comment (Contributor, severity: medium): For better readability and adherence to PEP 8 style guidelines (which recommend a line length of 79-99 characters), it's better to format the arguments to `PtEngine` across multiple lines, as it was before this change:

        'Qwen/Qwen3-Embedding-4B',
        task_type='embedding',
        torch_dtype=torch.float16,
        attn_impl='flash_attention_2')


infer_requests = [
    InferRequest(messages=[
2 changes: 1 addition & 1 deletion examples/infer/demo_reranker.py
@@ -7,7 +7,7 @@
        'Qwen/Qwen3-Reranker-4B',
        task_type='generative_reranker',
        torch_dtype=torch.float16,
        attn_implementation='flash_attention_2')
        attn_impl='flash_attention_2')

infer_request = InferRequest(
    messages=[{
9 changes: 8 additions & 1 deletion swift/llm/infer/infer_engine/pt_engine.py
@@ -21,6 +21,7 @@
from swift.llm import InferRequest, Template, TemplateMeta, get_model_tokenizer, safe_snapshot_download, to_device
from swift.plugin import Metric
from swift.tuners import Swift
from swift.utils import get_last_valid_indices
from ..protocol import (ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
                        ChatCompletionStreamResponse, ChatMessage, DeltaMessage, EmbeddingResponse,
                        EmbeddingResponseData, RequestConfig, random_uuid)
@@ -349,7 +350,13 @@ def _infer_forward(self, template: Template, inputs: Dict[str, Any], adapter_req
        negative_token = os.environ.get('GENERATIVE_RERANKER_NEGATIVE_TOKEN', 'no')
        token_false_id = template.tokenizer.convert_tokens_to_ids(negative_token)
        token_true_id = template.tokenizer.convert_tokens_to_ids(positive_token)
        batch_scores = logits[:, -1, :]
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            batch_scores = logits[:, -1, :]
        else:
            last_valid_indices = get_last_valid_indices(attention_mask)
            batch_indices = torch.arange(attention_mask.shape[0], device=logits.device)
            batch_scores = logits[batch_indices, last_valid_indices, :]
        true_vector = batch_scores[:, token_true_id]
        false_vector = batch_scores[:, token_false_id]
        batch_scores = torch.stack([false_vector, true_vector], dim=1).float()
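The imported `get_last_valid_indices` helper is the crux of this hunk: instead of always reading the logits at the final column, the reranker now reads them at the last non-padding position of each row. A minimal sketch of what the helper plausibly computes (the real implementation lives in `swift.utils` and may differ):

```python
import torch

def get_last_valid_indices(attention_mask: torch.Tensor) -> torch.Tensor:
    """Index of the last position with attention_mask == 1 in each row (sketch)."""
    # Weight every valid position by its index; argmax then returns the
    # largest index whose mask is 1. Works for left- and right-padded batches.
    positions = torch.arange(attention_mask.shape[1], device=attention_mask.device)
    return (attention_mask * positions).argmax(dim=1)

mask = torch.tensor([[1, 1, 1, 0],   # right-padded -> last valid index 2
                     [1, 1, 1, 1]])  # no padding   -> last valid index 3
print(get_last_valid_indices(mask))  # tensor([2, 3])
```

The stacked `[false_vector, true_vector]` logits are then presumably turned into a relevance probability downstream, e.g. via a softmax over the two columns.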
13 changes: 4 additions & 9 deletions swift/llm/model/patcher.py
@@ -9,6 +9,7 @@
import accelerate
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from accelerate.utils import find_device
from packaging import version
@@ -92,15 +93,9 @@ def _output_embedding_hook(module, args, kwargs, output):
    if attention_mask is None:
        attention_mask = output.get('attention_mask', None)
    hidden_states = output.logits
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        embeddings = hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = hidden_states.shape[0]
        embeddings = hidden_states[torch.arange(batch_size, device=hidden_states.device), sequence_lengths]
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

    sequence_lengths = get_last_valid_indices(attention_mask)
    embeddings = hidden_states[torch.arange(hidden_states.shape[0], device=hidden_states.device), sequence_lengths]
    embeddings = F.normalize(embeddings, p=2, dim=1)
    return {
        'last_hidden_state': embeddings.contiguous(),
    }
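A side note on the `F.normalize(..., p=2, dim=1)` at the end of the hook: once the pooled embeddings are L2-normalized, a plain matrix product is already their cosine similarity, so callers need no further normalization. A self-contained check:

```python
import torch
import torch.nn.functional as F

emb = F.normalize(torch.randn(4, 8), p=2, dim=1)  # stand-in for pooled hidden states
cos = emb @ emb.t()  # dot products of unit vectors == cosine similarities
assert torch.allclose(cos.diagonal(), torch.ones(4), atol=1e-6)
```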
3 changes: 3 additions & 0 deletions swift/llm/model/utils.py
@@ -9,6 +9,7 @@
import torch.nn.functional as F
from accelerate.utils import find_device
from modelscope.hub.utils.utils import get_cache_dir
from packaging import version
from torch import nn
from transformers import PretrainedConfig
from transformers.utils import strtobool
@@ -549,6 +550,8 @@ def _patch_conv3d():
    nn.Conv3d._original_forward = nn.Conv3d.forward

    def forward(self, x):
        if version.parse(torch.__version__) < version.parse('2.9.0'):
            return self._original_forward(x)
        if any(s != k for s, k in zip(self.stride, self.kernel_size)) or any(p != 0 for p in self.padding) or any(
                d != 1 for d in self.dilation) or self.groups != 1:
            raise NotImplementedError(
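For context on the guard above: the patched forward only claims the patch-embedding case (stride equal to kernel size, zero padding, dilation 1, a single group), where a Conv3d collapses into a reshape plus one matmul. A hedged sketch of that equivalence, as an illustration of the math rather than the code this PR installs:

```python
import torch

def conv3d_as_patch_embed(x: torch.Tensor, conv: torch.nn.Conv3d) -> torch.Tensor:
    """Non-overlapping Conv3d (stride == kernel_size, padding 0) as a matmul."""
    out_c, in_c, kt, kh, kw = conv.weight.shape
    b, c, t, h, w = x.shape
    # Split every axis into (num_patches, patch_size), move the patch content
    # into one flat dimension, then project with the flattened conv kernel.
    x = x.reshape(b, c, t // kt, kt, h // kh, kh, w // kw, kw)
    x = x.permute(0, 2, 4, 6, 1, 3, 5, 7).reshape(b, -1, c * kt * kh * kw)
    out = x @ conv.weight.reshape(out_c, -1).t()
    if conv.bias is not None:
        out = out + conv.bias
    return out.transpose(1, 2).reshape(b, out_c, t // kt, h // kh, w // kw)

conv = torch.nn.Conv3d(3, 8, kernel_size=2, stride=2)
x = torch.randn(1, 3, 4, 4, 4)
assert torch.allclose(conv(x), conv3d_as_patch_embed(x, conv), atol=1e-5)
```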