diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md
index 1881898b78..05fff05bd1 100644
--- a/docs/source/Instruction/Command-line-parameters.md
+++ b/docs/source/Instruction/Command-line-parameters.md
@@ -800,7 +800,7 @@ In addition to the model-specific parameters of qwen2_5_vl and qwen2_audio, qwen2_5_omni also
 - Tip: ms-swift fine-tunes only the thinker part; setting this to False is recommended to reduce GPU memory usage (only the thinker part of the model structure is created).
 
 ### qwen3_vl
-The parameter meanings are consistent with those in the `qwen_vl_utils>=0.0.14` library; see [here](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24). You can override the library's global default values by passing the following environment variables.
+The parameter meanings are consistent with those in the `qwen_vl_utils>=0.0.14` library; see [here](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24). You can override the library's global default values by passing the following environment variables. (Environment variables used by `qwen2_5_vl`, such as `MAX_PIXELS` and `VIDEO_MAX_PIXELS`, are also accepted and converted automatically.)
 
 - SPATIAL_MERGE_SIZE: default 2.
 - IMAGE_MIN_TOKEN_NUM: default `4`, the minimum number of image tokens for a single image.
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index 16d89ba300..da784b1709 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -825,7 +825,8 @@ qwen2_5_omni not only includes the model-specific parameters of qwen2_5_vl and q
 
 ### qwen3_vl
 
-The parameter meanings are the same as in the `qwen_vl_utils>=0.0.14` library — see here: https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24. By passing the following environment variables you can override the library's global default values:
+The parameter meanings are the same as in the `qwen_vl_utils>=0.0.14` library — see here: https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24. By passing the following environment variables, you can override the library's global default values. (Environment variables used by `qwen2_5_vl`, such as `MAX_PIXELS` and `VIDEO_MAX_PIXELS`, are also accepted and converted automatically.)
+
 - SPATIAL_MERGE_SIZE: default 2.
 - IMAGE_MIN_TOKEN_NUM: default `4`, denotes the minimum number of image tokens per image.
 
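For context, a minimal sketch of how these variables might be used (values are illustrative, not defaults; the assumption that they must be set before the engine/template is constructed reflects when `qwen_vl_utils` plausibly reads its globals, which this diff does not show):

```python
import os

# Illustrative values, not documented defaults: raise the per-image token floor and
# cap pixels via the qwen2_5_vl-style variable, which the docs above say is
# converted automatically for qwen3_vl.
os.environ['IMAGE_MIN_TOKEN_NUM'] = '32'
os.environ['MAX_PIXELS'] = str(1280 * 28 * 28)

# Assumed ordering: set the variables before building the engine so the overridden
# global defaults are picked up. The model ID is illustrative.
from swift.llm import PtEngine
engine = PtEngine('Qwen/Qwen3-VL-4B-Instruct', attn_impl='flash_attention_2')
```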
diff --git a/examples/infer/demo_embedding.py b/examples/infer/demo_embedding.py
index b8b8bd7cca..59dcc1c344 100644
--- a/examples/infer/demo_embedding.py
+++ b/examples/infer/demo_embedding.py
@@ -4,10 +4,7 @@
 
 if __name__ == '__main__':
     engine = PtEngine(
-        'Qwen/Qwen3-Embedding-4B',
-        task_type='embedding',
-        torch_dtype=torch.float16,
-        attn_implementation='flash_attention_2')
+        'Qwen/Qwen3-Embedding-4B', task_type='embedding', torch_dtype=torch.float16, attn_impl='flash_attention_2')
 
     infer_requests = [
         InferRequest(messages=[
diff --git a/examples/infer/demo_reranker.py b/examples/infer/demo_reranker.py
index 5a9798b149..6b654d70ce 100644
--- a/examples/infer/demo_reranker.py
+++ b/examples/infer/demo_reranker.py
@@ -7,7 +7,7 @@
         'Qwen/Qwen3-Reranker-4B',
         task_type='generative_reranker',
         torch_dtype=torch.float16,
-        attn_implementation='flash_attention_2')
+        attn_impl='flash_attention_2')
 
     infer_request = InferRequest(
         messages=[{
diff --git a/swift/llm/infer/infer_engine/pt_engine.py b/swift/llm/infer/infer_engine/pt_engine.py
index 03d73dca63..e5cea63da9 100644
--- a/swift/llm/infer/infer_engine/pt_engine.py
+++ b/swift/llm/infer/infer_engine/pt_engine.py
@@ -21,6 +21,7 @@
 from swift.llm import InferRequest, Template, TemplateMeta, get_model_tokenizer, safe_snapshot_download, to_device
 from swift.plugin import Metric
 from swift.tuners import Swift
+from swift.utils import get_last_valid_indices
 from ..protocol import (ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
                         ChatCompletionStreamResponse, ChatMessage, DeltaMessage, EmbeddingResponse,
                         EmbeddingResponseData, RequestConfig, random_uuid)
@@ -349,7 +350,13 @@ def _infer_forward(self, template: Template, inputs: Dict[str, Any], adapter_req
             negative_token = os.environ.get('GENERATIVE_RERANKER_NEGATIVE_TOKEN', 'no')
             token_false_id = template.tokenizer.convert_tokens_to_ids(negative_token)
             token_true_id = template.tokenizer.convert_tokens_to_ids(positive_token)
-            batch_scores = logits[:, -1, :]
+            attention_mask = inputs.get('attention_mask')
+            if attention_mask is None:
+                batch_scores = logits[:, -1, :]
+            else:
+                last_valid_indices = get_last_valid_indices(attention_mask)
+                batch_indices = torch.arange(attention_mask.shape[0], device=logits.device)
+                batch_scores = logits[batch_indices, last_valid_indices, :]
             true_vector = batch_scores[:, token_true_id]
             false_vector = batch_scores[:, token_false_id]
             batch_scores = torch.stack([false_vector, true_vector], dim=1).float()
diff --git a/swift/llm/model/patcher.py b/swift/llm/model/patcher.py
index bbda851a8e..59479a2378 100644
--- a/swift/llm/model/patcher.py
+++ b/swift/llm/model/patcher.py
@@ -9,6 +9,7 @@
 import accelerate
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import transformers
 from accelerate.utils import find_device
 from packaging import version
@@ -92,15 +93,9 @@ def _output_embedding_hook(module, args, kwargs, output):
     if attention_mask is None:
         attention_mask = output.get('attention_mask', None)
     hidden_states = output.logits
-    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
-    if left_padding:
-        embeddings = hidden_states[:, -1]
-    else:
-        sequence_lengths = attention_mask.sum(dim=1) - 1
-        batch_size = hidden_states.shape[0]
-        embeddings = hidden_states[torch.arange(batch_size, device=hidden_states.device), sequence_lengths]
-    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
-
+    sequence_lengths = get_last_valid_indices(attention_mask)
+    embeddings = hidden_states[torch.arange(hidden_states.shape[0], device=hidden_states.device), sequence_lengths]
+    embeddings = F.normalize(embeddings, p=2, dim=1)
     return {
         'last_hidden_state': embeddings.contiguous(),
     }
diff --git a/swift/llm/model/utils.py b/swift/llm/model/utils.py
index 25e966685a..ce179ea611 100644
--- a/swift/llm/model/utils.py
+++ b/swift/llm/model/utils.py
@@ -9,6 +9,7 @@
 import torch.nn.functional as F
 from accelerate.utils import find_device
 from modelscope.hub.utils.utils import get_cache_dir
+from packaging import version
 from torch import nn
 from transformers import PretrainedConfig
 from transformers.utils import strtobool
@@ -549,6 +550,8 @@ def _patch_conv3d():
     nn.Conv3d._original_forward = nn.Conv3d.forward
 
     def forward(self, x):
+        if version.parse(torch.__version__) < version.parse('2.9.0'):
+            return self._original_forward(x)
         if any(s != k for s, k in zip(self.stride, self.kernel_size)) or any(p != 0 for p in self.padding) or any(
                 d != 1 for d in self.dilation) or self.groups != 1:
             raise NotImplementedError(
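The call sites above all depend on `get_last_valid_indices` from `swift.utils`, whose definition is not part of this diff. A plausible sketch of the contract those call sites assume (name and signature taken from the import above; the body is a guess, not the actual implementation):

```python
import torch


def get_last_valid_indices(attention_mask: torch.Tensor) -> torch.Tensor:
    """Sketch: return the index of the last non-padding token per row of a
    (batch, seq_len) 0/1 attention mask, for left- and right-padded batches alike."""
    positions = torch.arange(attention_mask.shape[1], device=attention_mask.device)
    # Padding positions contribute 0 and real tokens contribute their position,
    # so the row-wise max is the position of the last real token.
    return (attention_mask * positions).max(dim=1).values


# e.g. right-padded [1, 1, 1, 0] -> 2, left-padded [0, 0, 1, 1] -> 3
mask = torch.tensor([[1, 1, 1, 0], [0, 0, 1, 1]])
print(get_last_valid_indices(mask))  # tensor([2, 3])
```

Gathering `logits[torch.arange(batch_size), last_valid_indices, :]` then selects the final real token's logits regardless of padding side, which is why `_output_embedding_hook` can drop its explicit left-padding/right-padding branch.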