Support infer and deploy of embedding models (#4927)

tastelikefeet · web-flow · commit ca7cf034fe3d · 2025-07-12T23:47:57.000+08:00
diff --git a/README.md b/README.md
@@ -75,6 +75,7 @@ You can contact us and communicate with us by adding our group:
 
 
 ## 🎉 News
+- 🎁 2025.07.12: Deployment (pt/vLLM/SGLang) of Embedding models is supported. An example can be found [here](examples/deploy/embedding/client.py).
 - 🎁 2025.07.09: Megatron-SWIFT supports LoRA training. Compared to ms-swift, it achieves significant speedup on MoE models. Training scripts can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/megatron/lora).
 - 🎁 2025.06.23: Fine-tuning of reranker models is supported. Training scripts can be found here: [Reranker](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_reranker.sh).
 - 🎁 2025.06.18: Support for accelerating the ms-swift [inference](https://github.com/modelscope/ms-swift/blob/main/examples/infer/sglang), deployment, evaluation, and UI modules using the [sglang](https://github.com/sgl-project/sglang) inference acceleration engine. Simply set `--infer_backend sglang` to enable it.
diff --git a/README_CN.md b/README_CN.md
@@ -71,6 +71,7 @@
 - **模型量化**：支持AWQ、GPTQ、FP8和BNB的量化导出，导出的模型支持使用vLLM/SGLang/LmDeploy推理加速，并支持继续训练。
 
 ## 🎉 新闻
+- 🎁 2025.07.12: 支持Embedding模型的部署(pt/vLLM/SGLang)，查看[这里](examples/deploy/embedding/client.py)。
 - 🎁 2025.07.09: Megatron-SWIFT支持LoRA训练。相比ms-swift，在MoE模型提速显著。训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/megatron/lora)。
 - 🎁 2025.06.23: 支持Reranker模型训练，训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_reranker.sh)。
 - 🎁 2025.06.18: 支持使用[sglang](https://github.com/sgl-project/sglang)推理加速引擎对ms-swift[推理](https://github.com/modelscope/ms-swift/blob/main/examples/infer/sglang)/部署/评测/ui模块进行加速，设置`--infer_backend sglang`即可。
diff --git a/docs/source/BestPractices/Embedding训练.md b/docs/source/BestPractices/Embedding训练.md
@@ -107,7 +107,9 @@ SWIFT提供了两个脚手架训练脚本：
 
 ## 推理
 
-SWIFT当前没有支持Embedding的模型推理和部署（时间问题），可以使用原模型的代码进行推理：
+SWIFT已经支持GME、GTE、Qwen3-Embedding模型的部署，请查看[这里](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/embedding/client.py).
+
+也可以使用原模型的代码进行推理：
 
 https://www.modelscope.cn/models/iic/gte_Qwen2-7B-instruct
 
diff --git a/docs/source_en/BestPractices/Embedding.md b/docs/source_en/BestPractices/Embedding.md
@@ -107,7 +107,9 @@ SWIFT provides two scaffold training scripts:
 
 ## Inference
 
-SWIFT currently does not support Embedding model inference and deployment (due to time constraints). You can use the original model's code for inference:
+SWIFT now supports the deployment of GME, GTE, and Qwen3-Embedding models; please check [here](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/embedding/client.py).
+
+You can also use the original model's code for inference:
 
 https://www.modelscope.cn/models/iic/gte_Qwen2-7B-instruct
 
diff --git a/examples/deploy/embedding/client.py b/examples/deploy/embedding/client.py
@@ -0,0 +1,57 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+from openai import OpenAI
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def infer(client, model: str, messages):
+    """Request an embedding for ``messages`` and print a short preview of it.
+
+    Args:
+        client: Object exposing ``chat.completions.create`` (an ``OpenAI``
+            client in this example).
+        model: Model id sent with the request.
+        messages: Chat-style messages. The ``content`` of the last message may
+            be a plain string or a list of typed parts such as
+            ``{'type': 'text', 'text': ...}``.
+
+    Returns:
+        The ``embedding`` entry of the first item in the response data.
+    """
+    # You can also use client.embeddings.create
+    # But this interface does not support multi-modal medias
+    resp = client.chat.completions.create(model=model, messages=messages)
+    emb = resp.data[0]['embedding']
+    shape = len(emb)
+    sample = str(emb)
+    if shape > 6:
+        # Long vectors are abbreviated to their first and last three values.
+        sample = str(emb[:3])[:-1] + ', ..., ' + str(emb[-3:])[1:]
+    # Print the text that was actually sent; the previous code interpolated
+    # the builtin `input` function here.
+    content = messages[-1]['content']
+    if isinstance(content, str):
+        query = content
+    else:
+        query = ' '.join(part['text'] for part in content if part.get('type') == 'text')
+    print(f'query: {query}')
+    print(f'Embedding(shape: [1, {shape}]): {sample}')
+    return emb
+
+
+def run_client(host: str = '127.0.0.1', port: int = 8000):
+    """Connect to the server at ``host:port`` and embed one sample text query.
+
+    Args:
+        host: Host name used to build the ``/v1`` base URL.
+        port: Port used to build the ``/v1`` base URL.
+    """
+    endpoint = f'http://{host}:{port}/v1'
+    client = OpenAI(api_key='EMPTY', base_url=endpoint)
+    # Use the first model id the server lists.
+    model = client.models.list().data[0].id
+    print(f'model: {model}')
+
+    content = [
+        # Optional image part, left disabled in this example:
+        # {
+        #   'type': 'image',
+        #   'image': 'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png'
+        # },
+        {
+            'type': 'text',
+            'text': 'What is the capital of China?'
+        },
+    ]
+    infer(client, model, [{'role': 'user', 'content': content}])
+
+
+if __name__ == '__main__':
+    # Local import: swift is only needed when this file is run as a script.
+    from swift.llm import run_deploy, DeployArguments
+    # The value yielded by the run_deploy context manager is passed to the
+    # client as the port to connect to.
+    with run_deploy(
+            DeployArguments(
+                model='Qwen/Qwen3-Embedding-0.6B',
+                task_type='embedding',
+                infer_backend='vllm',
+                verbose=False,
+                log_interval=-1)) as port:
+        run_client(port=port)
diff --git a/examples/deploy/embedding/server.sh b/examples/deploy/embedding/server.sh
@@ -0,0 +1,5 @@
+# Deploy Qwen/Qwen3-Embedding-0.6B with the sglang backend on 0.0.0.0:8000,
+# with only CUDA device 0 visible to the process.
+CUDA_VISIBLE_DEVICES=0 swift deploy \
+    --host 0.0.0.0 \
+    --port 8000 \
+    --model Qwen/Qwen3-Embedding-0.6B \
+    --infer_backend sglang
diff --git a/swift/llm/argument/infer_args.py b/swift/llm/argument/infer_args.py
@@ -104,6 +104,8 @@ def get_vllm_engine_kwargs(self):
             'use_async_engine': self.use_async_engine,
             'quantization': self.vllm_quantization,
         }
+        if self.task_type == 'embedding':
+            kwargs['task_type'] = 'embed'
         return kwargs
 
 
@@ -135,6 +137,8 @@ def get_sglang_engine_kwargs(self):
             'enable_dp_attention': self.sglang_enable_dp_attention,
             'disable_custom_all_reduce': self.sglang_disable_custom_all_reduce,
         }
+        if self.task_type == 'embedding':
+            kwargs['task_type'] = 'embedding'
         return kwargs
 
 
diff --git a/swift/llm/infer/deploy.py b/swift/llm/infer/deploy.py
@@ -16,7 +16,7 @@
 from fastapi.responses import JSONResponse, StreamingResponse
 
 from swift.llm import AdapterRequest, DeployArguments
-from swift.llm.infer.protocol import MultiModalRequestMixin
+from swift.llm.infer.protocol import EmbeddingRequest, MultiModalRequestMixin
 from swift.plugin import InferStats
 from swift.utils import JsonlWriter, get_logger
 from .infer import SwiftInfer
@@ -34,6 +34,7 @@ def _register_app(self):
         self.app.get('/v1/models')(self.get_available_models)
         self.app.post('/v1/chat/completions')(self.create_chat_completion)
         self.app.post('/v1/completions')(self.create_completion)
+        self.app.post('/v1/embeddings')(self.create_embedding)
 
     def __init__(self, args: Union[List[str], DeployArguments, None] = None) -> None:
         super().__init__(args)
@@ -183,13 +184,20 @@ async def _gen_wrapper():
                 yield 'data: [DONE]\n\n'
 
             return StreamingResponse(_gen_wrapper(), media_type='text/event-stream')
-        else:
+        elif hasattr(res_or_gen, 'choices'):
+            # instance of ChatCompletionResponse
             return self._post_process(request_info, res_or_gen, return_cmpl_response)
+        else:
+            return res_or_gen
 
     async def create_completion(self, request: CompletionRequest, raw_request: Request):
         # Converts the completion request with
         # `ChatCompletionRequest.from_cmpl_request` and delegates to
         # `create_chat_completion`, passing `return_cmpl_response=True`.
         chat_request = ChatCompletionRequest.from_cmpl_request(request)
         return await self.create_chat_completion(chat_request, raw_request, return_cmpl_response=True)
 
+    async def create_embedding(self, request: EmbeddingRequest, raw_request: Request):
+        # Reuses the chat-completion pipeline: the request is converted with
+        # `ChatCompletionRequest.from_cmpl_request` and handed to
+        # `create_chat_completion`.
+        # NOTE(review): `from_cmpl_request` is also used for `CompletionRequest`;
+        # confirm it accepts an `EmbeddingRequest`.
+        chat_request = ChatCompletionRequest.from_cmpl_request(request)
+        return await self.create_chat_completion(chat_request, raw_request, return_cmpl_response=True)
+
     def run(self):
         args = self.args
         self.jsonl_writer = JsonlWriter(args.result_path) if args.result_path else None
diff --git a/swift/llm/infer/infer.py b/swift/llm/infer/infer.py
@@ -93,6 +93,18 @@ def run(self) -> List[Dict[str, Any]]:
             logger.info(f'The inference results have been saved to result_path: `{args.result_path}`.')
         return result
 
+    @staticmethod
+    def parse_data_from_response(response):
+        """Return printable text for a chat or an embedding response.
+
+        A response with `choices` yields the first choice's message content.
+        A response with `data` yields a one-line summary of the first
+        embedding; vectors longer than six values show only the first and
+        last three. Any other object yields None.
+        """
+        if hasattr(response, 'choices'):
+            return response.choices[0].message.content
+        if not hasattr(response, 'data'):
+            return None
+        vector = response.data[0].embedding
+        dim = len(vector)
+        if dim > 6:
+            head, tail = str(vector[:3]), str(vector[-3:])
+            preview = head[:-1] + ', ..., ' + tail[1:]
+        else:
+            preview = str(vector)
+        return f'Embedding(shape: [1, {dim}]): {preview}'
+
     def infer_single(self, infer_request: Union[InferRequest, Dict[str, Any]], request_config: RequestConfig) -> str:
         res_or_gen = self.infer([infer_request],
                                 request_config,
@@ -107,7 +119,7 @@ def infer_single(self, infer_request: Union[InferRequest, Dict[str, Any]], reque
                 response += delta
             print()
         else:
-            response = res_or_gen.choices[0].message.content
+            response = self.parse_data_from_response(res_or_gen)
             print(response)
         print('-' * 50)
         return response
diff --git a/swift/llm/infer/infer_engine/pt_engine.py b/swift/llm/infer/infer_engine/pt_engine.py
@@ -19,7 +19,8 @@
 from swift.plugin import Metric
 from swift.tuners import Swift
 from ..protocol import (ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
-                        ChatCompletionStreamResponse, ChatMessage, DeltaMessage, RequestConfig, random_uuid)
+                        ChatCompletionStreamResponse, ChatMessage, DeltaMessage, EmbeddingResponse,
+                        EmbeddingResponseData, RequestConfig, random_uuid)
 from .infer_engine import InferEngine
 from .utils import AdapterRequest, InferStreamer, LogitsStreamer, TokensIteratorStreamer, prepare_generation_config
 
@@ -325,26 +326,42 @@ def _infer_forward(self,
             call_kwargs['adapter_names'] = adapter_names
         num_prompt_tokens = self._get_num_tokens(inputs)
         inputs.pop('labels', None)
-        logits = self.model(**inputs, **call_kwargs).logits
+        output = self.model(**inputs, **call_kwargs)
+        if hasattr(output, 'logits'):
+            logits = output.logits
+        elif 'last_hidden_state' in output:
+            # embeddings
+            logits = output['last_hidden_state']
         if template.mode == 'seq_cls':
             preds, logprobs = template.decode_seq_cls(logits, top_logprobs)
         elif template.mode == 'prm':
             preds = template.decode_prm(inputs['input_ids'], logits)
             logprobs = [None] * len(preds)
+        elif template.mode == 'embedding':
+            preds = logits
+            logprobs = [None] * len(preds)
         else:
             raise ValueError(f'Unsupported mode: {template.mode}')
 
         res = []
         for i, pred in enumerate(preds):
             usage_info = self._get_usage_info(num_prompt_tokens, 1)
-            choices = [
-                ChatCompletionResponseChoice(
-                    index=0,
-                    message=ChatMessage(role='assistant', content=pred, tool_calls=None),
-                    finish_reason='stop',
-                    logprobs=logprobs[i])
-            ]
-            res.append(ChatCompletionResponse(model=self.model_name, choices=choices, usage=usage_info))
+            if template.mode != 'embedding':
+                choices = [
+                    ChatCompletionResponseChoice(
+                        index=0,
+                        message=ChatMessage(role='assistant', content=pred, tool_calls=None),
+                        finish_reason='stop',
+                        logprobs=logprobs[i])
+                ]
+                res.append(ChatCompletionResponse(model=self.model_name, choices=choices, usage=usage_info))
+            else:
+                res.append(
+                    EmbeddingResponse(
+                        model=self.model_name,
+                        usage=usage_info,
+                        data=[EmbeddingResponseData(embedding=pred.to(torch.float32).cpu().numpy().tolist())]))
+
         return res
 
     def _infer_full(self,
@@ -502,7 +519,8 @@ def _gen_wrapper():
             return _gen_wrapper()
         else:
             if len(kwargs) > 0:
-                infer_func = self._infer_forward if template.mode in ('seq_cls', 'prm') else self._infer_full
+                infer_func = self._infer_forward if template.mode in ('seq_cls', 'prm',
+                                                                      'embedding') else self._infer_full
                 res = infer_func(**kwargs)
             else:
                 res = []
diff --git a/swift/llm/infer/infer_engine/sglang_engine.py b/swift/llm/infer/infer_engine/sglang_engine.py
@@ -13,7 +13,8 @@
 from swift.llm import InferRequest, Template, TemplateMeta, get_model_tokenizer
 from swift.plugin import Metric
 from ..protocol import (ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
-                        ChatCompletionStreamResponse, ChatMessage, DeltaMessage, RequestConfig, random_uuid)
+                        ChatCompletionStreamResponse, ChatMessage, DeltaMessage, EmbeddingResponse,
+                        EmbeddingResponseData, RequestConfig, random_uuid)
 from .infer_engine import InferEngine
 
 
@@ -43,6 +44,7 @@ def __init__(
         log_level='error',
         engine_kwargs: Optional[Dict[str, Any]] = None,
         template: Optional[Template] = None,
+        task_type: Optional[str] = None,
     ):
         if engine_kwargs is None:
             engine_kwargs = {}
@@ -77,6 +79,9 @@ def __init__(
             log_level=log_level,
             **engine_kwargs,
         )
+        self.task_type = task_type
+        if task_type == 'embedding':
+            self.server_args.is_embedding = True
         self.engine = sgl.Engine(server_args=self.server_args)
         self._load_generation_config()
 
@@ -151,10 +156,16 @@ async def infer_async(self,
             template = self.default_template
 
         template.set_mode('pt')
+        if self.task_type == 'embedding':
+            # TODO Refactor me
+            template.infer_backend = 'sglang'
+            template.task_type = self.task_type
+            template.set_mode('embedding')
         loop = asyncio.get_running_loop()
         with torch.inference_mode():
             inputs = await loop.run_in_executor(None, template.encode, infer_request)
-
+        if self.task_type == 'embedding':
+            inputs.pop('length', None)
         self.set_default_max_tokens(request_config, inputs)
         generation_config = self._prepare_generation_config(request_config)
         self._add_stop_words(generation_config, request_config, template.template_meta)
@@ -163,9 +174,25 @@ async def infer_async(self,
             kwargs = pre_infer_hook(kwargs)
         if request_config.stream:
             return self._infer_stream_async(**kwargs)
+        elif self.task_type == 'embedding':
+            kwargs.pop('generation_config', None)
+            return await self._infer_embedding_async(**kwargs)
         else:
             return await self._infer_full_async(**kwargs)
 
+    async def _infer_embedding_async(self, template: Template, inputs: Dict[str, Any]) -> EmbeddingResponse:
+        """Send one embedding request to the sglang engine and wrap the result.
+
+        Args:
+            template: Not used by this method.
+            inputs: Encoded inputs; `input_ids` is required, `images` and
+                `audios` are forwarded when present.
+
+        Returns:
+            An `EmbeddingResponse` holding the `embedding` of the first output
+            and usage built from its `prompt_tokens` meta info.
+        """
+        from sglang.srt.managers.io_struct import EmbeddingReqInput
+        request = EmbeddingReqInput(
+            input_ids=inputs['input_ids'],
+            image_data=inputs.get('images'),
+            audio_data=inputs.get('audios'),
+        )
+        # Only the first item produced by the request generator is consumed.
+        stream = self.engine.tokenizer_manager.generate_request(request, None)
+        result = await stream.__anext__()
+        prompt_tokens = result['meta_info']['prompt_tokens']
+        usage = self._get_usage_info(prompt_tokens, 0)
+        embedding_data = [EmbeddingResponseData(embedding=result['embedding'])]
+        return EmbeddingResponse(model=self.model_name, data=embedding_data, usage=usage, id=random_uuid())
+
     async def _infer_full_async(self, template: Template, inputs: Dict[str, Any],
                                 generation_config: Dict[str, Any]) -> ChatCompletionResponse:
         output = await self.engine.async_generate(**inputs, sampling_params=generation_config)
diff --git a/swift/llm/infer/infer_engine/vllm_engine.py b/swift/llm/infer/infer_engine/vllm_engine.py
diff --git a/swift/llm/infer/protocol.py b/swift/llm/infer/protocol.py
diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py
diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py
diff --git a/swift/utils/utils.py b/swift/utils/utils.py

Original file line number	Diff line number	Diff line change
`@@ -104,6 +104,8 @@ def get_vllm_engine_kwargs(self):`
`104`	`104`	`'use_async_engine': self.use_async_engine,`
`105`	`105`	`'quantization': self.vllm_quantization,`
`106`	`106`	`}`
	`107`	`+ if self.task_type == 'embedding':`
	`108`	`+ kwargs['task_type'] = 'embed'`
`107`	`109`	`return kwargs`
`108`	`110`
`109`	`111`
`@@ -135,6 +137,8 @@ def get_sglang_engine_kwargs(self):`
`135`	`137`	`'enable_dp_attention': self.sglang_enable_dp_attention,`
`136`	`138`	`'disable_custom_all_reduce': self.sglang_disable_custom_all_reduce,`
`137`	`139`	`}`
	`140`	`+ if self.task_type == 'embedding':`
	`141`	`+ kwargs['task_type'] = 'embedding'`
`138`	`142`	`return kwargs`
`139`	`143`
`140`	`144`