Commit 0335623

lmdeploy==0.7.3 vllm==0.8.4
1 parent fcf210f

File tree: 7 files changed, +491 −124 lines changed

gpt_server/model_backend/vllm_backend.py

Lines changed: 0 additions & 1 deletion

@@ -6,7 +6,6 @@
 from fastchat.utils import is_partial_stop
 from gpt_server.model_backend.base import ModelBackend
 from loguru import logger
-import vllm
 from lmdeploy.serve.openai.reasoning_parser import ReasoningParserManager
 from vllm.lora.request import LoRARequest
 from transformers import AutoTokenizer

gpt_server/model_worker/internvl.py

Lines changed: 12 additions & 20 deletions

@@ -37,33 +37,25 @@ def __init__(
         ]
         logger.info(f"{model_names[0]} 停用词: {self.stop}")
         # from https://github.com/xorbitsai/inference/blob/c70ea74fa820a613f8d577047ef1818da20a96b3/xinference/model/llm/llm_family_modelscope.json
-        self.vl_chat_template = (
-            "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
-        )
+        self.vl_chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"

     async def generate_stream_gate(self, params):
         self.call_ct += 1
         logger.info(f"params {params}")
         logger.info(f"worker_id: {self.worker_id}")
         try:
             messages = params.get("messages", [])
-            if not self.vision_config:
-                if isinstance(messages, list):
-                    pass
-                elif isinstance(messages, str):
-                    text = messages
-
-            else:  # multimodal
-                if isinstance(messages, list):
-                    text = self.tokenizer.apply_chat_template(
-                        messages,
-                        chat_template=self.vl_chat_template,
-                        tokenize=False,
-                        add_generation_prompt=True,
-                    )
-                params["prompt"] = text
-                # multimodal requests do not need input_ids
-                params["multimodal"] = True
+            # always multimodal
+            if isinstance(messages, list):
+                text = self.tokenizer.apply_chat_template(
+                    messages,
+                    chat_template=self.vl_chat_template,
+                    tokenize=False,
+                    add_generation_prompt=True,
+                )
+            params["prompt"] = text
+            # multimodal requests do not need input_ids
+            params["multimodal"] = True
             params["messages"] = messages
             params["stop"].extend(self.stop)
             params["stop_words_ids"] = self.stop_words_ids

pyproject.toml

Lines changed: 4 additions & 3 deletions

@@ -1,6 +1,6 @@
 [project]
 name = "gpt_server"
-version = "0.4.0"
+version = "0.4.1"
 description = "gpt_server是一个用于生产级部署LLMs或Embedding的开源框架。"
 readme = "README.md"
 license = { text = "Apache 2.0" }
@@ -12,14 +12,14 @@ dependencies = [
     "ffmpy",
     "fschat==0.2.36",
     "infinity-emb[all]==0.0.73",
-    "lmdeploy==0.7.2.post1",
+    "lmdeploy==0.7.3",
     "loguru>=0.7.2",
     "openai==1.55.3",
     "setuptools==75.2.0",
     "streamlit==1.39.0",
     "torch==2.5.1",
     "torchvision==0.20.1",
-    "vllm==0.8.3",
+    "vllm==0.8.4",
     "qwen_vl_utils",
     "evalscope[perf]==0.10.1",
     "modelscope==1.20.1",
@@ -39,6 +39,7 @@ override-dependencies = [
     "outlines==0.1.11",
     "transformers==4.50.0",
     "soundfile==0.13.1", # infinity
+    "xgrammar==0.1.18", # sglang[all]==0.4.5 depends on xgrammar==0.1.17
 ]

 [project.scripts]
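A quick post-upgrade sanity check (a minimal sketch, not part of this commit; it assumes both packages expose a standard __version__ attribute):

# Sanity-check sketch: confirm the bumped pins actually resolved in the
# active environment. Assumes lmdeploy and vllm expose __version__.
import lmdeploy
import vllm

assert lmdeploy.__version__ == "0.7.3", f"got {lmdeploy.__version__}"
assert vllm.__version__ == "0.8.4", f"got {vllm.__version__}"
print("lmdeploy:", lmdeploy.__version__, "| vllm:", vllm.__version__)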

requirements.txt

Lines changed: 62 additions & 12 deletions

@@ -115,7 +115,7 @@ coloredlogs==15.0.1
     # via onnxruntime
 colpali-engine==0.3.9
     # via infinity-emb
-compressed-tensors==0.9.2
+compressed-tensors==0.9.3
     # via
     #   sglang
     #   vllm
@@ -148,6 +148,12 @@ decorator==5.2.1
     #   librosa
 decord==0.6.0
     # via sglang
+deprecated==1.2.18
+    # via
+    #   opentelemetry-api
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+    #   opentelemetry-semantic-conventions
 depyf==0.18.0
     # via vllm
 dill==0.3.8
@@ -240,14 +246,20 @@ fsspec==2024.6.1
     #   torch
 funasr==1.2.6
     # via gpt-server (pyproject.toml)
-gguf==0.10.0
+gguf==0.14.0
     # via vllm
 gitdb==4.0.12
     # via gitpython
 gitpython==3.1.44
     # via streamlit
+googleapis-common-protos==1.70.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
 gputil==1.4.0
     # via colpali-engine
+grpcio==1.71.0
+    # via opentelemetry-exporter-otlp-proto-grpc
 h11==0.14.0
     # via
     #   httpcore
@@ -294,9 +306,10 @@ idna==3.10
     #   httpx
     #   requests
     #   yarl
-importlib-metadata==8.6.1
+importlib-metadata==8.0.0
     # via
     #   litellm
+    #   opentelemetry-api
     #   vllm
 infinity-emb==0.0.73
     # via gpt-server (pyproject.toml)
@@ -376,7 +389,7 @@ llvmlite==0.44.0
     #   pynndescent
 lm-format-enforcer==0.10.11
     # via vllm
-lmdeploy==0.7.2.post1
+lmdeploy==0.7.3
     # via gpt-server (pyproject.toml)
 loguru==0.7.3
     # via gpt-server (pyproject.toml)
@@ -424,9 +437,7 @@ multiprocess==0.70.16
     # via
     #   datasets
     #   evaluate
-nanobind==2.6.1
-    # via xgrammar
-narwhals==1.34.1
+narwhals==1.35.0
     # via
     #   altair
     #   plotly
@@ -445,7 +456,7 @@ nltk==3.9.1
     # via
     #   evalscope
     #   rouge-score
-numba==0.61.0
+numba==0.61.2
     # via
     #   librosa
     #   pynndescent
@@ -558,6 +569,37 @@ opencv-python-headless==4.11.0.86
     # via
     #   mistral-common
     #   vllm
+opentelemetry-api==1.26.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+    #   opentelemetry-sdk
+    #   opentelemetry-semantic-conventions
+    #   vllm
+opentelemetry-exporter-otlp==1.26.0
+    # via vllm
+opentelemetry-exporter-otlp-proto-common==1.26.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+opentelemetry-exporter-otlp-proto-grpc==1.26.0
+    # via opentelemetry-exporter-otlp
+opentelemetry-exporter-otlp-proto-http==1.26.0
+    # via opentelemetry-exporter-otlp
+opentelemetry-proto==1.26.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-common
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+opentelemetry-sdk==1.26.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+    #   vllm
+opentelemetry-semantic-conventions==0.47b0
+    # via opentelemetry-sdk
+opentelemetry-semantic-conventions-ai==0.4.3
+    # via vllm
 optimum==1.24.0
     # via infinity-emb
 orjson==3.10.16
@@ -655,19 +697,21 @@ prometheus-fastapi-instrumentator==7.1.0
     # via
     #   infinity-emb
     #   vllm
-prompt-toolkit==3.0.50
+prompt-toolkit==3.0.51
     # via
     #   fschat
     #   ipython
 propcache==0.3.1
     # via
     #   aiohttp
     #   yarl
-protobuf==5.29.4
+protobuf==4.25.6
     # via
+    #   googleapis-common-protos
     #   lmdeploy
     #   onnx
     #   onnxruntime
+    #   opentelemetry-proto
     #   optimum
     #   ray
     #   streamlit
@@ -803,6 +847,7 @@ requests==2.32.3
     #   huggingface-hub
     #   mistral-common
     #   modelscope
+    #   opentelemetry-exporter-otlp-proto-http
     #   oss2
     #   outlines
     #   pooch
@@ -870,6 +915,7 @@ sentencepiece==0.2.0
     # via
     #   evalscope
     #   funasr
+    #   gguf
     #   lmdeploy
     #   mistral-common
     #   vllm
@@ -1081,6 +1127,7 @@ typing-extensions==4.13.2
     #   mistral-common
     #   multidict
     #   openai
+    #   opentelemetry-sdk
     #   outlines
     #   pydantic
     #   pydantic-core
@@ -1118,7 +1165,7 @@ uvloop==0.21.0
     # via
     #   sglang
     #   uvicorn
-vllm==0.8.3
+vllm==0.8.4
     # via gpt-server (pyproject.toml)
 watchdog==5.0.3
     # via streamlit
@@ -1132,10 +1179,13 @@ wcwidth==0.2.13
     # via prompt-toolkit
 websockets==15.0.1
     # via uvicorn
+wrapt==1.17.2
+    # via deprecated
 xformers==0.0.29.post2
     # via vllm
-xgrammar==0.1.17
+xgrammar==0.1.18
     # via
+    #   --override (workspace)
     #   sglang
     #   vllm
 xxhash==3.5.0

tests/test_chat_template.py

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+from transformers import AutoTokenizer
+
+url = "https://opencompass.oss-cn-shanghai.aliyuncs.com/image/compass-hub/botchat_banner.png"
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "请描述这个图片",
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": url,
+                },
+            },
+        ],
+    }
+]
+
+chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+tokenizer = AutoTokenizer.from_pretrained(
+    "/home/dev/model/IntervitensInc/InternVL3-38B-AWQ"
+)
+# chat_template = None
+prompt = tokenizer.apply_chat_template(
+    conversation=messages,
+    chat_template=chat_template,
+    tokenize=False,
+    add_generation_prompt=True,
+)
+
+print(prompt)
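For reference, with `add_generation_prompt=True` and `add_vision_id` unset (so no "Picture 1:" prefix), this template should render the example messages to roughly the following prompt (hand-traced from the template, so an expectation rather than captured output):

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
请描述这个图片<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant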

tests/test_openai_completion_response_format.py

Lines changed: 3 additions & 3 deletions

@@ -5,15 +5,15 @@
 client = OpenAI(api_key="EMPTY", base_url="http://localhost:8082/v1")
 # Method 1
 output = client.chat.completions.create(
-    model="qwen-3b",
+    model="qwen",
     messages=[{"role": "user", "content": "南京到北京多远"}],
     response_format={"type": "text"},
 )
 print(output.choices[0].message.content)
 print("-" * 100)
 # Method 2
 output = client.chat.completions.create(
-    model="qwen-3b",
+    model="qwen",
     messages=[
         {"role": "system", "content": "用json进行回答"},
         {"role": "user", "content": "南京到北京多远"},
@@ -31,7 +31,7 @@ class Distance(BaseModel):


 output = client.beta.chat.completions.parse(
-    model="qwen-3b",
+    model="qwen",
     messages=[{"role": "user", "content": "南京到北京多远"}],
     response_format=Distance,
 )
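The body of the Distance model is elided from this diff. A hypothetical shape, only to make the snippet above self-contained (the real fields in the repo may differ):

# Hypothetical stand-in for the Distance model whose body is elided
# from this diff; the actual field names/types may differ.
from pydantic import BaseModel

class Distance(BaseModel):
    kilometers: float  # assumed field, for illustration only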
