Commit 0335623

lmdeploy==0.7.3 vllm==0.8.4
1 parent fcf210f

File tree: 7 files changed, +491 −124 lines changed

gpt_server/model_backend/vllm_backend.py

Lines changed: 0 additions & 1 deletion

@@ -6,7 +6,6 @@
 from fastchat.utils import is_partial_stop
 from gpt_server.model_backend.base import ModelBackend
 from loguru import logger
-import vllm
 from lmdeploy.serve.openai.reasoning_parser import ReasoningParserManager
 from vllm.lora.request import LoRARequest
 from transformers import AutoTokenizer

gpt_server/model_worker/internvl.py

Lines changed: 12 additions & 20 deletions

@@ -37,33 +37,25 @@ def __init__(
         ]
         logger.info(f"{model_names[0]} 停用词: {self.stop}")
         # from https://github.com/xorbitsai/inference/blob/c70ea74fa820a613f8d577047ef1818da20a96b3/xinference/model/llm/llm_family_modelscope.json
-        self.vl_chat_template = (
-            "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
-        )
+        self.vl_chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"

     async def generate_stream_gate(self, params):
         self.call_ct += 1
         logger.info(f"params {params}")
         logger.info(f"worker_id: {self.worker_id}")
         try:
             messages = params.get("messages", [])
-            if not self.vision_config:
-                if isinstance(messages, list):
-                    pass
-                elif isinstance(messages, str):
-                    text = messages
-
-            else:  # multimodal
-                if isinstance(messages, list):
-                    text = self.tokenizer.apply_chat_template(
-                        messages,
-                        chat_template=self.vl_chat_template,
-                        tokenize=False,
-                        add_generation_prompt=True,
-                    )
-                params["prompt"] = text
-                # multimodal requests do not need input_ids
-                params["multimodal"] = True
+            # always multimodal
+            if isinstance(messages, list):
+                text = self.tokenizer.apply_chat_template(
+                    messages,
+                    chat_template=self.vl_chat_template,
+                    tokenize=False,
+                    add_generation_prompt=True,
+                )
+            params["prompt"] = text
+            # multimodal requests do not need input_ids
+            params["multimodal"] = True
             params["messages"] = messages
             params["stop"].extend(self.stop)
             params["stop_words_ids"] = self.stop_words_ids

pyproject.toml

Lines changed: 4 additions & 3 deletions

@@ -1,6 +1,6 @@
 [project]
 name = "gpt_server"
-version = "0.4.0"
+version = "0.4.1"
 description = "gpt_server是一个用于生产级部署LLMs或Embedding的开源框架。"
 readme = "README.md"
 license = { text = "Apache 2.0" }
@@ -12,14 +12,14 @@ dependencies = [
     "ffmpy",
     "fschat==0.2.36",
     "infinity-emb[all]==0.0.73",
-    "lmdeploy==0.7.2.post1",
+    "lmdeploy==0.7.3",
     "loguru>=0.7.2",
     "openai==1.55.3",
     "setuptools==75.2.0",
     "streamlit==1.39.0",
     "torch==2.5.1",
     "torchvision==0.20.1",
-    "vllm==0.8.3",
+    "vllm==0.8.4",
     "qwen_vl_utils",
     "evalscope[perf]==0.10.1",
     "modelscope==1.20.1",
@@ -39,6 +39,7 @@ override-dependencies = [
     "outlines==0.1.11",
     "transformers==4.50.0",
     "soundfile==0.13.1", # infinity
+    "xgrammar==0.1.18", # sglang[all]==0.4.5 depends on xgrammar==0.1.17
 ]

 [project.scripts]
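A quick post-upgrade sanity check (a minimal sketch, not part of this commit; it assumes both packages expose a standard __version__ attribute):

# Sanity-check sketch: confirm the bumped pins actually resolved in the
# active environment. Assumes lmdeploy and vllm expose __version__.
import lmdeploy
import vllm

assert lmdeploy.__version__ == "0.7.3", f"got {lmdeploy.__version__}"
assert vllm.__version__ == "0.8.4", f"got {vllm.__version__}"
print("lmdeploy:", lmdeploy.__version__, "| vllm:", vllm.__version__)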

requirements.txt

Lines changed: 62 additions & 12 deletions

@@ -115,7 +115,7 @@ coloredlogs==15.0.1
     # via onnxruntime
 colpali-engine==0.3.9
     # via infinity-emb
-compressed-tensors==0.9.2
+compressed-tensors==0.9.3
     # via
     #   sglang
     #   vllm
@@ -148,6 +148,12 @@ decorator==5.2.1
     #   librosa
 decord==0.6.0
     # via sglang
+deprecated==1.2.18
+    # via
+    #   opentelemetry-api
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+    #   opentelemetry-semantic-conventions
 depyf==0.18.0
     # via vllm
 dill==0.3.8
@@ -240,14 +246,20 @@ fsspec==2024.6.1
     #   torch
 funasr==1.2.6
     # via gpt-server (pyproject.toml)
-gguf==0.10.0
+gguf==0.14.0
     # via vllm
 gitdb==4.0.12
     # via gitpython
 gitpython==3.1.44
     # via streamlit
+googleapis-common-protos==1.70.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
 gputil==1.4.0
     # via colpali-engine
+grpcio==1.71.0
+    # via opentelemetry-exporter-otlp-proto-grpc
 h11==0.14.0
     # via
     #   httpcore
@@ -294,9 +306,10 @@ idna==3.10
     #   httpx
     #   requests
     #   yarl
-importlib-metadata==8.6.1
+importlib-metadata==8.0.0
     # via
     #   litellm
+    #   opentelemetry-api
     #   vllm
 infinity-emb==0.0.73
     # via gpt-server (pyproject.toml)
@@ -376,7 +389,7 @@ llvmlite==0.44.0
     #   pynndescent
 lm-format-enforcer==0.10.11
     # via vllm
-lmdeploy==0.7.2.post1
+lmdeploy==0.7.3
     # via gpt-server (pyproject.toml)
 loguru==0.7.3
     # via gpt-server (pyproject.toml)
@@ -424,9 +437,7 @@ multiprocess==0.70.16
     # via
     #   datasets
     #   evaluate
-nanobind==2.6.1
-    # via xgrammar
-narwhals==1.34.1
+narwhals==1.35.0
     # via
     #   altair
     #   plotly
@@ -445,7 +456,7 @@ nltk==3.9.1
     # via
     #   evalscope
     #   rouge-score
-numba==0.61.0
+numba==0.61.2
     # via
     #   librosa
     #   pynndescent
@@ -558,6 +569,37 @@ opencv-python-headless==4.11.0.86
     # via
     #   mistral-common
     #   vllm
+opentelemetry-api==1.26.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+    #   opentelemetry-sdk
+    #   opentelemetry-semantic-conventions
+    #   vllm
+opentelemetry-exporter-otlp==1.26.0
+    # via vllm
+opentelemetry-exporter-otlp-proto-common==1.26.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+opentelemetry-exporter-otlp-proto-grpc==1.26.0
+    # via opentelemetry-exporter-otlp
+opentelemetry-exporter-otlp-proto-http==1.26.0
+    # via opentelemetry-exporter-otlp
+opentelemetry-proto==1.26.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-common
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+opentelemetry-sdk==1.26.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+    #   vllm
+opentelemetry-semantic-conventions==0.47b0
+    # via opentelemetry-sdk
+opentelemetry-semantic-conventions-ai==0.4.3
+    # via vllm
 optimum==1.24.0
     # via infinity-emb
 orjson==3.10.16
@@ -655,19 +697,21 @@ prometheus-fastapi-instrumentator==7.1.0
     # via
     #   infinity-emb
     #   vllm
-prompt-toolkit==3.0.50
+prompt-toolkit==3.0.51
     # via
     #   fschat
     #   ipython
 propcache==0.3.1
     # via
     #   aiohttp
     #   yarl
-protobuf==5.29.4
+protobuf==4.25.6
     # via
+    #   googleapis-common-protos
     #   lmdeploy
     #   onnx
     #   onnxruntime
+    #   opentelemetry-proto
     #   optimum
     #   ray
     #   streamlit
@@ -803,6 +847,7 @@ requests==2.32.3
     #   huggingface-hub
     #   mistral-common
     #   modelscope
+    #   opentelemetry-exporter-otlp-proto-http
     #   oss2
     #   outlines
     #   pooch
@@ -870,6 +915,7 @@ sentencepiece==0.2.0
     # via
     #   evalscope
     #   funasr
+    #   gguf
     #   lmdeploy
     #   mistral-common
     #   vllm
@@ -1081,6 +1127,7 @@ typing-extensions==4.13.2
     #   mistral-common
     #   multidict
     #   openai
+    #   opentelemetry-sdk
     #   outlines
     #   pydantic
     #   pydantic-core
@@ -1118,7 +1165,7 @@ uvloop==0.21.0
     # via
     #   sglang
     #   uvicorn
-vllm==0.8.3
+vllm==0.8.4
     # via gpt-server (pyproject.toml)
 watchdog==5.0.3
     # via streamlit
@@ -1132,10 +1179,13 @@ wcwidth==0.2.13
     # via prompt-toolkit
 websockets==15.0.1
     # via uvicorn
+wrapt==1.17.2
+    # via deprecated
 xformers==0.0.29.post2
     # via vllm
-xgrammar==0.1.17
+xgrammar==0.1.18
     # via
+    #   --override (workspace)
     #   sglang
     #   vllm
 xxhash==3.5.0

tests/test_chat_template.py

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+from transformers import AutoTokenizer
+
+url = "https://opencompass.oss-cn-shanghai.aliyuncs.com/image/compass-hub/botchat_banner.png"
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "请描述这个图片",
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": url,
+                },
+            },
+        ],
+    }
+]
+
+chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+tokenizer = AutoTokenizer.from_pretrained(
+    "/home/dev/model/IntervitensInc/InternVL3-38B-AWQ"
+)
+# chat_template = None
+prompt = tokenizer.apply_chat_template(
+    conversation=messages,
+    chat_template=chat_template,
+    tokenize=False,
+    add_generation_prompt=True,
+)
+
+print(prompt)
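For reference, with `add_generation_prompt=True` and `add_vision_id` unset (so no "Picture 1:" prefix), this template should render the example messages to roughly the following prompt (hand-traced from the template, so an expectation rather than captured output):

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
请描述这个图片<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant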

tests/test_openai_completion_response_format.py

Lines changed: 3 additions & 3 deletions

@@ -5,15 +5,15 @@
 client = OpenAI(api_key="EMPTY", base_url="http://localhost:8082/v1")
 # Method 1
 output = client.chat.completions.create(
-    model="qwen-3b",
+    model="qwen",
     messages=[{"role": "user", "content": "南京到北京多远"}],
     response_format={"type": "text"},
 )
 print(output.choices[0].message.content)
 print("-" * 100)
 # Method 2
 output = client.chat.completions.create(
-    model="qwen-3b",
+    model="qwen",
     messages=[
         {"role": "system", "content": "用json进行回答"},
         {"role": "user", "content": "南京到北京多远"},
@@ -31,7 +31,7 @@ class Distance(BaseModel):


 output = client.beta.chat.completions.parse(
-    model="qwen-3b",
+    model="qwen",
     messages=[{"role": "user", "content": "南京到北京多远"}],
     response_format=Distance,
 )
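The body of the Distance model is elided from this diff. A hypothetical shape, only to make the snippet above self-contained (the real fields in the repo may differ):

# Hypothetical stand-in for the Distance model whose body is elided
# from this diff; the actual field names/types may differ.
from pydantic import BaseModel

class Distance(BaseModel):
    kilometers: float  # assumed field, for illustration only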
