
Commit 8f37f3e

fix vl nothink mode (#3776)
* fix vl nothink mode
* fix
1 parent 87ccab3 commit 8f37f3e

18 files changed: +155 additions, -46 deletions

lmdeploy/serve/vl_async_engine.py
Lines changed: 25 additions & 4 deletions

@@ -52,6 +52,7 @@ async def _get_prompt_input(self,
                                 sequence_start: bool,
                                 adapter_name: str,
                                 tools: Optional[List[object]] = None,
+                                enable_thinking: Optional[bool] = None,
                                 **kwargs):
         """Process messages and return the required data for the inference
         engines.
@@ -60,14 +61,24 @@ async def _get_prompt_input(self,
         the argument specification.
         """
         if isinstance(messages, str):
-            return await super()._get_prompt_input(messages, do_preprocess, sequence_start, adapter_name, tools,
+            return await super()._get_prompt_input(messages,
+                                                   do_preprocess,
+                                                   sequence_start,
+                                                   adapter_name,
+                                                   tools=tools,
+                                                   enable_thinking=enable_thinking,
                                                    **kwargs)
         elif isinstance(messages, List):
             has_multimodal_input = any(
                 isinstance(message['content'], list) and any(item['type'] in ['image_url', 'image_data']
                                                              for item in message['content']) for message in messages)
             if not has_multimodal_input:
-                return await super()._get_prompt_input(messages, do_preprocess, sequence_start, adapter_name, tools,
+                return await super()._get_prompt_input(messages,
+                                                       do_preprocess,
+                                                       sequence_start,
+                                                       adapter_name,
+                                                       tools,
+                                                       enable_thinking=enable_thinking,
                                                        **kwargs)
         else:
             raise RuntimeError(f'unsupported messages {messages}')
@@ -82,11 +93,21 @@ async def _get_prompt_input(self,
             # embedding_ranges and so on. All the returned values are passed
             # to tm engine for token generation
             results = await self.vl_encoder.async_infer(results)
-            results = await self.vl_encoder.wrap_for_turbomind(results, chat_template, self.tokenizer, sequence_start)
+            results = await self.vl_encoder.wrap_for_turbomind(results,
+                                                               chat_template,
+                                                               self.tokenizer,
+                                                               sequence_start,
+                                                               tools=tools,
+                                                               enable_thinking=enable_thinking)
         elif self.backend == 'pytorch':
             # for pt engine, this module only conduct the image preprocessing
             # It leaves the vision embedding to the pt engine
-            results = await self.vl_encoder.wrap_for_pytorch(results, chat_template, self.tokenizer, sequence_start)
+            results = await self.vl_encoder.wrap_for_pytorch(results,
+                                                             chat_template,
+                                                             self.tokenizer,
+                                                             sequence_start,
+                                                             tools=tools,
+                                                             enable_thinking=enable_thinking)
         return results

     @classmethod
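
Note: before this commit, the VL serving path did not forward tools or enable_thinking when building prompts for vision-language requests, which is the "nothink mode" problem the commit title refers to. The minimal, self-contained Python sketch below shows the same keyword-threading pattern in isolation; all class and function names in it are illustrative stand-ins, not lmdeploy's API, and the empty-think-block behavior is an assumption for the demo.

# Illustrative sketch only: hypothetical stand-ins, not lmdeploy's real API.
# It shows the pattern the diff applies: optional keyword flags threaded
# through every layer until they reach the chat template instead of being
# dropped along the way.
from typing import Dict, List, Optional


class ToyTemplate:

    def messages2prompt(self,
                        messages: List[Dict],
                        sequence_start: bool = True,
                        tools: Optional[List[object]] = None,
                        enable_thinking: Optional[bool] = None) -> str:
        prompt = '\n'.join(m['content'] for m in messages)
        if enable_thinking is False:
            # assumption for the demo: "nothink" mode pre-fills an empty
            # reasoning block; real templates differ per model
            prompt += '\n<think>\n\n</think>\n'
        return prompt


class ToyEncoder:

    def wrap(self,
             messages: List[Dict],
             template: ToyTemplate,
             sequence_start: bool = True,
             tools: Optional[List[object]] = None,
             enable_thinking: Optional[bool] = None) -> str:
        # forward the flags instead of dropping them: the essence of the fix
        return template.messages2prompt(messages,
                                        sequence_start,
                                        tools=tools,
                                        enable_thinking=enable_thinking)


def get_prompt_input(messages, template, encoder, tools=None, enable_thinking=None, **kwargs) -> str:
    return encoder.wrap(messages, template, tools=tools, enable_thinking=enable_thinking)


if __name__ == '__main__':
    msgs = [dict(role='user', content='describe the image')]
    print(get_prompt_input(msgs, ToyTemplate(), ToyEncoder(), enable_thinking=False))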

lmdeploy/vl/engine.py
Lines changed: 30 additions & 4 deletions

@@ -61,7 +61,15 @@ async def async_infer(self, messages: List[Dict]) -> List[Dict]:
         outputs = await future
         return outputs

-    async def wrap_for_pytorch(self, messages: List[Dict], chat_template, tokenizer, sequence_start) -> List[Dict]:
+    async def wrap_for_pytorch(
+        self,
+        messages: List[Dict],
+        chat_template,
+        tokenizer,
+        sequence_start,
+        tools: Optional[List[object]] = None,
+        enable_thinking: Optional[bool] = None,
+    ) -> List[Dict]:
         """
         Args:
             messages (List[Dict]): a list of message, which is supposed to be
@@ -78,14 +86,27 @@ async def wrap_for_pytorch(self, messages: List[Dict], chat_template, tokenizer,
                 ]
             )
         """
-        result = self.model.to_pytorch(messages, chat_template, tokenizer, sequence_start)
+        result = self.model.to_pytorch(messages,
+                                       chat_template,
+                                       tokenizer,
+                                       sequence_start,
+                                       tools=tools,
+                                       enable_thinking=enable_thinking)
         # clear data
         for i, message in enumerate(messages):
             if isinstance(message['content'], List):
                 messages[i]['preprocess'] = None
         return result

-    async def wrap_for_turbomind(self, messages: List[Dict], chat_template, tokenizer, sequence_start) -> Dict:
+    async def wrap_for_turbomind(
+        self,
+        messages: List[Dict],
+        chat_template,
+        tokenizer,
+        sequence_start,
+        tools: Optional[List[object]] = None,
+        enable_thinking: Optional[bool] = None,
+    ) -> Dict:
         """
         Args:
             messages (List[Dict]): a list of message, which is supposed to be
@@ -100,7 +121,12 @@ async def wrap_for_turbomind(self, messages: List[Dict], chat_template, tokenize
                 'input_embedding_ranges': list[torch.Tensor],
                 ...
         """
-        result = self.model.to_turbomind(messages, chat_template, tokenizer, sequence_start)
+        result = self.model.to_turbomind(messages,
+                                         chat_template,
+                                         tokenizer,
+                                         sequence_start,
+                                         tools=tools,
+                                         enable_thinking=enable_thinking)
         # clear data
         for i, message in enumerate(messages):
             if isinstance(message['content'], List):

lmdeploy/vl/model/base.py
Lines changed: 2 additions & 2 deletions

@@ -119,7 +119,7 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
         if self.backend == 'turbomind':
             raise NotImplementedError()

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         """Pack the preprocessing results in a format compatible with what is
         required by pytorch engine. ONLY implement it when the backend is
         pytorch engine.
@@ -133,7 +133,7 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         if self.backend == 'pytorch':
             raise NotImplementedError()

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
+    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         """Pack the forwarding results in a format compatible with what is
         required by turbomind engine. ONLY implement it when the backend is
         turbomind engine.
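
Note: the base class now takes **kwargs so the VL engine can always pass tools and enable_thinking, while adapters that do not use them simply absorb the extras, as the one-line changes to cogvlm, deepseek, deepseek_vl2, gemma3_vl and glm_4v below show. A toy sketch of that contract follows; the class name and returned dict layout are assumptions, not part of lmdeploy.

# Illustrative sketch, not a real lmdeploy adapter: it only shows the
# **kwargs contract introduced in base.py. Anything not in the diff
# (the class name, the returned dict layout) is assumed.
from typing import Dict, List


class MyVLModel:
    """A toy adapter that ignores the new keyword arguments."""

    @staticmethod
    def proc_messages(messages: List[Dict], chat_template, sequence_start: bool):
        prompt = chat_template.messages2prompt(messages, sequence_start)
        return prompt, '<IMAGE_TOKEN>'

    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
        # tools / enable_thinking arrive via **kwargs and are simply absorbed,
        # mirroring the cogvlm / deepseek / gemma3 adapters in this commit
        prompt, image_token = self.proc_messages(messages, chat_template, sequence_start)
        return dict(prompt=prompt, image_token=image_token)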

lmdeploy/vl/model/cogvlm.py
Lines changed: 1 addition & 1 deletion

@@ -85,6 +85,6 @@ def proc_messages(messages, chat_template, sequence_start):
             prompt += prompt_i
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/deepseek.py
Lines changed: 2 additions & 2 deletions

@@ -164,10 +164,10 @@ def proc_messages(messages, chat_template, sequence_start):
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
+    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/deepseek_vl2.py
Lines changed: 2 additions & 2 deletions

@@ -159,10 +159,10 @@ def proc_messages(messages, chat_template, sequence_start):
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
+    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/gemma3_vl.py
Lines changed: 2 additions & 2 deletions

@@ -123,10 +123,10 @@ def proc_messages(messages, chat_template, sequence_start):
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
+    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/glm_4v.py
Lines changed: 1 addition & 1 deletion

@@ -86,6 +86,6 @@ def proc_messages(messages, chat_template, sequence_start):
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/internvl.py
Lines changed: 38 additions & 7 deletions

@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List
+from typing import Dict, List, Optional

 import torch
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor
@@ -222,7 +222,13 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
         return messages

     @staticmethod
-    def proc_messages(messages, chat_template, sequence_start):
+    def proc_messages(
+        messages,
+        chat_template,
+        sequence_start,
+        tools: Optional[List[object]] = None,
+        enable_thinking: Optional[bool] = None,
+    ):
         """Apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -245,13 +251,38 @@ def proc_messages(messages, chat_template, sequence_start):
             else:
                 pass
             prompt_messages.append(dict(role='user', content=prompt))
-        prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
+        prompt = chat_template.messages2prompt(prompt_messages,
+                                               sequence_start,
+                                               tools=tools,
+                                               enable_thinking=enable_thinking)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
+    def to_pytorch(self,
+                   messages,
+                   chat_template,
+                   tokenizer,
+                   sequence_start,
+                   tools: Optional[List[object]] = None,
+                   enable_thinking: Optional[bool] = None,
+                   **kwargs):
+        prompt, IMAGE_TOKEN = self.proc_messages(messages,
+                                                 chat_template,
+                                                 sequence_start,
+                                                 tools=tools,
+                                                 enable_thinking=enable_thinking)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
+    def to_turbomind(self,
+                     messages,
+                     chat_template,
+                     tokenizer,
+                     sequence_start,
+                     tools: Optional[List[object]] = None,
+                     enable_thinking: Optional[bool] = None,
+                     **kwargs):
+        prompt, IMAGE_TOKEN = self.proc_messages(messages,
+                                                 chat_template,
+                                                 sequence_start,
+                                                 tools=tools,
+                                                 enable_thinking=enable_thinking)
         return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
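
Note: the InternVL adapter now forwards both flags to chat_template.messages2prompt, so the template decides what "nothink" means for the model. How a template reacts to enable_thinking is model specific; the sketch below is a hypothetical template, and its empty-think-block handling is an assumption, not a copy of lmdeploy's chat template code.

# Hypothetical template sketch: the enable_thinking=False branch is an
# assumption about how a "nothink" prompt might be rendered.
from typing import Dict, List, Optional


class ThinkingAwareTemplate:

    def messages2prompt(self,
                        messages: List[Dict],
                        sequence_start: bool = True,
                        tools: Optional[List[object]] = None,
                        enable_thinking: Optional[bool] = None) -> str:
        parts = [f"<|{m['role']}|>\n{m['content']}" for m in messages]
        parts.append('<|assistant|>\n')
        prompt = '\n'.join(parts)
        if enable_thinking is False:
            # pre-fill an empty reasoning block so the model skips thinking
            prompt += '<think>\n\n</think>\n\n'
        return prompt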

lmdeploy/vl/model/internvl3_hf.py
Lines changed: 37 additions & 6 deletions

@@ -147,7 +147,13 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
         return messages

     @staticmethod
-    def proc_messages(messages, chat_template, sequence_start):
+    def proc_messages(
+        messages,
+        chat_template,
+        sequence_start,
+        tools: Optional[List[object]] = None,
+        enable_thinking: Optional[bool] = None,
+    ):
         """Apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -170,13 +176,38 @@ def proc_messages(messages, chat_template, sequence_start):
             else:
                 pass
             prompt_messages.append(dict(role='user', content=prompt))
-        prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
+        prompt = chat_template.messages2prompt(prompt_messages,
+                                               sequence_start,
+                                               tools=tools,
+                                               enable_thinking=enable_thinking)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
+    def to_pytorch(self,
+                   messages,
+                   chat_template,
+                   tokenizer,
+                   sequence_start,
+                   tools: Optional[List[object]] = None,
+                   enable_thinking: Optional[bool] = None,
+                   **kwargs):
+        prompt, IMAGE_TOKEN = self.proc_messages(messages,
+                                                 chat_template,
+                                                 sequence_start,
+                                                 tools=tools,
+                                                 enable_thinking=enable_thinking)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
+    def to_turbomind(self,
+                     messages,
+                     chat_template,
+                     tokenizer,
+                     sequence_start,
+                     tools: Optional[List[object]] = None,
+                     enable_thinking: Optional[bool] = None,
+                     **kwargs):
+        prompt, IMAGE_TOKEN = self.proc_messages(messages,
+                                                 chat_template,
+                                                 sequence_start,
+                                                 tools=tools,
+                                                 enable_thinking=enable_thinking)
         return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
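
Note: end to end, the fix means a request that disables thinking now reaches the chat template even when the message carries images. The usage sketch below is an assumption-heavy illustration: it presumes an lmdeploy OpenAI-compatible server is running locally and that the serving layer forwards an enable_thinking field from the request body into the kwargs shown above; the field name, model name, and image URL are all placeholders, so check the serving docs before relying on them.

# Hedged usage sketch: server URL, model name, image URL and the
# 'enable_thinking' request field are assumptions for illustration.
from openai import OpenAI

client = OpenAI(base_url='http://localhost:23333/v1', api_key='none')

response = client.chat.completions.create(
    model='internvl',  # placeholder model name
    messages=[{
        'role': 'user',
        'content': [
            {'type': 'text', 'text': 'Describe this image.'},
            {'type': 'image_url', 'image_url': {'url': 'https://example.com/cat.png'}},
        ],
    }],
    extra_body={'enable_thinking': False},  # request "nothink" mode
)
print(response.choices[0].message.content)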
