
Commit 8f37f3e

fix vl nothink mode (#3776)
* fix vl nothink mode
* fix
1 parent 87ccab3 commit 8f37f3e

18 files changed: +155 additions, -46 deletions

lmdeploy/serve/vl_async_engine.py
Lines changed: 25 additions & 4 deletions

@@ -52,6 +52,7 @@ async def _get_prompt_input(self,
                                 sequence_start: bool,
                                 adapter_name: str,
                                 tools: Optional[List[object]] = None,
+                                enable_thinking: Optional[bool] = None,
                                 **kwargs):
         """Process messages and return the required data for the inference
         engines.
@@ -60,14 +61,24 @@ async def _get_prompt_input(self,
         the argument specification.
         """
         if isinstance(messages, str):
-            return await super()._get_prompt_input(messages, do_preprocess, sequence_start, adapter_name, tools,
+            return await super()._get_prompt_input(messages,
+                                                   do_preprocess,
+                                                   sequence_start,
+                                                   adapter_name,
+                                                   tools=tools,
+                                                   enable_thinking=enable_thinking,
                                                    **kwargs)
         elif isinstance(messages, List):
             has_multimodal_input = any(
                 isinstance(message['content'], list) and any(item['type'] in ['image_url', 'image_data']
                                                              for item in message['content']) for message in messages)
             if not has_multimodal_input:
-                return await super()._get_prompt_input(messages, do_preprocess, sequence_start, adapter_name, tools,
+                return await super()._get_prompt_input(messages,
+                                                       do_preprocess,
+                                                       sequence_start,
+                                                       adapter_name,
+                                                       tools,
+                                                       enable_thinking=enable_thinking,
                                                        **kwargs)
         else:
             raise RuntimeError(f'unsupported messages {messages}')
@@ -82,11 +93,21 @@ async def _get_prompt_input(self,
             # embedding_ranges and so on. All the returned values are passed
             # to tm engine for token generation
             results = await self.vl_encoder.async_infer(results)
-            results = await self.vl_encoder.wrap_for_turbomind(results, chat_template, self.tokenizer, sequence_start)
+            results = await self.vl_encoder.wrap_for_turbomind(results,
+                                                               chat_template,
+                                                               self.tokenizer,
+                                                               sequence_start,
+                                                               tools=tools,
+                                                               enable_thinking=enable_thinking)
         elif self.backend == 'pytorch':
             # for pt engine, this module only conduct the image preprocessing
             # It leaves the vision embedding to the pt engine
-            results = await self.vl_encoder.wrap_for_pytorch(results, chat_template, self.tokenizer, sequence_start)
+            results = await self.vl_encoder.wrap_for_pytorch(results,
+                                                             chat_template,
+                                                             self.tokenizer,
+                                                             sequence_start,
+                                                             tools=tools,
+                                                             enable_thinking=enable_thinking)
         return results

     @classmethod
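
Note: before this commit, the VL serving path did not forward tools or enable_thinking when building prompts for vision-language requests, which is the "nothink mode" problem the commit title refers to. The minimal, self-contained Python sketch below shows the same keyword-threading pattern in isolation; all class and function names in it are illustrative stand-ins, not lmdeploy's API, and the empty-think-block behavior is an assumption for the demo.

# Illustrative sketch only: hypothetical stand-ins, not lmdeploy's real API.
# It shows the pattern the diff applies: optional keyword flags threaded
# through every layer until they reach the chat template instead of being
# dropped along the way.
from typing import Dict, List, Optional


class ToyTemplate:

    def messages2prompt(self,
                        messages: List[Dict],
                        sequence_start: bool = True,
                        tools: Optional[List[object]] = None,
                        enable_thinking: Optional[bool] = None) -> str:
        prompt = '\n'.join(m['content'] for m in messages)
        if enable_thinking is False:
            # assumption for the demo: "nothink" mode pre-fills an empty
            # reasoning block; real templates differ per model
            prompt += '\n<think>\n\n</think>\n'
        return prompt


class ToyEncoder:

    def wrap(self,
             messages: List[Dict],
             template: ToyTemplate,
             sequence_start: bool = True,
             tools: Optional[List[object]] = None,
             enable_thinking: Optional[bool] = None) -> str:
        # forward the flags instead of dropping them: the essence of the fix
        return template.messages2prompt(messages,
                                        sequence_start,
                                        tools=tools,
                                        enable_thinking=enable_thinking)


def get_prompt_input(messages, template, encoder, tools=None, enable_thinking=None, **kwargs) -> str:
    return encoder.wrap(messages, template, tools=tools, enable_thinking=enable_thinking)


if __name__ == '__main__':
    msgs = [dict(role='user', content='describe the image')]
    print(get_prompt_input(msgs, ToyTemplate(), ToyEncoder(), enable_thinking=False))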

lmdeploy/vl/engine.py
Lines changed: 30 additions & 4 deletions

@@ -61,7 +61,15 @@ async def async_infer(self, messages: List[Dict]) -> List[Dict]:
         outputs = await future
         return outputs

-    async def wrap_for_pytorch(self, messages: List[Dict], chat_template, tokenizer, sequence_start) -> List[Dict]:
+    async def wrap_for_pytorch(
+        self,
+        messages: List[Dict],
+        chat_template,
+        tokenizer,
+        sequence_start,
+        tools: Optional[List[object]] = None,
+        enable_thinking: Optional[bool] = None,
+    ) -> List[Dict]:
         """
         Args:
             messages (List[Dict]): a list of message, which is supposed to be
@@ -78,14 +86,27 @@ async def wrap_for_pytorch(self, messages: List[Dict], chat_template, tokenizer,
                 ]
             )
         """
-        result = self.model.to_pytorch(messages, chat_template, tokenizer, sequence_start)
+        result = self.model.to_pytorch(messages,
+                                       chat_template,
+                                       tokenizer,
+                                       sequence_start,
+                                       tools=tools,
+                                       enable_thinking=enable_thinking)
         # clear data
         for i, message in enumerate(messages):
             if isinstance(message['content'], List):
                 messages[i]['preprocess'] = None
         return result

-    async def wrap_for_turbomind(self, messages: List[Dict], chat_template, tokenizer, sequence_start) -> Dict:
+    async def wrap_for_turbomind(
+        self,
+        messages: List[Dict],
+        chat_template,
+        tokenizer,
+        sequence_start,
+        tools: Optional[List[object]] = None,
+        enable_thinking: Optional[bool] = None,
+    ) -> Dict:
         """
         Args:
             messages (List[Dict]): a list of message, which is supposed to be
@@ -100,7 +121,12 @@ async def wrap_for_turbomind(self, messages: List[Dict], chat_template, tokenize
                 'input_embedding_ranges': list[torch.Tensor],
                 ...
         """
-        result = self.model.to_turbomind(messages, chat_template, tokenizer, sequence_start)
+        result = self.model.to_turbomind(messages,
+                                         chat_template,
+                                         tokenizer,
+                                         sequence_start,
+                                         tools=tools,
+                                         enable_thinking=enable_thinking)
         # clear data
         for i, message in enumerate(messages):
             if isinstance(message['content'], List):

lmdeploy/vl/model/base.py
Lines changed: 2 additions & 2 deletions

@@ -119,7 +119,7 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
         if self.backend == 'turbomind':
             raise NotImplementedError()

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         """Pack the preprocessing results in a format compatible with what is
         required by pytorch engine. ONLY implement it when the backend is
         pytorch engine.
@@ -133,7 +133,7 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         if self.backend == 'pytorch':
             raise NotImplementedError()

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
+    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         """Pack the forwarding results in a format compatible with what is
         required by turbomind engine. ONLY implement it when the backend is
         turbomind engine.
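
Note: the base class now takes **kwargs so the VL engine can always pass tools and enable_thinking, while adapters that do not use them simply absorb the extras, as the one-line changes to cogvlm, deepseek, deepseek_vl2, gemma3_vl and glm_4v below show. A toy sketch of that contract follows; the class name and returned dict layout are assumptions, not part of lmdeploy.

# Illustrative sketch, not a real lmdeploy adapter: it only shows the
# **kwargs contract introduced in base.py. Anything not in the diff
# (the class name, the returned dict layout) is assumed.
from typing import Dict, List


class MyVLModel:
    """A toy adapter that ignores the new keyword arguments."""

    @staticmethod
    def proc_messages(messages: List[Dict], chat_template, sequence_start: bool):
        prompt = chat_template.messages2prompt(messages, sequence_start)
        return prompt, '<IMAGE_TOKEN>'

    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
        # tools / enable_thinking arrive via **kwargs and are simply absorbed,
        # mirroring the cogvlm / deepseek / gemma3 adapters in this commit
        prompt, image_token = self.proc_messages(messages, chat_template, sequence_start)
        return dict(prompt=prompt, image_token=image_token)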

lmdeploy/vl/model/cogvlm.py
Lines changed: 1 addition & 1 deletion

@@ -85,6 +85,6 @@ def proc_messages(messages, chat_template, sequence_start):
             prompt += prompt_i
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/deepseek.py
Lines changed: 2 additions & 2 deletions

@@ -164,10 +164,10 @@ def proc_messages(messages, chat_template, sequence_start):
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
+    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/deepseek_vl2.py
Lines changed: 2 additions & 2 deletions

@@ -159,10 +159,10 @@ def proc_messages(messages, chat_template, sequence_start):
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
+    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/gemma3_vl.py
Lines changed: 2 additions & 2 deletions

@@ -123,10 +123,10 @@ def proc_messages(messages, chat_template, sequence_start):
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
+    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/glm_4v.py
Lines changed: 1 addition & 1 deletion

@@ -86,6 +86,6 @@ def proc_messages(messages, chat_template, sequence_start):
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/internvl.py
Lines changed: 38 additions & 7 deletions

@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List
+from typing import Dict, List, Optional

 import torch
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor
@@ -222,7 +222,13 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
         return messages

     @staticmethod
-    def proc_messages(messages, chat_template, sequence_start):
+    def proc_messages(
+        messages,
+        chat_template,
+        sequence_start,
+        tools: Optional[List[object]] = None,
+        enable_thinking: Optional[bool] = None,
+    ):
         """Apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -245,13 +251,38 @@ def proc_messages(messages, chat_template, sequence_start):
             else:
                 pass
             prompt_messages.append(dict(role='user', content=prompt))
-        prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
+        prompt = chat_template.messages2prompt(prompt_messages,
+                                               sequence_start,
+                                               tools=tools,
+                                               enable_thinking=enable_thinking)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
+    def to_pytorch(self,
+                   messages,
+                   chat_template,
+                   tokenizer,
+                   sequence_start,
+                   tools: Optional[List[object]] = None,
+                   enable_thinking: Optional[bool] = None,
+                   **kwargs):
+        prompt, IMAGE_TOKEN = self.proc_messages(messages,
+                                                 chat_template,
+                                                 sequence_start,
+                                                 tools=tools,
+                                                 enable_thinking=enable_thinking)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
+    def to_turbomind(self,
+                     messages,
+                     chat_template,
+                     tokenizer,
+                     sequence_start,
+                     tools: Optional[List[object]] = None,
+                     enable_thinking: Optional[bool] = None,
+                     **kwargs):
+        prompt, IMAGE_TOKEN = self.proc_messages(messages,
+                                                 chat_template,
+                                                 sequence_start,
+                                                 tools=tools,
+                                                 enable_thinking=enable_thinking)
         return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
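
Note: the InternVL adapter now forwards both flags to chat_template.messages2prompt, so the template decides what "nothink" means for the model. How a template reacts to enable_thinking is model specific; the sketch below is a hypothetical template, and its empty-think-block handling is an assumption, not a copy of lmdeploy's chat template code.

# Hypothetical template sketch: the enable_thinking=False branch is an
# assumption about how a "nothink" prompt might be rendered.
from typing import Dict, List, Optional


class ThinkingAwareTemplate:

    def messages2prompt(self,
                        messages: List[Dict],
                        sequence_start: bool = True,
                        tools: Optional[List[object]] = None,
                        enable_thinking: Optional[bool] = None) -> str:
        parts = [f"<|{m['role']}|>\n{m['content']}" for m in messages]
        parts.append('<|assistant|>\n')
        prompt = '\n'.join(parts)
        if enable_thinking is False:
            # pre-fill an empty reasoning block so the model skips thinking
            prompt += '<think>\n\n</think>\n\n'
        return prompt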

lmdeploy/vl/model/internvl3_hf.py
Lines changed: 37 additions & 6 deletions

@@ -147,7 +147,13 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
         return messages

     @staticmethod
-    def proc_messages(messages, chat_template, sequence_start):
+    def proc_messages(
+        messages,
+        chat_template,
+        sequence_start,
+        tools: Optional[List[object]] = None,
+        enable_thinking: Optional[bool] = None,
+    ):
         """Apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -170,13 +176,38 @@ def proc_messages(messages, chat_template, sequence_start):
             else:
                 pass
             prompt_messages.append(dict(role='user', content=prompt))
-        prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
+        prompt = chat_template.messages2prompt(prompt_messages,
+                                               sequence_start,
+                                               tools=tools,
+                                               enable_thinking=enable_thinking)
         return prompt, IMAGE_TOKEN

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
+    def to_pytorch(self,
+                   messages,
+                   chat_template,
+                   tokenizer,
+                   sequence_start,
+                   tools: Optional[List[object]] = None,
+                   enable_thinking: Optional[bool] = None,
+                   **kwargs):
+        prompt, IMAGE_TOKEN = self.proc_messages(messages,
+                                                 chat_template,
+                                                 sequence_start,
+                                                 tools=tools,
+                                                 enable_thinking=enable_thinking)
         return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
+    def to_turbomind(self,
+                     messages,
+                     chat_template,
+                     tokenizer,
+                     sequence_start,
+                     tools: Optional[List[object]] = None,
+                     enable_thinking: Optional[bool] = None,
+                     **kwargs):
+        prompt, IMAGE_TOKEN = self.proc_messages(messages,
+                                                 chat_template,
+                                                 sequence_start,
+                                                 tools=tools,
+                                                 enable_thinking=enable_thinking)
         return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
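
Note: end to end, the fix means a request that disables thinking now reaches the chat template even when the message carries images. The usage sketch below is an assumption-heavy illustration: it presumes an lmdeploy OpenAI-compatible server is running locally and that the serving layer forwards an enable_thinking field from the request body into the kwargs shown above; the field name, model name, and image URL are all placeholders, so check the serving docs before relying on them.

# Hedged usage sketch: server URL, model name, image URL and the
# 'enable_thinking' request field are assumptions for illustration.
from openai import OpenAI

client = OpenAI(base_url='http://localhost:23333/v1', api_key='none')

response = client.chat.completions.create(
    model='internvl',  # placeholder model name
    messages=[{
        'role': 'user',
        'content': [
            {'type': 'text', 'text': 'Describe this image.'},
            {'type': 'image_url', 'image_url': {'url': 'https://example.com/cat.png'}},
        ],
    }],
    extra_body={'enable_thinking': False},  # request "nothink" mode
)
print(response.choices[0].message.content)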
