Skip to content

Commit 899f428

Browse files
committed
support interleaving text and images in messages
1 parent 1a859f4 commit 899f428

File tree

1 file changed

+21
-22
lines changed

1 file changed

+21
-22
lines changed

lmdeploy/vl/model/internvl3_hf.py

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,12 @@ def __init__(self,
4444
hf_config: AutoConfig = None,
4545
backend: str = ''):
4646
super().__init__(model_path, with_llm, max_memory, hf_config, backend)
47-
self.arch = hf_config.architectures[0]
47+
self.arch = self.hf_config.architectures[0]
4848

4949
def build_preprocessor(self):
5050
self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True)
5151
tokenizer = self.processor.tokenizer
52+
self.image_token = self.processor.image_token
5253
self.image_token_id = tokenizer.context_image_token_id
5354
self.image_tokens_per_patch = self.processor.image_seq_length
5455
self.tokenizer_init_kwargs = tokenizer.init_kwargs
@@ -146,8 +147,8 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
146147
messages.append(dict(role='forward', content=outputs))
147148
return messages
148149

149-
@staticmethod
150150
def proc_messages(
151+
self,
151152
messages,
152153
chat_template,
153154
sequence_start,
@@ -156,31 +157,29 @@ def proc_messages(
156157
):
157158
"""Apply chat template to get the prompt."""
158159
prompt_messages = []
159-
IMAGE_TOKEN = '<IMAGE_TOKEN>'
160160
for message in messages:
161-
if isinstance(message['content'], str):
162-
prompt_messages.append(message)
163-
continue
164-
elif message['role'] in ['preprocess', 'forward']:
161+
if message['role'] in ['preprocess', 'forward']:
165162
continue
166-
n_images = len([1 for x in message['content'] if x['type'] == 'image'])
167-
content = [x.get('text', '') for x in message['content'] if x['type'] == 'text']
168-
prompt = content[0]
169-
if IMAGE_TOKEN in prompt and f'<img>{IMAGE_TOKEN}' not in prompt:
170-
prompt = prompt.replace(f'{IMAGE_TOKEN}', f'<img>{IMAGE_TOKEN}</img>')
171-
prompt = prompt.replace('</img><img>', '')
172-
prompt = prompt.replace('<img><img>', '<img>')
173-
prompt = prompt.replace('</img></img>', '</img>')
174-
elif IMAGE_TOKEN not in prompt:
175-
prompt = f'<img>{IMAGE_TOKEN * n_images}</img>\n' + prompt
163+
role, content = message['role'], message['content']
164+
if role == 'user' and isinstance(content, List):
165+
_content = []
166+
for item in content:
167+
if item['type'] == 'text':
168+
_content.append(item['text'])
169+
elif item['type'] in ['image', 'image_url']:
170+
_content.append(self.image_token)
171+
else:
172+
raise ValueError(f'Unsupported message type: {item["type"]}')
173+
message = dict(role=role, content='\n'.join(_content))
174+
prompt_messages.append(message)
176175
else:
177-
pass
178-
prompt_messages.append(dict(role='user', content=prompt))
176+
prompt_messages.append(message)
177+
179178
prompt = chat_template.messages2prompt(prompt_messages,
180179
sequence_start,
181180
tools=tools,
182181
enable_thinking=enable_thinking)
183-
return prompt, IMAGE_TOKEN
182+
return prompt, self.image_token
184183

185184
def to_pytorch(self,
186185
messages,
@@ -190,12 +189,12 @@ def to_pytorch(self,
190189
tools: Optional[List[object]] = None,
191190
enable_thinking: Optional[bool] = None,
192191
**kwargs):
193-
prompt, IMAGE_TOKEN = self.proc_messages(messages,
192+
prompt, image_token = self.proc_messages(messages,
194193
chat_template,
195194
sequence_start,
196195
tools=tools,
197196
enable_thinking=enable_thinking)
198-
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
197+
return self.to_pytorch_aux(messages, prompt, image_token, tokenizer, sequence_start)
199198

200199
def to_turbomind(self,
201200
messages,

0 commit comments

Comments
 (0)