Skip to content

Commit 3f8eebc

Browse files
committed
fix batch & add tests
1. add tests
2. fix batch inference
3. modify the tokenizer
4. add some comments

Signed-off-by: guochenxu <[email protected]>
1 parent 0288f55 commit 3f8eebc

File tree

8 files changed

+394
-519
lines changed

8 files changed

+394
-519
lines changed

src/transformers/models/minicpm_o_2_6/feature_extractor_minicpm_o_2_6.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def __call__(
3636
chunk_length: Optional[int] = 1,
3737
**kwargs,
3838
):
39+
# in batch inference, it may be [[]]
3940
if isinstance(audios, np.ndarray):
4041
audios_list = [[audios]]
4142
elif isinstance(audios[0], np.ndarray):

src/transformers/models/minicpm_o_2_6/image_processing_minicpm.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,13 @@ def preprocess(
259259
return_tensors: Optional[Union[str, TensorType]] = None,
260260
**kwargs,
261261
) -> MiniCPMOBatchFeature:
262-
images_list = make_nested_list_of_images(images)
262+
# in batch inference, it may be [[]], so we can't use `make_nested_list_of_images`
263+
if isinstance(images, Image.Image):
264+
images_list = [[images]]
265+
elif isinstance(images[0], Image.Image):
266+
images_list = [images]
267+
else:
268+
images_list = images
263269

264270
to_tensor = transforms.ToTensor()
265271
normalize_transform = transforms.Normalize(
@@ -308,7 +314,9 @@ def preprocess(
308314
(slice_image.shape[1] // self.patch_size, slice_image.shape[2] // self.patch_size))
309315
)
310316

311-
tgt_sizes = np.vstack(tgt_sizes)
317+
# in batch inference, it may be []
318+
if tgt_sizes:
319+
tgt_sizes = np.vstack(tgt_sizes)
312320

313321
new_images_list.append(new_images)
314322
image_sizes_list.append(image_sizes)

src/transformers/models/minicpm_o_2_6/modeling_minicpm_o_2_6.py

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,7 @@ def __init__(self, config):
551551
# feature_extractor = MiniCPM_o_2_6FeatureExtractor.from_pretrained(config._name_or_path)
552552
# self.processor = MiniCPM_o_2_6Processor(image_processor=image_processor, feature_extractor=feature_extractor, tokenizer=tokenizer)
553553

554-
self.terminators = ["<|im_end|>", "<|endoftext|>"]
554+
# self.terminators = ["<|im_end|>", "<|endoftext|>"]
555555

556556
self.force_no_stop = False
557557

@@ -1094,26 +1094,12 @@ def forward(
10941094
attentions=outputs.attentions,
10951095
)
10961096

1097-
def _decode(self, inputs_embeds, tokenizer, attention_mask, **kwargs):
1098-
terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
1099-
outputs = super().generate(
1100-
inputs_embeds=inputs_embeds,
1101-
pad_token_id=0,
1102-
eos_token_id=terminators,
1103-
attention_mask=attention_mask,
1104-
output_hidden_states=True,
1105-
return_dict_in_generate=True,
1106-
**kwargs,
1107-
)
1108-
return outputs
1109-
11101097
def _decode_stream(self, inputs_embeds, tokenizer, **kwargs):
1111-
terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
11121098
streamer = TextIteratorStreamer(tokenizer=tokenizer)
11131099
generation_kwargs = {
11141100
"inputs_embeds": inputs_embeds,
11151101
"pad_token_id": 0,
1116-
"eos_token_id": terminators,
1102+
"eos_token_id": tokenizer.terminator_ids,
11171103
"streamer": streamer,
11181104
}
11191105
generation_kwargs.update(kwargs)
@@ -1199,11 +1185,10 @@ def generate(
11991185
if stream:
12001186
result = self._decode_stream(model_inputs["inputs_embeds"], processor.tokenizer, **generation_config)
12011187
else:
1202-
terminators = [processor.tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
12031188
outputs = super().generate(
12041189
inputs_embeds=model_inputs["inputs_embeds"],
12051190
pad_token_id=0,
1206-
eos_token_id=terminators,
1191+
eos_token_id=processor.tokenizer.terminator_ids,
12071192
attention_mask=attention_mask,
12081193
output_hidden_states=True,
12091194
return_dict_in_generate=True,
@@ -1213,7 +1198,7 @@ def generate(
12131198
if stream:
12141199
def stream_gen():
12151200
for text in result:
1216-
for term in self.terminators:
1201+
for term in processor.tokenizer.terminators:
12171202
text = text.replace(term, "")
12181203
yield text
12191204

@@ -1226,8 +1211,7 @@ def stream_gen():
12261211
spk_embeds = wav_numpy = sr = None
12271212

12281213
if not batched and use_tts_template and generate_audio:
1229-
# todo 这个地方怎么处理,必须得decode一次
1230-
result = processor.decode_text(outputs.sequences, processor.tokenizer, self.terminators)
1214+
result = processor.decode_text(outputs.sequences, processor.tokenizer)
12311215
mel_spec = self._generate_mel_spec(model_inputs, outputs, result[0], tts_config={'top_p': 0.7, 'top_k': 20, 'repetition_penalty': 1.0}, force_no_stop=force_no_stop)
12321216
wav_numpy, sr = self.decode_mel_to_audio(mel_spec, kwargs.get('output_audio_path', None))
12331217

@@ -1486,7 +1470,6 @@ def streaming_generate(
14861470
self.llm_generate_completed = False
14871471
self.audio_past_key_values = None # apm kv cache
14881472

1489-
terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
14901473
generate_prompt = "<|im_end|>\n<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>"
14911474
input_ids = tokenizer(generate_prompt, return_tensors="pt", add_special_tokens=False)["input_ids"].cuda()
14921475

@@ -1500,7 +1483,7 @@ def streaming_generate(
15001483
attention_mask = torch.ones((1, cache_length + input_ids.shape[1]), dtype=torch.bool, device=self.device)
15011484

15021485
generation_config["max_new_tokens"] = max_new_tokens
1503-
streamer = self.llm_generate_chunk(input_ids, attention_mask, tokenizer, terminators, generation_config)
1486+
streamer = self.llm_generate_chunk(input_ids, attention_mask, tokenizer, tokenizer.terminator_ids, generation_config)
15041487

15051488
if generate_audio:
15061489
result = self._generate_mel_spec_audio_streaming(
@@ -1552,7 +1535,7 @@ def check_uncompleted_token(ids):
15521535
end = check_uncompleted_token(cur_ids[0])
15531536
left_ids = cur_ids[:, end:]
15541537
cur_ids = cur_ids[:, :end]
1555-
text = self.processor.decode_text(cur_ids, tokenizer, self.terminators)[0] if end > 0 else ""
1538+
text = self.processor.decode_text(cur_ids, tokenizer)[0] if end > 0 else ""
15561539

15571540
self.llm_past_key_values = outputs.past_key_values
15581541
input_ids = outputs.sequences[:, -1:]

src/transformers/models/minicpm_o_2_6/processing_minicpm_o_2_6.py

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -158,16 +158,11 @@ class MiniCPM_o_2_6Processor(ProcessorMixin):
158158
image_processor_class = "AutoImageProcessor"
159159
feature_extractor_class = "MiniCPM_o_2_6FeatureExtractor"
160160

161-
def __init__(self, tokenizer=None, image_processor=None, feature_extractor=None):
162-
super().__init__(tokenizer, image_processor, feature_extractor)
161+
def __init__(self, tokenizer=None, image_processor=None, feature_extractor=None, chat_template=None):
162+
super().__init__(tokenizer, image_processor,
163+
feature_extractor, chat_template=chat_template)
163164
self.version = image_processor.version
164165
self.default_tts_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"
165-
self.image_tag = "(<image>./</image>)"
166-
self.image_pattern = "\(<image>./</image>\)"
167-
self.audio_tag = "(<audio>./</audio>)"
168-
self.audio_pattern = "\(<audio>./</audio>\)"
169-
self.terminators = ["<|im_end|>", "<|endoftext|>"]
170-
self.split_pattern = f"({self.image_pattern}|{self.audio_pattern})"
171166

172167
def __call__(
173168
self,
@@ -221,18 +216,19 @@ def apply_chat_template(
221216
msgs,
222217
chunk_input=True,
223218
max_slice_nums=None,
224-
max_inp_length=32768,
219+
max_length=32768,
225220
omni_input=False,
226221
use_image_id=None,
227222
use_tts_template=False,
223+
**kwargs,
228224
):
229225
"""
230226
Unified chat function
231227
232228
Args:
233229
msgs: the input chat msgs, support text: (string) / image: (PIL.Image) / audio (numpy.ndarray)
234230
chunk_input: whether to split audio into 1s chunks
235-
max_inp_length: the maximum length of input
231+
max_length: the maximum length of input
236232
max_slice_nums: control the maximum number of image slices
237233
omni_input: determine whether it is omni mode
238234
use_image_id: for video understanding or omni understanding, use_image_id should be False
@@ -295,11 +291,9 @@ def apply_chat_template(
295291
chat_template=self.default_tts_chat_template if use_tts_template else None,
296292
)
297293
)
298-
if images:
299-
input_images_list.append(images)
300-
if audios:
301-
input_audios_list.append(audios)
302-
audio_parts_list.append(audio_parts)
294+
input_images_list.append(images)
295+
input_audios_list.append(audios)
296+
audio_parts_list.append(audio_parts)
303297

304298
inputs = self.__call__(
305299
prompts_lists,
@@ -310,13 +304,13 @@ def apply_chat_template(
310304
use_image_id=use_image_id,
311305
chunk_input=chunk_input,
312306
return_tensors="pt",
313-
max_length=max_inp_length,
307+
max_length=max_length,
314308
)
315309
return inputs
316310

317311
def decode(self, outputs, batched=False):
318312
result = self.decode_text(
319-
outputs.sequences, self.tokenizer, self.terminators)
313+
outputs.sequences, self.tokenizer)
320314
if not batched:
321315
result = result[0]
322316
if isinstance(result, list):
@@ -325,15 +319,22 @@ def decode(self, outputs, batched=False):
325319
result = result.replace(self.tokenizer.tts_end, "")
326320
return result
327321

328-
def decode_text(self, result_ids, tokenizer, terminators):
329-
terminators = [tokenizer.convert_tokens_to_ids(i) for i in terminators]
322+
def decode_text(self, result_ids, tokenizer):
330323
result_text = []
331324
for result in result_ids:
332325
result = result[result != 0]
333-
if result[0] == tokenizer.bos_id:
334-
result = result[1:]
335-
if result[-1] in terminators:
336-
result = result[:-1]
326+
start, end = 0, len(result)
327+
for i, tok in enumerate(result):
328+
if tok == tokenizer.bos_id:
329+
start = i+1
330+
else:
331+
break
332+
for i in range(len(result)-1, -1, -1):
333+
if result[i] in tokenizer.terminator_ids:
334+
end = i
335+
else:
336+
break
337+
result = result[start:end]
337338
result_text.append(tokenizer.decode(result))
338339
return result_text
339340

@@ -509,10 +510,10 @@ def _convert_omni_to_inputs(
509510
spk_bounds_list = []
510511

511512
for index, text in enumerate(texts):
512-
text_chunks = re.split(self.split_pattern, text)
513+
text_chunks = re.split(self.tokenizer.split_pattern, text)
513514

514-
image_tags = re.findall(self.image_pattern, text)
515-
audio_tags = re.findall(self.audio_pattern, text)
515+
image_tags = re.findall(self.tokenizer.image_pattern, text)
516+
audio_tags = re.findall(self.tokenizer.audio_pattern, text)
516517

517518
if image_tags:
518519
assert images is not None
@@ -524,13 +525,13 @@ def _convert_omni_to_inputs(
524525
image_id = 0
525526
audio_id = 0
526527
for i, chunk in enumerate(text_chunks):
527-
if chunk == self.image_tag:
528+
if chunk == self.tokenizer.image_tag:
528529
image_placeholder = self.image_processor.get_slice_image_placeholder(
529530
self.tokenizer, image_sizes[index][image_id], image_id, max_slice_nums, use_image_id
530531
)
531532
image_id += 1
532533
text_chunks[i] = image_placeholder
533-
elif chunk == self.audio_tag:
534+
elif chunk == self.tokenizer.audio_tag:
534535
audio_placeholder = audio_phs[index][audio_id]
535536
audio_id += 1
536537
text_chunks[i] = audio_placeholder

src/transformers/models/minicpm_o_2_6/tokenization_minicpm_o_2_6_fast.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from transformers import Qwen2TokenizerFast
1717

18+
1819
class MiniCPM_o_2_6TokenizerFast(Qwen2TokenizerFast):
1920
def __init__(self, **kwargs):
2021
super().__init__(**kwargs)
@@ -31,6 +32,8 @@ def __init__(self, **kwargs):
3132
self.slice_end = "</slice>"
3233
self.im_id_start = "<image_id>"
3334
self.im_id_end = "</image_id>"
35+
self.image_tag = f"({self.im_start}./{self.im_end})"
36+
self.image_pattern = "\(<image>./</image>\)"
3437

3538
# audio
3639
self.audio_start = "<|audio_start|>"
@@ -40,6 +43,12 @@ def __init__(self, **kwargs):
4043
self.tts_start = "<|tts_bos|>"
4144
self.tts_end = "<|tts_eos|>"
4245
self.unk_token = "<unk>"
46+
self.audio_tag = "(<audio>./</audio>)"
47+
self.audio_pattern = "\(<audio>./</audio>\)"
48+
49+
self.split_pattern = f"({self.image_pattern}|{self.audio_pattern})"
50+
51+
self.terminator_tokens = ["<|im_end|>", "<|endoftext|>", self.tts_end]
4352

4453
@property
4554
def eos_id(self):
@@ -53,6 +62,10 @@ def bos_id(self):
5362
def unk_id(self):
5463
return self.unk_token_id
5564

65+
@property
66+
def terminators(self):
67+
return self.terminator_tokens
68+
5669
@property
5770
def im_start_id(self):
5871
return self.convert_tokens_to_ids(self.im_start)
@@ -101,6 +114,10 @@ def tts_start_id(self):
101114
def tts_end_id(self):
102115
return self.convert_tokens_to_ids(self.tts_end)
103116

117+
@property
118+
def terminator_ids(self):
119+
return [self.convert_tokens_to_ids(t) for t in self.terminator_tokens]
120+
104121
@staticmethod
105122
def escape(text: str) -> str:
106123
return text

tests/models/minicpm_o_2_6/test.py

Lines changed: 0 additions & 47 deletions
This file was deleted.

0 commit comments

Comments (0)