 
 """Tokenization classes for InternLM."""
 import os
+import re
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 
 import sentencepiece as spm
 from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer
+from paddlenlp.transformers.tokenizer_utils_base import AddedToken, TextInput
+
 from paddlemix.utils.log import logger
 
-VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
 # VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
 
 # Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
@@ -38,18 +41,18 @@ class InternLM2Tokenizer(PretrainedTokenizer):
             Path to the vocabulary file.
     """
 
-    resource_files_names = VOCAB_FILES_NAMES # vocab_files_names in torch
-    pretrained_resource_files_map = {} # pretrained_vocab_files_map in torch
-    model_input_names = ['input_ids', 'attention_mask']
-    _auto_class = 'AutoTokenizer'
+    resource_files_names = VOCAB_FILES_NAMES  # vocab_files_names in torch
+    pretrained_resource_files_map = {}  # pretrained_vocab_files_map in torch
+    model_input_names = ["input_ids", "attention_mask"]
+    _auto_class = "AutoTokenizer"
 
     def __init__(
         self,
         vocab_file,
-        unk_token='<unk>',
-        bos_token='<s>',
-        eos_token='</s>',
-        pad_token='</s>',
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="</s>",
         sp_model_kwargs: Optional[Dict[str, Any]] = None,
         add_bos_token=True,
         add_eos_token=False,
@@ -78,7 +81,7 @@ def __init__(
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None:
             vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
-            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith('▁')}
+            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
         return self._no_prefix_space_tokens
 
     @property
@@ -115,20 +118,20 @@ def _convert_id_to_token(self, index):
 
     def _maybe_add_prefix_space(self, tokens, decoded):
         if tokens and tokens[0] not in self.no_prefix_space_tokens:
-            return ' ' + decoded
+            return " " + decoded
         else:
             return decoded
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         current_sub_tokens = []
-        out_string = ''
+        out_string = ""
         prev_is_special = False
         for token in tokens:
             # make sure that special tokens are not decoded using sentencepiece model
             if token in self.all_special_tokens:
                 if not prev_is_special:
-                    out_string += ' '
+                    out_string += " "
                 out_string += self.sp_model.decode(current_sub_tokens) + token
                 prev_is_special = True
                 current_sub_tokens = []
@@ -152,16 +155,16 @@ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None)
             `Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
-            logger.error(f'Vocabulary path ({save_directory}) should be a directory')
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
         out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
         )
 
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
         elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, 'wb') as fi:
+            with open(out_vocab_file, "wb") as fi:
                 content_spiece_model = self.sp_model.serialized_model_proto()
                 fi.write(content_spiece_model)
 
@@ -231,3 +234,77 @@ def create_token_type_ids_from_sequences(
         if token_ids_1 is None:
             return len(token_ids_0 + eos) * [0]
         return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+
+    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
+        """
+        Converts a string into a sequence of tokens, using the tokenizer.
+
+        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
+        (BPE/SentencePieces/WordPieces). Takes care of added tokens.
+
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            **kwargs (additional keyword arguments):
+                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
+
+        Returns:
+            `List[str]`: The list of tokens.
+        """
+        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
+
+        text, kwargs = self.prepare_for_tokenization(text, **kwargs)
+
+        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
+        all_special_tokens_extended = dict(
+            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
+        )
+
+        if hasattr(self, "do_lower_case") and self.do_lower_case:
+            # convert non-special tokens to lowercase. Might be super slow as well?
+            escaped_special_toks = [
+                re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
+            ]
+            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
+            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
+
+        if split_special_tokens:
+            no_split_token = []
+            tokens = [text]
+        else:
+            no_split_token = set(self.unique_no_split_tokens)  # don't split on any of the added tokens
+            # "This is something<special_token_1> else"
+            tokens = self.tokens_trie.split(text)
+
+        # ["This is something", "<special_token_1>", " else"]
+        for i, token in enumerate(tokens):
+            if token in no_split_token:
+                tok_extended = all_special_tokens_extended.get(token, None)
+                left = tokens[i - 1] if i > 0 else None
+                right = tokens[i + 1] if i < len(tokens) - 1 else None
+                if isinstance(tok_extended, AddedToken):
+                    if tok_extended.rstrip and right:
+                        # A bit counter-intuitive but we strip the left of the string
+                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
+                        tokens[i + 1] = right.lstrip()
+                    # Strip white spaces on the left
+                    if tok_extended.lstrip and left:
+                        tokens[i - 1] = left.rstrip()  # Opposite here
+                    if tok_extended.single_word and left and left[-1] != " ":
+                        tokens[i - 1] += token
+                        tokens[i] = ""
+                    elif tok_extended.single_word and right and right[0] != " ":
+                        tokens[i + 1] = token + tokens[i + 1]
+                        tokens[i] = ""
+        # ["This is something", "<special_token_1>", "else"]
+        tokenized_text = []
+        for token in tokens:
+            # Need to skip eventual empty (fully stripped) tokens
+            if not token:
+                continue
+            if token in no_split_token:
+                tokenized_text.append(token)
+            else:
+                tokenized_text.extend(self._tokenize(token))
+        # ["This", " is", " something", "<special_token_1>", "else"]
+        return tokenized_text
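
A minimal usage sketch for the new `tokenize` override (not part of the diff; the checkpoint path is an illustrative placeholder and the exact pieces depend on the loaded vocabulary):

    # Assumes a local InternLM2 checkpoint directory containing tokenizer.model.
    tokenizer = InternLM2Tokenizer.from_pretrained("path/to/internlm2-checkpoint")

    # Special tokens survive the trie split intact; ordinary text is segmented
    # by the underlying SentencePiece model.
    print(tokenizer.tokenize("Hello world</s>"))
    # e.g. ['▁Hello', '▁world', '</s>']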