Fix mplug owl2, molmo (#2724)

Jintao-Huang · web-flow · commit 07e16a9b3ad3 · 2024-12-21T18:22:44.000+08:00
diff --git a/README.md b/README.md
@@ -49,7 +49,7 @@
 You can contact us and communicate with us by adding our group:
 
 
-[Discord Group](https://discord.com/invite/D27yfEFVz5)              |  微信群
+[Discord Group](https://discord.com/invite/D27yfEFVz5)              |  WeChat Group
 :-------------------------:|:-------------------------:
 <img src="asset/discord_qr.jpg" width="200" height="200">  |  <img src="asset/wechat.png" width="200" height="200">
 
diff --git a/swift/llm/infer/infer_engine/pt_engine.py b/swift/llm/infer/infer_engine/pt_engine.py
@@ -168,10 +168,10 @@ def _infer_stream(self,
         if generation_config.output_logits:
             generate_kwargs['logits_processor'] = LogitsProcessorList([LogitsStreamer()])
 
-        def _model_generate(*args, **kwargs):
+        def _model_generate(**kwargs):
             if is_torch_npu_available():
                 torch.npu.set_device(self.model.device)
-            self.model.generate(*args, **kwargs)
+            template.generate(self.model, **kwargs)
 
         generate_kwargs = template.prepare_generate_kwargs(generate_kwargs, model=self.model)
         thread = Thread(target=_model_generate, kwargs=generate_kwargs)
@@ -269,7 +269,7 @@ def _infer_full(self,
         num_prompt_tokens = self._get_num_tokens(inputs)
 
         generate_kwargs = template.prepare_generate_kwargs(generate_kwargs, model=self.model)
-        output = dict(self.model.generate(**generate_kwargs))
+        output = dict(template.generate(self.model, **generate_kwargs))
         output.pop('past_key_values', None)
         batched_generate_ids = output['sequences']
         batched_generate_ids = template.get_generate_ids(batched_generate_ids, num_prompt_tokens)
diff --git a/swift/llm/model/model/mllm.py b/swift/llm/model/model/mllm.py
@@ -104,20 +104,6 @@ def to_dict(self, *args, **kwargs):
     if model is not None:
         model.config._to_dict = model.config.to_dict
         model.config.to_dict = MethodType(to_dict, model.config)
-        from transformers import GenerationMixin
-        model.generate = MethodType(GenerationMixin.generate, model)
-
-    if model and hasattr(model, '_old_forward'):  # device_map
-        device = model.lm_head.weight.device
-        forward_origin = model._old_forward
-
-        def _forward(*args, **kwargs):
-            if kwargs.get('append_last_valid_logits') is not None:
-                kwargs['append_last_valid_logits'] = kwargs['append_last_valid_logits'].to(device)
-            return forward_origin(*args, **kwargs)
-
-        model._old_forward = _forward
-        model.forward_origin = forward_origin
 
     return model, processor
 
@@ -148,18 +134,8 @@ def get_model_tokenizer_molmo(model_dir: str,
     model_cls = get_class_from_dynamic_module('modeling_molmo.MolmoForCausalLM', model_dir)
     model_cls._no_split_modules = ['MolmoSequentialBlock']
     model, processor = get_model_tokenizer_multimodal(model_dir, model_info, model_kwargs, load_model, **kwargs)
-    if model:
-        device = next(model.model.transformer.ff_out.parameters()).device
-        forward_origin = model.model.forward
-
-        def _forward(*args, **kwargs):
-            if kwargs.get('append_last_valid_logits') is not None:
-                kwargs['append_last_valid_logits'] = kwargs['append_last_valid_logits'].to(device)
-            return forward_origin(*args, **kwargs)
-
-        model.model.forward = _forward
-        model.model.forward_origin = forward_origin
 
+    patch_output_clone(model.model.transformer.wte)
     return model, processor
 
 
diff --git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py
@@ -23,8 +23,10 @@ def get_model_tokenizer_qwen(model_dir: str,
                              model_info: ModelInfo,
                              model_kwargs: Dict[str, Any],
                              load_model: bool = True,
+                             model_config=None,
                              **kwargs):
-    model_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
+    if model_config is None:
+        model_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
     if model_config.torch_dtype is not None:
         k_true = dtype_mapping[model_config.torch_dtype]
         for k in dtype_mapping.values():
diff --git a/swift/llm/model/model_arch.py b/swift/llm/model/model_arch.py
@@ -488,7 +488,7 @@ def register_model_arch(model_arch: ModelKeys, *, exist_ok: bool = False) -> Non
         MLLMModelArch.molmo,
         language_model='model.transformer',
         vision_tower='model.vision_backbone',
-    ))
+        aligner='model.vision_backbone.image_projector'))
 
 register_model_arch(
     MultiModelKeys(
diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py
@@ -313,17 +313,17 @@ def _check_torch_dtype(torch_dtype: torch.dtype):
 
 def get_default_torch_dtype(torch_dtype: Optional[torch.dtype]):
     # torch_dtype: torch_dtype in config.json
+    if torch_dtype is not None:
+        return torch_dtype
+
     if is_torch_cuda_available() or is_torch_npu_available():
         if is_torch_bf16_gpu_available():
-            if torch_dtype in {torch.float16, torch.bfloat16}:
-                res = torch_dtype
-            else:
-                res = torch.bfloat16
+            return torch.bfloat16
         else:
-            res = torch.float16
+            return torch.float16
     else:
         # cpu
-        res = torch.float32
+        return torch.float32
     return res
 
 
diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py
@@ -232,6 +232,9 @@ def decode(self, generate_ids: List[int], is_finished: bool = True, tokenizer_kw
         tokenizer_kwargs = tokenizer_kwargs or {}
         return self._skip_stop_decode(generate_ids, is_finished, **tokenizer_kwargs)
 
+    def generate(self, model, *args, **kwargs):
+        return model.generate(*args, **kwargs)
+
     def _skip_stop_decode(self, generate_ids: List[int], is_finished: bool, **decode_kwargs) -> Any:
         # Do not print template_meta.suffix[-1] and eos_token.
         # However, other stop_words will be printed.
diff --git a/swift/llm/template/template/molmo.py b/swift/llm/template/template/molmo.py
@@ -1,168 +1,58 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Literal, Optional
 
 import torch
 
 from ..base import Template
 from ..constant import MLLMTemplateType
 from ..register import TemplateMeta, register_template
 from ..template_inputs import StdTemplateInputs
-from ..utils import findall
+from ..utils import Context, findall
 
 
 class MolmoTemplate(Template):
-    system = None
-    use_model = True
-    image_placeholder = ['<|image|>']
-    DEFAULT_IMAGE_PATCH_TOKEN = '<im_patch>'
-    DEFAULT_IM_START_TOKEN = '<im_start>'
-    DEFAULT_IM_END_TOKEN = '<im_end>'
-    DEFAULT_IM_COL_TOKEN = '<im_col>'
 
-    def __init__(self, *args, **kwargs):
-        Template.__init__(self, *args, **kwargs)
-        self.processor_kwargs = {
-            'images_kwargs': {
-                'max_crops': 12,
-                'overlap_margins': [4, 4],
-                'base_image_input_size': [336, 336],
-                'image_token_length_w': 12,
-                'image_token_length_h': 12,
-                'image_patch_size': 14,
-                'image_padding_mask': True,
-            },
-            'text_kwargs': {
-                'style': 'long_caption',
-                'system_prompt': 'none',
-                'message_format': 'role',
-                'always_start_with_space': True,
-                'sequence_length': 1536,
-                'padding': False,
-            }
-        }
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        return []
 
     def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         encoded = super()._encode(inputs)
         # image
-        raw_image = inputs.images
-        res = {}
+        images_inputs = self.processor.process(images=inputs.images or None, text='')
+        images_input_ids = images_inputs.pop('input_ids').tolist()
+        user_token = self._tokenize(' User')
+        assert len(user_token) == 1
+        idx = findall(images_input_ids, user_token[0])
+        assert len(idx) == 1
         labels = encoded['labels']
-        if raw_image:
-            image_id = self.tokenizer.convert_tokens_to_ids(self.image_placeholder)
-            idx_list = findall(encoded['input_ids'], image_id)
-            res = self._process_images(raw_image, encoded['input_ids'], idx_list, labels)
-            import numpy as np
-            if 'image_input_idx' in res:
-                # Shift patch mapping up by one since we added BOS
-                image_input_idx = res['image_input_idx']
-                res['image_input_idx'] = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)
-            encoded['input_ids'] = res.pop('input_ids').tolist()
-            if labels:
-                encoded['labels'] = [-100] + res.pop('labels')  # add one label for BOS
-
-            for k, v in res.items():
-                res[k] = torch.from_numpy(v).unsqueeze(0)
-        bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
-        encoded['input_ids'] = [bos] + encoded['input_ids']
-        res.update({'input_ids': encoded['input_ids']})
-        # prepare meta inputs
-        encoded.update(self.prepare_meta_inputs(res))
-
+        encoded['input_ids'] = images_input_ids[:idx[0]] + encoded['input_ids']
+        if labels:
+            encoded['labels'] = [-100] * idx[0] + labels
+        if 'images' in images_inputs:
+            images_inputs['images'] = images_inputs['images'].to(self.config.torch_dtype)
+        encoded.update(images_inputs)
         return encoded
 
-    def _process_images(self, images: List, tokens: List, idx_list: List = None, labels: List = None) -> torch.Tensor:
-        from PIL.Image import Image
-        import numpy as np
-        if images is not None:
-            image_arrays = []
-            for image in images:
-                if isinstance(image, Image):
-                    image = image.convert('RGB')
-                    image_arrays.append(np.array(image))
-                else:
-                    assert len(image.shape) == 3 and image.shape[-1] == 3
-                    image_arrays.append(image.astype(np.uint8))
-            images = image_arrays
-            # For now only support inserting images at the start
-        if idx_list is None:
-            idx_list = [-1] * len(images)
-        image_patch_token_id = self.processor.special_token_ids[self.DEFAULT_IMAGE_PATCH_TOKEN]
-        image_col_token_id = self.processor.special_token_ids[self.DEFAULT_IM_COL_TOKEN]
-        image_start_token_id = self.processor.special_token_ids[self.DEFAULT_IM_START_TOKEN]
-        image_end_token_id = self.processor.special_token_ids[self.DEFAULT_IM_END_TOKEN]
-        sequence_length = self.processor_kwargs['text_kwargs']['sequence_length']
-        res = self.processor.image_processor.multimodal_preprocess(
-            images=images,
-            image_idx=idx_list,
-            tokens=np.asarray(tokens).astype(np.int32),
-            sequence_length=sequence_length,
-            image_patch_token_id=image_patch_token_id,
-            image_col_token_id=image_col_token_id,
-            image_start_token_id=image_start_token_id,
-            image_end_token_id=image_end_token_id,
-            **self.processor_kwargs['images_kwargs'])
-        if labels is not None:
-            new_labels = []
-            cur_idx = 0
-            for input_id in res['input_ids']:
-                if input_id in (image_start_token_id, image_end_token_id, image_col_token_id, image_patch_token_id):
-                    new_labels.append(-100)
-                    if tokens[cur_idx] == self.tokenizer.convert_tokens_to_ids(self.image_placeholder)[0]:
-                        cur_idx += 1
-                else:
-                    new_labels.append(labels[cur_idx])
-                    cur_idx += 1
-            res['labels'] = new_labels
-        return res
-
-    def prepare_meta_inputs(self, data: Any) -> Dict[str, Any]:
-
-        # prepare batch inputs
-        input_ids = torch.tensor(data['input_ids']).unsqueeze(0)
-        batch_size, seq_len = input_ids.shape
-        attention_mask = None
-        mask_len = seq_len
-        max_new_tokens = None
-        if not self.is_training:
-            generation_config = self.model.generation_config
-            max_new_tokens = generation_config.max_new_tokens
-            if not max_new_tokens:
-                max_new_tokens = 0
-            mask_len = mask_len + max_new_tokens if self.model.config.use_position_ids else mask_len
-        position_ids: Optional[torch.Tensor] = None
-        append_last_valid_logits: Optional[torch.Tensor] = None
-        if self.model.config.use_position_ids and attention_mask is None:
-            attention_mask = input_ids != -1
-            position_ids = torch.clamp(torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1, min=0)
-            append_last_valid_logits = attention_mask.long().sum(dim=-1) - 1
-            if max_new_tokens:
-                attention_mask = torch.cat(
-                    [attention_mask, attention_mask.new_ones((batch_size, max_new_tokens))],
-                    dim=1,
-                )
-        if attention_mask is not None:
-            assert attention_mask.shape == (batch_size, mask_len)
-        if self.is_training:
-            # no batch_size before data_collator
-            attention_mask = attention_mask.squeeze(0)
-            position_ids = position_ids.squeeze(0)
-        data.update({
-            'attention_mask': attention_mask,
-            'position_ids': position_ids,
-            'append_last_valid_logits': append_last_valid_logits,
-        })
-        if 'images' in data:
-            data['images'] = data['images'].to(self.model.dtype)
-        return data
+    def generate(self, model, **kwargs):
+        kwargs.pop('attention_mask', None)
+        generation_config = kwargs.pop('generation_config')
+        batch = {
+            k: kwargs.pop(k, None)
+            for k in ['input_ids', 'attention_mask', 'images', 'image_input_idx', 'image_masks']
+        }
+        return model.generate_from_batch(batch, generation_config, **kwargs)
 
     def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
-        res = super().data_collator(batch, padding_to=padding_to)
+        res = super()._data_collator(batch, padding_to=padding_to)
         # prepare batchfy inputs
-        keys = ['images', 'image_input_idx', 'image_masks', 'append_last_valid_logits']
+        keys = ['images', 'image_input_idx', 'image_masks']
+        images_res = self.fetch_inputs(batch, keys)
         for key in keys:
-            batch_input = [b[key] for b in batch if b.get(key) is not None]
-            res[key] = torch.concat(batch_input)
-
+            val = images_res.get(key)
+            if val:
+                images_res[key] = torch.stack(val)
+        res.update(images_res)
         return res
 
 
@@ -171,8 +61,8 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
         MLLMTemplateType.molmo,
         prefix=[],
         prompt=[' User: {{QUERY}} Assistant:'],
-        chat_sep=['<|endoftext|>'],
+        chat_sep=None,
         suffix=['<|endoftext|>'],
         template_cls=MolmoTemplate,
-        placeholder_tokens=['<|image|>'],
+        placeholder_tokens=['<im_patch>'],
     ))
diff --git a/swift/llm/template/template/mplug.py b/swift/llm/template/template/mplug.py
@@ -43,7 +43,7 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         return res
 
     def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
-        res = super().data_collator(batch, padding_to=padding_to)
+        res = super()._data_collator(batch, padding_to=padding_to)
         images = [b['images'] for b in batch if 'images' in b]
         if images:
             res['images'] = torch.concat(images)
diff --git a/tests/test_align/test_template/test_vision.py b/tests/test_align/test_template/test_vision.py
@@ -157,7 +157,9 @@ def test_deepseek_vl2():
 
 
 def test_mplug_owl2():
-    pass
+    # pt_engine = PtEngine('iic/mPLUG-Owl2')
+    pt_engine = PtEngine('iic/mPLUG-Owl2.1')
+    _infer_model(pt_engine, messages=[{'role': 'user', 'content': '<image>这是什么'}])
 
 
 def test_mplug_owl3():
@@ -218,6 +220,30 @@ def test_megrez_omni():
                         '没有阴影或明亮的阳光表明这不是正午时分，也没有雨滴或雪花的迹象，这可能意味着不是下雨或下雪的日子。')
 
 
+def test_molmo():
+    # pt_engine = PtEngine('LLM-Research/Molmo-7B-O-0924')
+    pt_engine = PtEngine('LLM-Research/Molmo-7B-D-0924')
+    _infer_model(pt_engine)
+    response = _infer_model(pt_engine, messages=[{'role': 'user', 'content': '<image>这是什么'}])
+    assert response == (
+        ' This is a close-up photograph of a young kitten. '
+        'The kitten has striking blue eyes and a mix of white and black fur, '
+        'with distinctive black stripes on its head and face. '
+        "It's looking directly at the camera with an alert and curious expression. "
+        "The kitten's fur appears soft and fluffy, and its pink nose and white whiskers are clearly visible. "
+        'The background is blurred, which emphasizes the kitten as the main subject of the image.')
+
+
+def test_molmoe():
+    pt_engine = PtEngine('LLM-Research/MolmoE-1B-0924')
+    response = _infer_model(pt_engine, messages=[{'role': 'user', 'content': '<image>这是什么'}])
+    assert response == (" This is a close-up photograph of a kitten's face. The kitten has striking blue eyes and "
+                        "a mix of white, black, and brown fur. It's looking directly at the camera with an adorable "
+                        "expression, its ears perked up and whiskers visible. The image captures the kitten's cute "
+                        'features in sharp detail, while the background is blurred, creating a soft, out-of-focus '
+                        "effect that emphasizes the young feline's charm.")
+
+
 if __name__ == '__main__':
     from swift.llm import PtEngine, RequestConfig, get_template
     from swift.utils import get_logger, seed_everything
@@ -247,4 +273,7 @@ def test_megrez_omni():
     # test_mplug_owl3()
     # test_xcomposer2_5()
     # test_megrez_omni()
-    test_qvq()
+    # test_qvq()
+    # test_mplug_owl2()
+    # test_molmo()
+    test_molmoe()
diff --git a/tests/train/test_sft.py b/tests/train/test_sft.py