support iic/DocOwl2 (#2728)

Jintao-Huang · web-flow · commit bb5cbe62d633 · 2024-12-23T10:29:26.000+08:00
diff --git a/docs/source/Instruction/支持的模型和数据集.md b/docs/source/Instruction/支持的模型和数据集.md
@@ -587,6 +587,7 @@
 |[iic/mPLUG-Owl3-2B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-2B-241014)|mplug_owl3|mplug_owl3|transformers>=4.36, icecream, decord|vision, video|[mPLUG/mPLUG-Owl3-2B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014)|
 |[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728)|mplug_owl3|mplug_owl3|transformers>=4.36, icecream, decord|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
 |[iic/mPLUG-Owl3-7B-241101](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-241101)|mplug_owl3_241101|mplug_owl3_241101|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-241101](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-241101)|
+|[iic/DocOwl2](https://modelscope.cn/models/iic/DocOwl2)|doc_owl2|doc_owl2|transformers>=4.36, icecream|vision|[mPLUG/DocOwl2](https://huggingface.co/mPLUG/DocOwl2)|
 |[BAAI/Emu3-Gen](https://modelscope.cn/models/BAAI/Emu3-Gen)|emu3_gen|emu3_gen|-|t2i|[BAAI/Emu3-Gen](https://huggingface.co/BAAI/Emu3-Gen)|
 |[BAAI/Emu3-Chat](https://modelscope.cn/models/BAAI/Emu3-Chat)|emu3_chat|emu3_chat|transformers>=4.44.0|vision|[BAAI/Emu3-Chat](https://huggingface.co/BAAI/Emu3-Chat)|
 |[stepfun-ai/GOT-OCR2_0](https://modelscope.cn/models/stepfun-ai/GOT-OCR2_0)|got_ocr2|got_ocr2|-|vision|[stepfun-ai/GOT-OCR2_0](https://huggingface.co/stepfun-ai/GOT-OCR2_0)|
diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md
@@ -587,6 +587,7 @@ The table below introduces the models integrated with ms-swift:
 |[iic/mPLUG-Owl3-2B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-2B-241014)|mplug_owl3|mplug_owl3|transformers>=4.36, icecream, decord|vision, video|[mPLUG/mPLUG-Owl3-2B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014)|
 |[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728)|mplug_owl3|mplug_owl3|transformers>=4.36, icecream, decord|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
 |[iic/mPLUG-Owl3-7B-241101](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-241101)|mplug_owl3_241101|mplug_owl3_241101|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-241101](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-241101)|
+|[iic/DocOwl2](https://modelscope.cn/models/iic/DocOwl2)|doc_owl2|doc_owl2|transformers>=4.36, icecream|vision|[mPLUG/DocOwl2](https://huggingface.co/mPLUG/DocOwl2)|
 |[BAAI/Emu3-Gen](https://modelscope.cn/models/BAAI/Emu3-Gen)|emu3_gen|emu3_gen|-|t2i|[BAAI/Emu3-Gen](https://huggingface.co/BAAI/Emu3-Gen)|
 |[BAAI/Emu3-Chat](https://modelscope.cn/models/BAAI/Emu3-Chat)|emu3_chat|emu3_chat|transformers>=4.44.0|vision|[BAAI/Emu3-Chat](https://huggingface.co/BAAI/Emu3-Chat)|
 |[stepfun-ai/GOT-OCR2_0](https://modelscope.cn/models/stepfun-ai/GOT-OCR2_0)|got_ocr2|got_ocr2|-|vision|[stepfun-ai/GOT-OCR2_0](https://huggingface.co/stepfun-ai/GOT-OCR2_0)|
diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py
@@ -153,6 +153,7 @@ class MLLMModelType:
     mplug_owl2_1 = 'mplug_owl2_1'
     mplug_owl3 = 'mplug_owl3'
     mplug_owl3_241101 = 'mplug_owl3_241101'
+    doc_owl2 = 'doc_owl2'
 
     emu3_gen = 'emu3_gen'
     emu3_chat = 'emu3_chat'
diff --git a/swift/llm/model/model/mplug.py b/swift/llm/model/model/mplug.py
@@ -82,7 +82,7 @@ def get_model_tokenizer_mplug_owl3(model_dir: str,
     processor = model.init_processor(tokenizer)
     if model is not None:
         func_list = ['generate', 'forward']
-        _use_submodel_func(model, 'language_model', func_list)
+        use_submodel_func(model, 'language_model', func_list)
     return model, processor
 
 
@@ -115,3 +115,28 @@ def get_model_tokenizer_mplug_owl3(model_dir: str,
         model_arch=ModelArch.mplug_owl3,
         requires=['transformers>=4.36', 'icecream'],
         tags=['vision', 'video']))
+
+
+def get_model_tokenizer_doc_owl2(model_dir: str,
+                                 model_info: ModelInfo,
+                                 model_kwargs: Dict[str, Any],
+                                 load_model: bool = True,
+                                 **kwargs):  # returns (model, processor), not (model, tokenizer)
+    model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, model_info, model_kwargs, load_model, **kwargs)
+    processor = model.init_processor(tokenizer, basic_image_size=504, crop_anchors='grid_12')
+    return model, processor  # NOTE(review): presumably model is None if load_model=False, breaking init_processor
+
+
+register_model(  # registers DocOwl2 under model type 'doc_owl2'
+    ModelMeta(
+        MLLMModelType.doc_owl2, [
+            ModelGroup([
+                Model('iic/DocOwl2', 'mPLUG/DocOwl2'),  # ModelScope id, Hugging Face id
+            ]),
+        ],
+        TemplateType.doc_owl2,
+        get_model_tokenizer_doc_owl2,
+        architectures=['mPLUGDocOwl2'],
+        model_arch=ModelArch.doc_owl2,
+        requires=['transformers>=4.36', 'icecream'],
+        tags=['vision']))
diff --git a/swift/llm/model/model_arch.py b/swift/llm/model/model_arch.py
@@ -49,6 +49,7 @@ class MLLMModelArch:
     mplug_owl2 = 'mplug_owl2'
     mplug_owl2_1 = 'mplug_owl2_1'
     mplug_owl3 = 'mplug_owl3'
+    doc_owl2 = 'doc_owl2'
 
     phi3v = 'phi3v'
     florence = 'florence'
@@ -354,6 +355,14 @@ def register_model_arch(model_arch: ModelKeys, *, exist_ok: bool = False) -> Non
         vision_tower='vision_model',
     ))
 
+register_model_arch(
+    MultiModelKeys(
+        MLLMModelArch.doc_owl2,
+        language_model='model.layers',  # presumably dotted module paths on the loaded model — TODO confirm
+        aligner=['model.vision2text', 'model.hr_compressor'],
+        vision_tower='model.vision_model',
+    ))
+
 register_model_arch(
     MultiModelKeys(
         MLLMModelArch.deepseek_vl,
diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py
@@ -974,7 +974,12 @@ def _pad_sequence(self, sequences: List[torch.Tensor], padding_value: float = 0.
         return torch.stack(padded_sequences)
 
     def safe_decode(self, input_ids: List[int], **tokenizer_kwargs) -> str:
-        placeholder_tokens = self.template_meta.placeholder_tokens
+        if isinstance(self, Template):
+            tokenizer = self.tokenizer
+            placeholder_tokens = self.template_meta.placeholder_tokens
+        else:
+            tokenizer = self
+            placeholder_tokens = []
 
         def _is_special(token: int) -> bool:
             if isinstance(token, float) or token < 0:
@@ -995,12 +1000,12 @@ def _is_special(token: int) -> bool:
                 continue
             if _is_special(input_ids[i]) and not _is_special(input_ids[i - 1]):
                 s = i
-                result_str += self.tokenizer.decode(input_ids[e:s], **tokenizer_kwargs)
+                result_str += tokenizer.decode(input_ids[e:s], **tokenizer_kwargs)
             if not _is_special(input_ids[i]) and _is_special(input_ids[i - 1]):
                 e = i
                 result_str += f'[{input_ids[i - 1]} * {e - s}]'
         if _is_special(input_ids[i]):
             result_str += f'[{input_ids[i]} * {len(input_ids) - s}]'
         else:
-            result_str += self.tokenizer.decode(input_ids[e:], **tokenizer_kwargs)
+            result_str += tokenizer.decode(input_ids[e:], **tokenizer_kwargs)
         return result_str
diff --git a/swift/llm/template/constant.py b/swift/llm/template/constant.py
@@ -127,6 +127,7 @@ class MLLMTemplateType:
     mplug_owl2 = 'mplug_owl2'
     mplug_owl3 = 'mplug_owl3'
     mplug_owl3_241101 = 'mplug_owl3_241101'
+    doc_owl2 = 'doc_owl2'
 
     emu3_chat = 'emu3_chat'
     emu3_gen = 'emu3_gen'
diff --git a/swift/llm/template/template/mplug.py b/swift/llm/template/template/mplug.py
@@ -174,3 +174,40 @@ class mPlugOwl3TemplateMeta(QwenTemplateMeta):
 register_template(mPlugOwl3TemplateMeta(MLLMTemplateType.mplug_owl3, template_cls=mPlugOwl3Template))
 
 register_template(mPlugOwl3TemplateMeta(MLLMTemplateType.mplug_owl3_241101, template_cls=mPlugOwl3_241101Template))
+
+
+class DocOwl2Template(Template):  # template for DocOwl2; only image inputs are handled
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        if media_type != 'image':  # fail loudly instead of implicitly returning None
+            raise ValueError(f'DocOwl2Template only supports image inputs, got media_type={media_type!r}')
+        return [f'<img {index + 1}>', [-200]]  # 1-based image label, then [-200] (presumably the image placeholder)
+
+    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
+        encoded = super()._encode(inputs)
+        if inputs.images:  # text-only samples get no 'images' / 'patch_positions' keys
+            images, patch_positions, _ = self.processor._process_image(inputs.images)  # third result is unused
+            encoded.update({'images': images.to(self.config.torch_dtype), 'patch_positions': patch_positions})
+        return encoded
+
+    def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
+        keys = ['images', 'patch_positions']
+        res = self.fetch_inputs(batch, keys)
+        for key in keys:
+            val = res.get(key)
+            if val:  # key absent or empty: leave the entry as fetch_inputs produced it
+                res[key] = torch.concat([v for v in val if v is not None])  # drop None entries before merging
+        res.update(super()._data_collator(batch, padding_to=padding_to))  # parent keys override on collision
+        return res
+
+
+register_template(
+    TemplateMeta(
+        MLLMTemplateType.doc_owl2,
+        prefix=['<s>'],
+        prompt=[' USER: {{QUERY}} ASSISTANT:'],  # presumably {{QUERY}} is replaced by the user message — confirm
+        chat_sep=['</s>'],
+        suffix=['</s>'],
+        template_cls=DocOwl2Template,
+    ))
diff --git a/tests/llm/test_custom.py b/tests/llm/test_custom.py
@@ -42,7 +42,7 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         ],
         template='custom',
         get_function=get_model_tokenizer_with_flash_attn,
-        ignore_file_pattern=['nemo']))
+        ignore_patterns=['nemo']))
 
 
 class TestCustom(unittest.TestCase):
diff --git a/tests/test_align/test_template/test_vision.py b/tests/test_align/test_template/test_vision.py
@@ -255,6 +255,32 @@ def test_molmoe():
                         "effect that emphasizes the young feline's charm.")
 
 
+def test_doc_owl2():
+    pt_engine = PtEngine('iic/DocOwl2', torch_dtype=torch.float16)
+    response = _infer_model(pt_engine, messages=[{'role': 'user', 'content': '你是谁'}], images=[])
+    images = [  # the text-only response above is overwritten below; only the multi-image answer is asserted
+        'https://modelscope.cn/models/iic/DocOwl2/resolve/master/examples/docowl2_page0.png',
+        'https://modelscope.cn/models/iic/DocOwl2/resolve/master/examples/docowl2_page1.png',
+        'https://modelscope.cn/models/iic/DocOwl2/resolve/master/examples/docowl2_page2.png',
+        'https://modelscope.cn/models/iic/DocOwl2/resolve/master/examples/docowl2_page3.png',
+        'https://modelscope.cn/models/iic/DocOwl2/resolve/master/examples/docowl2_page4.png',
+        'https://modelscope.cn/models/iic/DocOwl2/resolve/master/examples/docowl2_page5.png',
+    ]
+    response = _infer_model(
+        pt_engine,
+        messages=[{
+            'role': 'user',
+            'content': '<image>' * len(images) + 'what is this paper about? provide detailed information.'
+        }],
+        images=images)
+    assert response == (  # exact match on generated text: keep the expected string verbatim
+        'This paper is about multimodal Language Models(MLMs) achieving promising OCR-free '
+        'Document Understanding by performing understanding by the cost of generating thorough sands of visual '
+        'tokens for a single document image, leading to excessive GPU computation time. The paper also discusses '
+        'the challenges and limitations of existing multimodal OCR approaches and proposes a new framework for '
+        'more efficient and accurate OCR-free document understanding.')
+
+
 if __name__ == '__main__':
     from swift.llm import PtEngine, RequestConfig, get_template
     from swift.utils import get_logger, seed_everything
@@ -278,14 +304,14 @@ def test_molmoe():
     # test_llava_hf()
     # test_florence()
     # test_glm_edge_v()
-    #
     # test_phi3_vision()
     # test_internvl2_5()
-    test_internvl2_5_mpo()
+    # test_internvl2_5_mpo()
     # test_mplug_owl3()
     # test_xcomposer2_5()
     # test_megrez_omni()
     # test_qvq()
     # test_mplug_owl2()
     # test_molmo()
     # test_molmoe()
+    test_doc_owl2()