# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict, List, Literal

from ..base import Template
from ..constant import MLLMTemplateType
from ..register import register_template
from ..template_inputs import StdTemplateInputs
from ..utils import Context, findall
from .utils import TemplateMeta


class DotsOCRTemplate(Template):
    # Token id of the '<|imgpad|>' placeholder; a single copy is inserted per image
    # by replace_tag and expanded to the full per-image token count in _encode.
    image_token_id = 151665
    placeholder_tokens = ['<|imgpad|>']

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        from qwen_vl_utils import fetch_image
        assert media_type == 'image'
        # Load and resize the image with qwen_vl_utils; the image processor is later
        # called with do_resize=False, so resizing must happen here.
        inputs.images[index] = fetch_image({'image': inputs.images[index]})
        if self.mode == 'lmdeploy':
            return ['<|img|>', [-100], '<|endofimg|>']
        else:
            return ['<|img|><|imgpad|><|endofimg|>']

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        processor = self.processor
        input_ids = encoded['input_ids']
        labels = encoded['labels']
        loss_scale = encoded.get('loss_scale', None)

        images = inputs.images
        media_token = self.image_token_id
        media_inputs = processor.image_processor(images=images, videos=None, return_tensors='pt', do_resize=False)
        media_grid_thw = media_inputs['image_grid_thw']
        # Positions of the '<|imgpad|>' placeholders (one per image) in input_ids.
        idx_list = findall(input_ids, media_token)
        merge_length = processor.image_processor.merge_size**2

        def _get_new_tokens(i):
            # Each image is expanded to grid_thw.prod() // merge_size**2 placeholder tokens.
            token_len = (media_grid_thw[i].prod() // merge_length)
            return [media_token] * token_len

        input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list, _get_new_tokens)
        encoded.update(media_inputs)

        encoded['input_ids'] = input_ids
        encoded['labels'] = labels
        encoded['loss_scale'] = loss_scale
        return encoded

    def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
        res = super()._data_collator_mm_data(batch)
        # Stack the per-sample image grids into a single image_grid_thw tensor for the batch.
        grid_thw = self.concat_tensor(batch, 'image_grid_thw', 0)
        if grid_thw is not None:
            res['image_grid_thw'] = grid_thw
        return res


register_template(
    TemplateMeta(
        MLLMTemplateType.dots_ocr,
        prefix=[''],
        prompt=['<|user|>{{QUERY}}<|endofuser|><|assistant|>'],
        chat_sep=['<|endofassistant|>'],
        suffix=['<|endofassistant|>'],
        system_prefix=['<|system|>{{SYSTEM}}<|endofsystem|>\n'],
        template_cls=DotsOCRTemplate,
    ))
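
# A minimal, illustrative sketch (not part of the template) of the placeholder
# expansion performed in `_encode` above: each image contributes
# grid_thw.prod() // merge_size**2 copies of '<|imgpad|>' (token id 151665).
# The grid and merge_size values below are hypothetical.
if __name__ == '__main__':
    import torch
    grid_thw = torch.tensor([1, 28, 42])  # hypothetical (t, h, w) patch grid for one image
    merge_size = 2  # hypothetical value of processor.image_processor.merge_size
    token_len = int(grid_thw.prod().item()) // merge_size**2
    print(f'placeholder tokens for this image: {token_len}')  # -> 294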