+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Literal, Optional
+
+import torch
+
+from ..base import Template
+from ..constant import MLLMTemplateType
+from ..register import TemplateMeta, register_template
+from ..template_inputs import StdTemplateInputs
+from ..utils import Context, Prompt, findall
+
+
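+# Dialogue-format fragments for HunYuan-VL; the base Template substitutes
+# {{QUERY}} and {{SYSTEM}} into these pieces when rendering a conversation.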
+@dataclass
+class HunYuanVLTemplateMeta(TemplateMeta):
+    prefix: Prompt = field(default_factory=lambda: ['<|hy_begin▁of▁sentence|>'])
+    prompt: Prompt = field(default_factory=lambda: ['{{QUERY}}<|hy_User|>'])
+    chat_sep: Optional[Prompt] = field(default_factory=lambda: ['<|hy_Assistant|><|hy_begin▁of▁sentence|>'])
+    suffix: Prompt = field(default_factory=lambda: ['<|hy_Assistant|>'])
+    system_prefix: Optional[Prompt] = field(
+        default_factory=lambda: ['<|hy_begin▁of▁sentence|>{{SYSTEM}}<|hy_place▁holder▁no▁3|>'])
+
+
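+# 120120 is the vocabulary id of the <|hy_place▁holder▁no▁102|> image-patch token.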
+class HunYuanVLTemplate(Template):
+    image_token_id = 120120
+    image_token = '<|hy_place▁holder▁no▁102|>'
+    image_placeholder = ['<|hy_place▁holder▁no▁102|>']
+
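+    # vLLM expands image placeholders itself, so the textual markers are returned
+    # as-is; otherwise a -100 sentinel is emitted and later replaced in _encode.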
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        assert media_type == 'image'
+        if self.mode == 'vllm':
+            return ['<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>']
+        return [[-100]]
+
+    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
+        encoded = super()._encode(inputs)
+        input_ids = encoded['input_ids']
+        labels = encoded['labels']
+        loss_scale = encoded.get('loss_scale', None)
+        idx_list = findall(input_ids, -100)
+        processor = self.processor
+        images = inputs.images
+        if images:
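+            # Preprocess all images in one pass; image_grid_thw holds each image's
+            # (t, h, w) grid in patch units, and dividing h and w by merge_size
+            # gives the merged token grid used below.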
+            image_inputs = processor.image_processor(images=images, return_tensors='pt')
+            image_grid_thw = image_inputs['image_grid_thw']
+            merge_size = processor.image_processor.merge_size
+
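+            # Each image takes patch_h * (patch_w + 1) + 2 tokens: one per merged
+            # patch, one extra per row (presumably a row separator), plus two
+            # boundary tokens.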
+            def _get_new_tokens(i):
+                grid_h, grid_w = image_grid_thw[i][-2:]
+                patch_h = grid_h // merge_size
+                patch_w = grid_w // merge_size
+                img_tokens: List[int] = [self.image_token_id] * (patch_h * (patch_w + 1) + 2)
+                return img_tokens
+
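+            # Splice each image's placeholder run into input_ids (and labels /
+            # loss_scale) at the matching -100 sentinel.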
+            encoded['input_ids'], encoded['labels'], encoded['loss_scale'] = self._extend_tokens(
+                input_ids, labels, loss_scale, idx_list, _get_new_tokens)
+            encoded['pixel_values'] = image_inputs['pixel_values']
+            encoded['image_grid_thw'] = image_grid_thw
+
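+            # Build 4-channel position ids (sequential, w, h, t); text tokens keep
+            # plain sequential positions in every channel.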
+            input_ids = encoded['input_ids']
+            position_ids = torch.arange(len(input_ids))
+            position_ids_w = torch.arange(len(input_ids))
+            position_ids_h = torch.arange(len(input_ids))
+            position_ids_t = torch.arange(len(input_ids))
+            image_tokens_cumsum = [0]
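+            # image_tokens_cumsum[i] indexes image i's first placeholder among all
+            # image tokens; the +1 below skips its leading boundary token so only
+            # the patch block receives 2-D (w, h) coordinates.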
+            for i in range(len(image_grid_thw)):
+                grid_h, grid_w = image_grid_thw[i][-2:]
+                patch_h = grid_h // merge_size
+                patch_w = grid_w // merge_size
+                num_image_tokens = patch_h * (patch_w + 1) + 2
+                image_tokens_cumsum.append(image_tokens_cumsum[-1] + int(num_image_tokens))
+                image_token_pos_indices = torch.where(torch.tensor(input_ids) == self.image_token_id)
+                start_pos = image_token_pos_indices[0][image_tokens_cumsum[i]] + 1
+                replace_num = (patch_w + 1) * patch_h
+                position_ids_w[start_pos:start_pos + replace_num] = torch.tensor(
+                    list(range(patch_w + 1)) * patch_h, dtype=torch.int64)
+                patch_h_list = []
+                for h in range(patch_h):
+                    patch_h_list += [h] * (patch_w + 1)
+                position_ids_h[start_pos:start_pos + replace_num] = torch.tensor(patch_h_list, dtype=torch.int64)
+                position_ids_t[start_pos:start_pos + replace_num] = 0
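+            # Stack the four (seq_len,) channels and add a batch dim: (1, 4, seq_len).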
+            position_ids = torch.stack([position_ids, position_ids_w, position_ids_h, position_ids_t]).unsqueeze(0)
+            encoded['position_ids'] = position_ids
+            attention_mask = torch.tensor(input_ids).ne(processor.pad_id)
+            encoded['attention_mask'] = attention_mask
+        return encoded
+
+
+register_template(HunYuanVLTemplateMeta(MLLMTemplateType.hunyuan_ocr, template_cls=HunYuanVLTemplate))
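
For anyone tracing the position-id construction in `_encode`, here is a minimal standalone sketch (illustrative only: the 2×3 merged grid is an assumed example, with no ms-swift dependencies) of the w/h channel layout the loop writes for a single image:

```python
# Minimal sketch of the per-image w/h position-id layout from _encode above.
# patch_h / patch_w are hypothetical example values, not from a real image.
patch_h, patch_w = 2, 3
w_ids = list(range(patch_w + 1)) * patch_h  # column index, one extra slot per row
h_ids = [h for h in range(patch_h) for _ in range(patch_w + 1)]  # row index
print(w_ids)  # [0, 1, 2, 3, 0, 1, 2, 3]
print(h_ids)  # [0, 0, 0, 0, 1, 1, 1, 1]
```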