# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict, List, Literal

from ..base import Template
from ..constant import MLLMTemplateType
from ..register import register_template
from ..template_inputs import StdTemplateInputs
from ..utils import Context, findall
from .utils import TemplateMeta


class DotsOCRTemplate(Template):
    # Token id of the '<|imgpad|>' placeholder; a single copy is inserted per image
    # by replace_tag and expanded to the full per-image token count in _encode.
    image_token_id = 151665
    placeholder_tokens = ['<|imgpad|>']

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    inputs: StdTemplateInputs) -> List[Context]:
        from qwen_vl_utils import fetch_image
        assert media_type == 'image'
        # Load and resize the image with qwen_vl_utils; the image processor is later
        # called with do_resize=False, so resizing must happen here.
        inputs.images[index] = fetch_image({'image': inputs.images[index]})
        if self.mode == 'lmdeploy':
            return ['<|img|>', [-100], '<|endofimg|>']
        else:
            return ['<|img|><|imgpad|><|endofimg|>']

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        processor = self.processor
        input_ids = encoded['input_ids']
        labels = encoded['labels']
        loss_scale = encoded.get('loss_scale', None)

        images = inputs.images
        media_token = self.image_token_id
        media_inputs = processor.image_processor(images=images, videos=None, return_tensors='pt', do_resize=False)
        media_grid_thw = media_inputs['image_grid_thw']
        # Positions of the '<|imgpad|>' placeholders (one per image) in input_ids.
        idx_list = findall(input_ids, media_token)
        merge_length = processor.image_processor.merge_size**2

        def _get_new_tokens(i):
            # Each image is expanded to grid_thw.prod() // merge_size**2 placeholder tokens.
            token_len = (media_grid_thw[i].prod() // merge_length)
            return [media_token] * token_len

        input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list, _get_new_tokens)
        encoded.update(media_inputs)

        encoded['input_ids'] = input_ids
        encoded['labels'] = labels
        encoded['loss_scale'] = loss_scale
        return encoded

    def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
        res = super()._data_collator_mm_data(batch)
        # Stack the per-sample image grids into a single image_grid_thw tensor for the batch.
        grid_thw = self.concat_tensor(batch, 'image_grid_thw', 0)
        if grid_thw is not None:
            res['image_grid_thw'] = grid_thw
        return res


register_template(
    TemplateMeta(
        MLLMTemplateType.dots_ocr,
        prefix=[''],
        prompt=['<|user|>{{QUERY}}<|endofuser|><|assistant|>'],
        chat_sep=['<|endofassistant|>'],
        suffix=['<|endofassistant|>'],
        system_prefix=['<|system|>{{SYSTEM}}<|endofsystem|>\n'],
        template_cls=DotsOCRTemplate,
    ))
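
# A minimal, illustrative sketch (not part of the template) of the placeholder
# expansion performed in `_encode` above: each image contributes
# grid_thw.prod() // merge_size**2 copies of '<|imgpad|>' (token id 151665).
# The grid and merge_size values below are hypothetical.
if __name__ == '__main__':
    import torch
    grid_thw = torch.tensor([1, 28, 42])  # hypothetical (t, h, w) patch grid for one image
    merge_size = 2  # hypothetical value of processor.image_processor.merge_size
    token_len = int(grid_thw.prod().item()) // merge_size**2
    print(f'placeholder tokens for this image: {token_len}')  # -> 294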