+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Literal, Optional
+
+import torch
+
+from ..base import Template
+from ..constant import MLLMTemplateType
+from ..register import TemplateMeta, register_template
+from ..template_inputs import StdTemplateInputs
+from ..utils import Context, Prompt, findall
+
+
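+# Dialogue-format fragments for HunYuan-VL; the base Template substitutes
+# {{QUERY}} and {{SYSTEM}} into these pieces when rendering a conversation.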
+@dataclass
+class HunYuanVLTemplateMeta(TemplateMeta):
+    prefix: Prompt = field(default_factory=lambda: ['<|hy_begin▁of▁sentence|>'])
+    prompt: Prompt = field(default_factory=lambda: ['{{QUERY}}<|hy_User|>'])
+    chat_sep: Optional[Prompt] = field(default_factory=lambda: ['<|hy_Assistant|><|hy_begin▁of▁sentence|>'])
+    suffix: Prompt = field(default_factory=lambda: ['<|hy_Assistant|>'])
+    system_prefix: Optional[Prompt] = field(
+        default_factory=lambda: ['<|hy_begin▁of▁sentence|>{{SYSTEM}}<|hy_place▁holder▁no▁3|>'])
+
+
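+# 120120 is the vocabulary id of the <|hy_place▁holder▁no▁102|> image-patch token.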
+class HunYuanVLTemplate(Template):
+    image_token_id = 120120
+    image_token = '<|hy_place▁holder▁no▁102|>'
+    image_placeholder = ['<|hy_place▁holder▁no▁102|>']
+
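+    # vLLM expands image placeholders itself, so the textual markers are returned
+    # as-is; otherwise a -100 sentinel is emitted and later replaced in _encode.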
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        assert media_type == 'image'
+        if self.mode == 'vllm':
+            return ['<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>']
+        return [[-100]]
+
+    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
+        encoded = super()._encode(inputs)
+        input_ids = encoded['input_ids']
+        labels = encoded['labels']
+        loss_scale = encoded.get('loss_scale', None)
+        idx_list = findall(input_ids, -100)
+        processor = self.processor
+        images = inputs.images
+        if images:
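+            # Preprocess all images in one pass; image_grid_thw holds each image's
+            # (t, h, w) grid in patch units, and dividing h and w by merge_size
+            # gives the merged token grid used below.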
+            image_inputs = processor.image_processor(images=images, return_tensors='pt')
+            image_grid_thw = image_inputs['image_grid_thw']
+            merge_size = processor.image_processor.merge_size
+
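+            # Each image takes patch_h * (patch_w + 1) + 2 tokens: one per merged
+            # patch, one extra per row (presumably a row separator), plus two
+            # boundary tokens.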
+            def _get_new_tokens(i):
+                grid_h, grid_w = image_grid_thw[i][-2:]
+                patch_h = grid_h // merge_size
+                patch_w = grid_w // merge_size
+                img_tokens: List[int] = [self.image_token_id] * (patch_h * (patch_w + 1) + 2)
+                return img_tokens
+
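+            # Splice each image's placeholder run into input_ids (and labels /
+            # loss_scale) at the matching -100 sentinel.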
+            encoded['input_ids'], encoded['labels'], encoded['loss_scale'] = self._extend_tokens(
+                input_ids, labels, loss_scale, idx_list, _get_new_tokens)
+            encoded['pixel_values'] = image_inputs['pixel_values']
+            encoded['image_grid_thw'] = image_grid_thw
+
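+            # Build 4-channel position ids (sequential, w, h, t); text tokens keep
+            # plain sequential positions in every channel.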
+            input_ids = encoded['input_ids']
+            position_ids = torch.arange(len(input_ids))
+            position_ids_w = torch.arange(len(input_ids))
+            position_ids_h = torch.arange(len(input_ids))
+            position_ids_t = torch.arange(len(input_ids))
+            image_tokens_cumsum = [0]
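+            # image_tokens_cumsum[i] indexes image i's first placeholder among all
+            # image tokens; the +1 below skips its leading boundary token so only
+            # the patch block receives 2-D (w, h) coordinates.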
+            for i in range(len(image_grid_thw)):
+                grid_h, grid_w = image_grid_thw[i][-2:]
+                patch_h = grid_h // merge_size
+                patch_w = grid_w // merge_size
+                num_image_tokens = patch_h * (patch_w + 1) + 2
+                image_tokens_cumsum.append(image_tokens_cumsum[-1] + int(num_image_tokens))
+                image_token_pos_indices = torch.where(torch.tensor(input_ids) == self.image_token_id)
+                start_pos = image_token_pos_indices[0][image_tokens_cumsum[i]] + 1
+                replace_num = (patch_w + 1) * patch_h
+                position_ids_w[start_pos:start_pos + replace_num] = torch.tensor(
+                    list(range(patch_w + 1)) * patch_h, dtype=torch.int64)
+                patch_h_list = []
+                for h in range(patch_h):
+                    patch_h_list += [h] * (patch_w + 1)
+                position_ids_h[start_pos:start_pos + replace_num] = torch.tensor(patch_h_list, dtype=torch.int64)
+                position_ids_t[start_pos:start_pos + replace_num] = 0
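+            # Stack the four (seq_len,) channels and add a batch dim: (1, 4, seq_len).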
+            position_ids = torch.stack([position_ids, position_ids_w, position_ids_h, position_ids_t]).unsqueeze(0)
+            encoded['position_ids'] = position_ids
+            attention_mask = torch.tensor(input_ids).ne(processor.pad_id)
+            encoded['attention_mask'] = attention_mask
+        return encoded
+
+
+register_template(HunYuanVLTemplateMeta(MLLMTemplateType.hunyuan_ocr, template_cls=HunYuanVLTemplate))
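
For anyone tracing the position-id construction in `_encode`, here is a minimal standalone sketch (illustrative only: the 2×3 merged grid is an assumed example, with no ms-swift dependencies) of the w/h channel layout the loop writes for a single image:

```python
# Minimal sketch of the per-image w/h position-id layout from _encode above.
# patch_h / patch_w are hypothetical example values, not from a real image.
patch_h, patch_w = 2, 3
w_ids = list(range(patch_w + 1)) * patch_h  # column index, one extra slot per row
h_ids = [h for h in range(patch_h) for _ in range(patch_w + 1)]  # row index
print(w_ids)  # [0, 1, 2, 3, 0, 1, 2, 3]
print(h_ids)  # [0, 0, 0, 0, 1, 1, 1, 1]
```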