|
8 | 8 | from lightllm.models.registry import ModelRegistry |
9 | 9 | from lightllm.models.qwen2.model import Qwen2TpPartModel |
10 | 10 | from lightllm.models.qwen_vl.layer_infer.pre_layer_infer import LlamaMultimodalPreLayerInfer |
11 | | -from lightllm.models.internvl.layer_weights.pre_and_post_layer_weight import InternVLLlamaPreAndPostLayerWeight |
12 | | -from lightllm.models.internvl.img_process import get_image_patch |
13 | 11 |
|
14 | 12 | from ..mineru2_qwen.image_processing_mineru2 import Mineru2ImageProcessor |
15 | 13 | from .image_processing_mineru2 import get_anyres_image_grid_shape |
@@ -65,34 +63,20 @@ def init_audioitem_extral_params( |
65 | 63 | def get_image_token_length(self, img: ImageItem): |
66 | 64 | # Back on the patch-sequence path: total tokens = number of views × patches per view
67 | 65 | # patches per view = self.image_length = (image_size // patch_size) ** 2
68 | | - aspect = getattr(self.image_processor, "image_aspect_ratio", None) |
69 | 66 | patch_len = int(self.image_length) |
70 | | - try: |
71 | | - if aspect and (aspect == "anyres" or (isinstance(aspect, str) and "anyres_max" in aspect)): |
72 | | - crop_size = self.image_processor.crop_size["height"] |
73 | | - grid_w, grid_h = get_anyres_image_grid_shape( |
74 | | - (img.image_w, img.image_h), self.image_processor.image_grid_pinpoints, crop_size |
75 | | - ) |
76 | | - views = int(grid_w * grid_h + 1) |
77 | | - token_num = views * patch_len |
78 | | - print( |
79 | | - f"[debug] mineru2_tokenizer anyres img_size=({img.image_w},{img.image_h}) " |
80 | | - f"crop={crop_size} grid=({grid_w},{grid_h}) views={views}" |
81 | | - f" patch_len={patch_len} token_num={token_num}" |
82 | | - ) |
83 | | - return token_num |
84 | | - else: |
85 | | - token_num = patch_len |
86 | | - print( |
87 | | - f"[debug] mineru2_tokenizer non-anyres views=1 patch_len={patch_len}" |
88 | | - f" token_num={token_num} aspect={aspect}" |
89 | | - ) |
90 | | - return token_num |
91 | | - except Exception as e: |
92 | | - # Fallback: return a single view's token count
93 | | - token_num = patch_len |
94 | | - print(f"[debug] mineru2_tokenizer token_num_fallback due to {e}, return {token_num}") |
95 | | - return token_num |
| 67 | + |
| 68 | + crop_size = self.image_processor.crop_size["height"] |
| 69 | + grid_w, grid_h = get_anyres_image_grid_shape( |
| 70 | + (img.image_w, img.image_h), self.image_processor.image_grid_pinpoints, crop_size |
| 71 | + ) |
| 72 | + views = int(grid_w * grid_h + 1) |
| 73 | + token_num = views * patch_len |
| 74 | + print( |
| 75 | + f"[debug] mineru2_tokenizer anyres img_size=({img.image_w},{img.image_h}) " |
| 76 | + f"crop={crop_size} grid=({grid_w},{grid_h}) views={views}" |
| 77 | + f" patch_len={patch_len} token_num={token_num}" |
| 78 | + ) |
| 79 | + return token_num |
96 | 80 |
|
97 | 81 | def get_audio_token_length(self, audio: AudioItem): |
98 | 82 | raise NotImplementedError |
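For context on the simplified `get_image_token_length` above: with anyres tiling the image contributes one token block per crop-size tile plus one base view, so `token_num = (grid_w * grid_h + 1) * patch_len`. The sketch below illustrates the usual LLaVA-style grid selection; it is not the `get_anyres_image_grid_shape` shipped in `image_processing_mineru2.py`, and the pinpoint list, crop size, and patch length in the example are hypothetical.

```python
# Illustrative sketch only (LLaVA-style anyres grid selection); the real helper
# lives in lightllm.models.mineru2_qwen.image_processing_mineru2.
def anyres_grid_shape_sketch(image_size, grid_pinpoints, crop_size):
    """Pick the pinpoint resolution that fits the image best, return (grid_w, grid_h)."""
    img_w, img_h = image_size
    best, best_fit = None, None
    for pin_w, pin_h in grid_pinpoints:
        scale = min(pin_w / img_w, pin_h / img_h)      # shrink to fit inside the pinpoint
        eff_w, eff_h = int(img_w * scale), int(img_h * scale)
        eff_area = min(eff_w * eff_h, img_w * img_h)   # usable resolution after scaling
        wasted = pin_w * pin_h - eff_area              # padding area that would be thrown away
        fit = (eff_area, -wasted)                      # maximize usable area, then minimize waste
        if best_fit is None or fit > best_fit:
            best, best_fit = (pin_w, pin_h), fit
    return best[0] // crop_size, best[1] // crop_size

# Worked example with hypothetical numbers: a 1000x1400 page, 336px crops,
# patch_len = (336 // 14) ** 2 = 576 tokens per view.
grid_w, grid_h = anyres_grid_shape_sketch((1000, 1400), [(336, 672), (672, 672), (672, 1008)], 336)
views = grid_w * grid_h + 1   # tiled views plus the base (downscaled) view -> 2*3 + 1 = 7
token_num = views * 576       # 7 * 576 = 4032 image tokens
print(grid_w, grid_h, views, token_num)
```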
@@ -132,15 +116,11 @@ def encode(self, prompt, multimodal_params: MultimodalParams = None, add_special |
132 | 116 | if image_id < len(multimodal_params.images): |
133 | 117 | print(f"[warning] mineru2_tokenizer unused images: {len(multimodal_params.images) - image_id}") |
134 | 118 |
|
135 | | - print(f"[debug] mineru2_tokenizer input_ids={input_ids}") |
136 | 119 | return input_ids |
137 | 120 |
|
138 | 121 |
|
139 | 122 | @ModelRegistry("mineru2_qwen", is_multimodal=True) |
140 | 123 | class Mineru2QwenForCausalLM(Qwen2TpPartModel): |
141 | | - # weight class |
142 | | - # pre_and_post_weight_class = InternVLLlamaPreAndPostLayerWeight |
143 | | - |
144 | 124 | # infer class |
145 | 125 | pre_layer_infer_class = LlamaMultimodalPreLayerInfer |
146 | 126 |
|
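For reference, the `pre_layer_infer_class` override above follows lightllm's class-attribute pattern: `Mineru2QwenForCausalLM` inherits everything from `Qwen2TpPartModel` and only swaps the pre-layer inference class, so image embeddings can be spliced into the token embeddings before the transformer layers run. A minimal sketch of that pattern, with generic names rather than lightllm's actual classes, looks like this:

```python
# Generic sketch of the class-attribute override pattern (hypothetical names,
# not lightllm's actual base classes).
class TextPreLayerInfer:
    def infer(self, input_ids, extra):
        return [f"embed({t})" for t in input_ids]  # plain token-embedding lookup

class MultimodalPreLayerInfer(TextPreLayerInfer):
    def infer(self, input_ids, extra):
        embeds = super().infer(input_ids, extra)
        # overwrite placeholder positions with pre-computed image embeddings
        for pos, img_embed in extra.get("image_embeds", {}).items():
            embeds[pos] = img_embed
        return embeds

class BaseTpModel:
    pre_layer_infer_class = TextPreLayerInfer  # default: text-only path

    def __init__(self):
        # the base model instantiates whichever class the subclass declares
        self.pre_infer = self.pre_layer_infer_class()

class MultimodalModel(BaseTpModel):
    # a one-line override is enough to route prompts through the multimodal path
    pre_layer_infer_class = MultimodalPreLayerInfer

m = MultimodalModel()
print(m.pre_infer.infer(["<img>", "hello"], {"image_embeds": {0: "img_embed_0"}}))
```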