
Commit bbbd6df

committed
fix
1 parent d05b160 commit bbbd6df

5 files changed: +96 -12 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -6,3 +6,4 @@ dist
 .idea
 .vscode
 tmp/
+ref/

lightllm/models/mineru2_qwen/image_processing_mineru2.py

Lines changed: 2 additions & 2 deletions
@@ -93,7 +93,7 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
     return width // patch_size, height // patch_size
 
 
-def process_anyres_image(image, processor, grid_pinpoints):
+def process_anyres_image(image, processor: "Mineru2ImageProcessor", grid_pinpoints):
     if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
         patch_size = processor.crop_size["height"]
         assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
@@ -120,7 +120,7 @@ def process_anyres_image(image, processor, grid_pinpoints):
 
     image_patches = [image_original_resize] + patches
     image_patches = [
-        processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches
+        processor.preprocess([image_patch], return_tensors="pt")["pixel_values"][0] for image_patch in image_patches
    ]
     return torch.stack(image_patches, dim=0)
 

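Why the second hunk wraps the patch in a list: HuggingFace-style image processors treat their input as a batch, so passing [image_patch] keeps preprocess on its batched path, and the trailing [0] recovers the single tensor. A minimal calling sketch, assuming a default-constructed Mineru2ImageProcessor and a local file named page.png (both illustrative, not from this commit):

    from PIL import Image
    from lightllm.models.mineru2_qwen.image_processing_mineru2 import (
        Mineru2ImageProcessor,
        process_anyres_image,
    )

    processor = Mineru2ImageProcessor()  # crop_size["height"] supplies the patch size
    image = Image.open("page.png").convert("RGB")

    # grid_pinpoints may be a "(1x1),...,(4x4)" range string or an explicit list of resolutions
    patches = process_anyres_image(image, processor, "(1x1),...,(4x4)")
    print(patches.shape)  # [1 + num_tiles, 3, H, W]: the base resize plus the anyres tiles
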
lightllm/models/mineru2_qwen/mineru2_visual.py

Lines changed: 31 additions & 6 deletions
@@ -6,6 +6,7 @@
 
 import torch
 import torch.nn as nn
+import numpy as np
 from transformers import (
     CLIPVisionModel,
     CLIPVisionConfig,
@@ -14,7 +15,7 @@
 )
 
 from .configuration_mineru2 import Mineru2QwenConfig
-from .image_processing_mineru2 import Mineru2ImageProcessor
+from .image_processing_mineru2 import Mineru2ImageProcessor, expand2square, process_anyres_image
 
 from lightllm.server.multimodal_params import ImageItem
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
@@ -80,7 +81,11 @@ def load_model(self, weight_dir):
 
         self.vision_tower = build_vision_tower(vision_config)
         self.projector = build_vision_projector(vision_config)
-        self.image_processor = Mineru2ImageProcessor()
+        # Pass the config parameters down to the processor
+        self.image_processor = Mineru2ImageProcessor(
+            image_aspect_ratio=getattr(vision_config, "image_aspect_ratio", None),
+            image_grid_pinpoints=getattr(vision_config, "image_grid_pinpoints", None),
+        )
 
     def cuda(self):
         self.vision_tower = self.vision_tower.cuda()
@@ -97,24 +102,44 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
         uuids: List[str] = []
         valid_id = 0
         valid_ids: List[List[int]] = []
-
+        image_aspect_ratio = getattr(self.image_processor, "image_aspect_ratio", None)
+        image_grid_pinpoints = getattr(self.image_processor, "image_grid_pinpoints", None)
         for i, img in enumerate(images):
             if isinstance(img, ImageItem):
                 uuids.append(img.uuid)
                 image_data = read_shm(get_shm_name_data(img.uuid))
                 image_data = Image.open(BytesIO(image_data)).convert("RGB")
-                t = self.image_processor.preprocess(image_data, return_tensors="pt")["pixel_values"]
+                if image_aspect_ratio == "pad":
+                    image_proc = expand2square(image_data, tuple(int(x * 255) for x in self.image_processor.image_mean))
+                    t = self.image_processor.preprocess(image_proc, return_tensors="pt")["pixel_values"]
+                elif image_aspect_ratio and (image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio):
+                    t = process_anyres_image(image_data, self.image_processor, image_grid_pinpoints)
+                    if isinstance(t, np.ndarray):
+                        t = torch.from_numpy(t)
+                else:
+                    t = self.image_processor.preprocess(image_data, return_tensors="pt")["pixel_values"]
+
+                if t.ndim == 5:
+                    print(f"[debug] mineru2_visual reshape t.ndim: {t.ndim}, t.shape: {t.shape}")
+                    t = t.view(-1, t.shape[-3], t.shape[-2], t.shape[-1])
+                elif t.ndim == 3:  # [3, H, W]
+                    print(f"[debug] mineru2_visual unsqueeze t.ndim: {t.ndim}, t.shape: {t.shape}")
+                    t = t.unsqueeze(0)
                 img_tensors.append(t)
             else:
                 raise Exception("Unsupport input types: {} for {}".format(type(img), img))
 
-            cur_num = img_tensors[-1].shape[0]
+            cur_num = (
+                img_tensors[-1].shape[0]
+                if isinstance(img_tensors[-1], torch.Tensor) and img_tensors[-1].dim() == 4
+                else 1
+            )
             valid_ids.append([valid_id, valid_id + cur_num])
             valid_id += cur_num
 
         if len(img_tensors) <= 0:
             return None, [], []
-
+        # Ensure every tensor is 4-D before concatenating
         img = torch.cat(img_tensors, dim=0)
         img = img.cuda()
         all_img_embeds = self.forward(img)
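
The encode changes above are mostly shape normalization: process_anyres_image returns a stacked patch tensor while the plain path returns a batched one, so everything is coerced to a 4-D [N, 3, H, W] batch before torch.cat. A self-contained sketch of that normalization (shapes are illustrative):

    import torch

    def to_4d(t: torch.Tensor) -> torch.Tensor:
        if t.ndim == 5:  # [B, P, 3, H, W]: batched anyres patches -> flatten B and P
            return t.view(-1, t.shape[-3], t.shape[-2], t.shape[-1])
        if t.ndim == 3:  # [3, H, W]: single image -> add a batch dim
            return t.unsqueeze(0)
        return t  # already [N, 3, H, W]

    parts = [to_4d(torch.rand(1, 5, 3, 384, 384)), to_4d(torch.rand(3, 384, 384))]
    print(torch.cat(parts, dim=0).shape)  # torch.Size([6, 3, 384, 384])

The "pad" branch leans on expand2square, which in the LLaVA lineage letterboxes an image to a square filled with the processor's mean color; a sketch under that assumption (not copied from this repo):

    from PIL import Image

    def expand2square(pil_img: Image.Image, background_color) -> Image.Image:
        width, height = pil_img.size
        if width == height:
            return pil_img
        side = max(width, height)
        result = Image.new(pil_img.mode, (side, side), background_color)
        result.paste(pil_img, ((side - width) // 2, (side - height) // 2))  # center the original
        return result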
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+{
+  "_name_or_path": "",
+  "architectures": [
+    "Mineru2QwenForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "image_aspect_ratio": "square_anyres_max_9",
+  "image_crop_resolution": "None",
+  "image_grid_pinpoints": "(1x1),...,(4x4)",
+  "image_split_resolution": "None",
+  "image_token_index": 151646,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 24,
+  "mm_hidden_size": 1152,
+  "mm_newline_position": "one_token",
+  "mm_patch_merge_type": "spatial_unpad",
+  "mm_projector_lr": 1e-05,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_resampler_type": "None",
+  "mm_spatial_pool_mode": "bilinear",
+  "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+  "mm_use_box_start_end": true,
+  "mm_use_im_patch_token": false,
+  "mm_use_im_start_end": false,
+  "mm_vision_select_feature": "full",
+  "mm_vision_select_layer": -2,
+  "mm_vision_tower": "google/siglip-so400m-patch14-384",
+  "mm_vision_tower_lr": 1e-06,
+  "model_type": "mineru2_qwen",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "pos_skipping_range": 4096,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": "None",
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": true,
+  "tokenizer_model_max_length": 16384,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.49.0",
+  "tune_entire_model": true,
+  "tune_mm_mlp_adapter": false,
+  "unfreeze_mm_vision_tower": true,
+  "use_cache": false,
+  "use_mm_proj": true,
+  "use_pos_skipping": false,
+  "use_sliding_window": false,
+  "vision_tower_pretrained": "None",
+  "vocab_size": 151654,
+  "_commit_hash": "None"
+}
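
Note the "image_aspect_ratio": "square_anyres_max_9" and "image_grid_pinpoints": "(1x1),...,(4x4)" pair: the pinpoints string is the LLaVA-style range shorthand, expanding to every grid between the two endpoints scaled by the patch size. A sketch of that expansion under the common LLaVA interpretation (this helper is an assumption, not part of the commit):

    import re
    from itertools import product

    def expand_pinpoints(spec: str, patch_size: int):
        # "(1x1),...,(4x4)" -> all grids from 1x1 through 4x4, in pixels
        (r1, c1), (r2, c2) = [tuple(map(int, m)) for m in re.findall(r"\((\d+)x(\d+)\)", spec)]
        return [(r * patch_size, c * patch_size) for r, c in product(range(r1, r2 + 1), range(c1, c2 + 1))]

    print(expand_pinpoints("(1x1),...,(4x4)", 384)[:3])  # [(384, 384), (384, 768), (384, 1152)]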

mm_test.py

Lines changed: 2 additions & 4 deletions
@@ -38,13 +38,11 @@ def run(query, uris):
 You are a helpful assistant.<|im_end|>
 <|im_start|>user
 <img></img>
-Help me extract the text in this image and tell me what it says.<|im_end|>
+What is this?<|im_end|>
 <|im_start|>assistant
 """
 
-response = run(
-    uris=["https://pigkiller-011955-1319328397.cos.ap-beijing.myqcloud.com/img/202509081804761.png"], query=query
-)
+response = run(uris=["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"], query=query)
 
 if response.status_code == 200:
     print(f"Result: {response.json()}")

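For context, run presumably posts the chat-formatted prompt and the image URI to a running lightllm server. A hedged sketch of such a helper, assuming lightllm's /generate endpoint and its multimodal_params request shape (the URL and sampling parameters are assumptions, not part of this diff):

    import requests

    def run(query: str, uris: list) -> requests.Response:
        return requests.post(
            "http://localhost:8000/generate",  # hypothetical server address
            json={
                "inputs": query,
                "parameters": {"max_new_tokens": 128},
                # one dict per image; URLs are fetched server-side
                "multimodal_params": {"images": [{"type": "url", "data": u} for u in uris]},
            },
        )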