Skip to content

Commit df8ec7f

Browse files
committed
fix
1 parent 9e270be commit df8ec7f

File tree

3 files changed

+74
-5
lines changed

3 files changed

+74
-5
lines changed

lightllm/models/mineru2_qwen/mineru2_visual.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,52 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
125125
elif t.ndim == 3:
126126
print(f"[debug] mineru2_visual unsqueeze t.ndim: {t.ndim}, t.shape: {t.shape}")
127127
t = t.unsqueeze(0)
128+
# 在修改前记录 manager 分配的 token_num
129+
try:
130+
print(f"[debug] mineru2_visual manager_token_num_before={img.token_num} uuid={img.uuid}")
131+
except Exception:
132+
pass
133+
# 对齐实际 K 与期望 token_num
134+
expected_k = img.token_num if getattr(img, "token_num", None) is not None else None
135+
actual_k = t.shape[0]
136+
if expected_k is None or expected_k <= 0:
137+
expected_k = actual_k
138+
print(f"[debug] mineru2_visual expected_k_from_actual uuid={img.uuid} expected_k={expected_k}")
139+
if actual_k != expected_k:
140+
if actual_k % expected_k == 0:
141+
factor = actual_k // expected_k
142+
print(
143+
f"[debug] mineru2_visual down_aggregate uuid={img.uuid}"
144+
f" actual_k={actual_k} expected_k={expected_k} factor={factor}"
145+
)
146+
t = t.view(expected_k, factor, t.shape[1], t.shape[2], t.shape[3]).mean(dim=1)
147+
elif expected_k % actual_k == 0:
148+
factor = expected_k // actual_k
149+
print(
150+
f"[debug] mineru2_visual up_repeat uuid={img.uuid}"
151+
f" actual_k={actual_k} expected_k={expected_k} factor={factor}"
152+
)
153+
t = t.repeat_interleave(repeats=factor, dim=0)
154+
else:
155+
k = min(actual_k, expected_k)
156+
print(
157+
f"[debug] mineru2_visual fallback_slice uuid={img.uuid}"
158+
f" actual_k={actual_k} expected_k={expected_k} k={k}"
159+
)
160+
if actual_k >= expected_k:
161+
t = t[:expected_k]
162+
else:
163+
# pad by repeating last
164+
pad = t[-1:].repeat(expected_k - actual_k, 1, 1, 1)
165+
t = torch.cat([t, pad], dim=0)
128166
img_tensors.append(t)
129-
img.token_num = t.shape[0]
167+
# 最终 K
168+
final_k = t.shape[0]
169+
img.token_num = final_k
170+
print(
171+
f"[debug] mineru2_visual actual_k={actual_k} "
172+
f"expected_k={expected_k} final_k={final_k} uuid={img.uuid}"
173+
)
130174
else:
131175
raise Exception("Unsupport input types: {} for {}".format(type(img), img))
132176

@@ -136,6 +180,10 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
136180
else 1
137181
)
138182
valid_ids.append([valid_id, valid_id + cur_num])
183+
print(
184+
f"[debug] mineru2_visual valid_ids_append uuid={img.uuid}"
185+
f" range=({valid_id},{valid_id + cur_num}) cur_num={cur_num}"
186+
)
139187
valid_id += cur_num
140188

141189
if len(img_tensors) <= 0:
@@ -144,5 +192,6 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
144192
img = torch.cat(img_tensors, dim=0)
145193
img = img.cuda()
146194
all_img_embeds = self.forward(img)
195+
print(f"[debug] mineru2_visual all_img_embeds.shape={tuple(all_img_embeds.shape)} " f"total_K={img.shape[0]}")
147196

148197
return all_img_embeds, uuids, valid_ids

lightllm/models/mineru2_qwen/model.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ def __init__(self, tokenizer, model_cfg):
4747
image_size = model_cfg.get("mm_image_size", image_size)
4848

4949
self.image_processor = Mineru2ImageProcessor(
50-
image_aspect_ratio=getattr(model_cfg, "image_aspect_ratio", None),
51-
image_grid_pinpoints=getattr(model_cfg, "image_grid_pinpoints", None),
50+
image_aspect_ratio=(model_cfg.get("image_aspect_ratio", None)),
51+
image_grid_pinpoints=(model_cfg.get("image_grid_pinpoints", None)),
5252
)
5353
self.image_length = (image_size // patch_size) ** 2
5454

@@ -63,7 +63,27 @@ def init_audioitem_extral_params(
6363
raise NotImplementedError
6464

6565
def get_image_token_length(self, img: ImageItem):
66-
return self.image_length
66+
# 对于 Mineru2 集成,视觉塔返回的是每个裁剪的一条 pooled 向量。
67+
# token 数应与裁剪数量一致:anyres 模式为 1(原图)+ 网格裁剪数,否则为 1。
68+
aspect = getattr(self.image_processor, "image_aspect_ratio", None)
69+
try:
70+
if aspect and (aspect == "anyres" or (isinstance(aspect, str) and "anyres_max" in aspect)):
71+
crop_size = self.image_processor.crop_size["height"]
72+
grid_w, grid_h = get_anyres_image_grid_shape(
73+
(img.image_w, img.image_h), self.image_processor.image_grid_pinpoints, crop_size
74+
)
75+
token_num = int(grid_w * grid_h + 1)
76+
print(
77+
f"[debug] mineru2_tokenizer anyres img_size=({img.image_w},{img.image_h}) "
78+
f"crop={crop_size} grid=({grid_w},{grid_h}) token_num={token_num}"
79+
)
80+
return token_num
81+
else:
82+
print(f"[debug] mineru2_tokenizer non-anyres token_num=1 aspect={aspect}")
83+
return 1
84+
except Exception as e:
85+
print(f"[debug] mineru2_tokenizer token_num_fallback due to {e}, return 1")
86+
return 1
6787

6888
def get_audio_token_length(self, audio: AudioItem):
6989
raise NotImplementedError

mm_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def run(query, uris):
3838
You are a helpful assistant.<|im_end|>
3939
<|im_start|>user
4040
<img></img>
41-
这是什么?<|im_end|>
41+
这张图片中的文字是什么,告诉我<|im_end|>
4242
<|im_start|>assistant
4343
"""
4444

0 commit comments

Comments
 (0)