Skip to content

Commit 16e5d6c

Browse files
committed
fix
1 parent b5dabf5 commit 16e5d6c

File tree

3 files changed

+123
-49
lines changed

3 files changed

+123
-49
lines changed

lightllm/models/mineru2_qwen/mineru2_visual.py

Lines changed: 105 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -43,11 +43,18 @@ def _resolve_path(name):
4343
elif "siglip" in vision_tower.lower():
4444
vt_path = _resolve_path(vision_tower)
4545
print(f"[debug] load siglip from {vt_path}")
46-
model = SiglipVisionModel.from_pretrained(vt_path)
47-
if hasattr(model, "config") and hasattr(model.config, "num_hidden_layers"):
48-
model.config.num_hidden_layers = max(0, model.config.num_hidden_layers - 1)
49-
if hasattr(model, "config") and hasattr(model.config, "vision_use_head"):
50-
model.config.vision_use_head = False
46+
# 方案A:使用配置减层并按该配置实例化模型,再加载权重(忽略不匹配尺寸)
47+
cfg = SiglipVisionConfig.from_pretrained(vt_path)
48+
old_layers = getattr(cfg, "num_hidden_layers", None)
49+
cfg.num_hidden_layers = max(0, cfg.num_hidden_layers - 1)
50+
cfg.vision_use_head = False
51+
model = SiglipVisionModel.from_pretrained(vt_path, config=cfg, ignore_mismatched_sizes=True)
52+
try:
53+
actual_layers = len(model.vision_model.encoder.layers) # type: ignore[attr-defined]
54+
except Exception:
55+
actual_layers = None
56+
new_cfg_layers = getattr(getattr(model, "config", None), "num_hidden_layers", None)
57+
print(f"[debug] siglip_layers planA old={old_layers} new_cfg={new_cfg_layers} actual_module={actual_layers}")
5158
return model
5259
else:
5360
raise ValueError(f"Unknown vision tower: {vision_tower}")
@@ -211,11 +218,46 @@ def cuda(self):
211218
return self
212219

213220
def forward(self, x) -> torch.Tensor:
221+
# 运行时形状与精度/设备检查
222+
try:
223+
print(f"[debug] mineru2_visual.forward x.shape={tuple(x.shape)} dtype={x.dtype} device={x.device}")
224+
except Exception:
225+
pass
214226
vision_out = self.vision_tower(x, output_hidden_states=True)
215-
hidden = vision_out.hidden_states[-1]
216-
# 对patch维度做平均池化,得到每视图一个向量
217-
pooled_per_view = hidden.mean(dim=1)
218-
proj = self.projector(pooled_per_view)
227+
hiddens = vision_out.hidden_states
228+
# hidden_states 数量与 config 层数的关系(一般为 num_layers + 1)
229+
try:
230+
cfg_layers = getattr(getattr(self.vision_tower, "config", None), "num_hidden_layers", None)
231+
eff_layers = len(hiddens) - 1 if isinstance(hiddens, (list, tuple)) else None
232+
print(
233+
f"[debug] mineru2_visual.hidden_states len={len(hiddens)}"
234+
f" cfg_layers={cfg_layers} eff_layers={eff_layers}"
235+
)
236+
except Exception:
237+
pass
238+
# 对齐ref的“减一层”语义:优先使用倒数第二层;若不可用则回退最后一层
239+
try:
240+
chosen_idx = -2 if isinstance(hiddens, (list, tuple)) and len(hiddens) >= 2 else -1
241+
feat = hiddens[chosen_idx]
242+
print(f"[debug] mineru2_visual.select_layer idx={chosen_idx} feat.shape={tuple(feat.shape)}")
243+
except Exception:
244+
feat = hiddens[-2] if isinstance(hiddens, (list, tuple)) and len(hiddens) >= 2 else hiddens[-1]
245+
# 切回 patch 序列特征:去除 CLS(若存在),按序列过 projector,再展平为 (views*patch, hidden)
246+
patch_side = self.vision_tower.config.image_size // self.vision_tower.config.patch_size
247+
patch_len = patch_side * patch_side
248+
if feat.shape[1] == patch_len + 1:
249+
feat = feat[:, 1:, :]
250+
print(f"[debug] mineru2_visual.drop_cls patch_len={patch_len} feat_no_cls.shape={tuple(feat.shape)}")
251+
proj_seq = self.projector(feat)
252+
try:
253+
print(f"[debug] mineru2_visual.projector_seq_out shape={tuple(proj_seq.shape)} (views, patch, hidden)")
254+
except Exception:
255+
pass
256+
proj = proj_seq.reshape(-1, proj_seq.shape[-1])
257+
try:
258+
print(f"[debug] mineru2_visual.projector_flat_out shape={tuple(proj.shape)} (views*patch, hidden)")
259+
except Exception:
260+
pass
219261
return proj
220262

221263
def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List[List[int]]]:
@@ -226,6 +268,10 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
226268
valid_ids: List[List[int]] = []
227269
image_aspect_ratio = getattr(self.image_processor, "image_aspect_ratio", None)
228270
image_grid_pinpoints = getattr(self.image_processor, "image_grid_pinpoints", None)
271+
# 每视图 patch_len(例如 384/14=27, 27^2=729)
272+
patch_side = self.vision_tower.config.image_size // self.vision_tower.config.patch_size
273+
patch_len = patch_side * patch_side
274+
print(f"[debug] mineru2_visual.patch_len={patch_len} (side={patch_side})")
229275
for i, img in enumerate(images):
230276
if isinstance(img, ImageItem):
231277
uuids.append(img.uuid)
@@ -247,60 +293,77 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
247293
elif t.ndim == 3:
248294
print(f"[debug] mineru2_visual unsqueeze t.ndim: {t.ndim}, t.shape: {t.shape}")
249295
t = t.unsqueeze(0)
250-
# 在修改前记录 manager 分配的 token_num
296+
# 在修改前记录 manager 分配的 token_num(可能是视图数或视图*patch数)
251297
try:
252298
print(f"[debug] mineru2_visual manager_token_num_before={img.token_num} uuid={img.uuid}")
253299
except Exception:
254300
pass
255-
# 对齐实际 K 与期望 token_num
256-
expected_k = img.token_num if getattr(img, "token_num", None) is not None else None
301+
# 对齐实际视图数 K 与期望 token(可能是 K 或 K*patch_len)
302+
expected_token = img.token_num if getattr(img, "token_num", None) is not None else None
257303
actual_k = t.shape[0]
258-
if expected_k is None or expected_k <= 0:
259-
expected_k = actual_k
260-
print(f"[debug] mineru2_visual expected_k_from_actual uuid={img.uuid} expected_k={expected_k}")
261-
if actual_k != expected_k:
262-
if actual_k % expected_k == 0:
263-
factor = actual_k // expected_k
304+
if expected_token is None or expected_token <= 0:
305+
expected_views = actual_k
306+
print(
307+
f"[debug] mineru2_visual expected_views_from_actual uuid={img.uuid}"
308+
f" expected_views={expected_views}"
309+
)
310+
else:
311+
if expected_token >= patch_len and expected_token % patch_len == 0:
312+
expected_views = expected_token // patch_len
313+
print(
314+
f"[debug] mineru2_visual expected_views_from_tokens uuid={img.uuid}"
315+
f" expected_token={expected_token} patch_len={patch_len} expected_views={expected_views}"
316+
)
317+
else:
318+
expected_views = expected_token
319+
print(
320+
f"[debug] mineru2_visual expected_views_interpret_as_views uuid={img.uuid}"
321+
f" expected_views={expected_views}"
322+
)
323+
if actual_k != expected_views:
324+
if actual_k % expected_views == 0:
325+
factor = actual_k // expected_views
264326
print(
265327
f"[debug] mineru2_visual down_aggregate uuid={img.uuid}"
266-
f" actual_k={actual_k} expected_k={expected_k} factor={factor}"
328+
f" actual_k={actual_k} expected_views={expected_views} factor={factor}"
267329
)
268-
t = t.view(expected_k, factor, t.shape[1], t.shape[2], t.shape[3]).mean(dim=1)
269-
elif expected_k % actual_k == 0:
270-
factor = expected_k // actual_k
330+
t = t.view(expected_views, factor, t.shape[1], t.shape[2], t.shape[3]).mean(dim=1)
331+
elif expected_views % actual_k == 0:
332+
factor = expected_views // actual_k
271333
print(
272334
f"[debug] mineru2_visual up_repeat uuid={img.uuid}"
273-
f" actual_k={actual_k} expected_k={expected_k} factor={factor}"
335+
f" actual_k={actual_k} expected_views={expected_views} factor={factor}"
274336
)
275337
t = t.repeat_interleave(repeats=factor, dim=0)
276338
else:
277-
k = min(actual_k, expected_k)
339+
k = min(actual_k, expected_views)
278340
print(
279341
f"[debug] mineru2_visual fallback_slice uuid={img.uuid}"
280-
f" actual_k={actual_k} expected_k={expected_k} k={k}"
342+
f" actual_k={actual_k} expected_views={expected_views} k={k}"
281343
)
282-
if actual_k >= expected_k:
283-
t = t[:expected_k]
344+
if actual_k >= expected_views:
345+
t = t[:expected_views]
284346
else:
285347
# pad by repeating last
286-
pad = t[-1:].repeat(expected_k - actual_k, 1, 1, 1)
348+
pad = t[-1:].repeat(expected_views - actual_k, 1, 1, 1)
287349
t = torch.cat([t, pad], dim=0)
288350
img_tensors.append(t)
289-
# 最终 K
290-
final_k = t.shape[0]
291-
img.token_num = final_k
351+
# 最终视图数 K
352+
final_views = t.shape[0]
353+
# 对齐 patch 序列后的总 token 数
354+
img.token_num = final_views * patch_len
292355
print(
293-
f"[debug] mineru2_visual actual_k={actual_k} "
294-
f"expected_k={expected_k} final_k={final_k} uuid={img.uuid}"
356+
f"[debug] mineru2_visual actual_k={actual_k} expected_views={expected_views}"
357+
f" final_views={final_views} final_token_num={img.token_num} uuid={img.uuid}"
295358
)
296359
else:
297360
raise Exception("Unsupport input types: {} for {}".format(type(img), img))
298361

299-
cur_num = (
300-
img_tensors[-1].shape[0]
301-
if isinstance(img_tensors[-1], torch.Tensor) and img_tensors[-1].dim() == 4
302-
else 1
303-
)
362+
# 本图对应的 token 数(视图 * patch_len)
363+
if isinstance(img_tensors[-1], torch.Tensor) and img_tensors[-1].dim() == 4:
364+
cur_num = img_tensors[-1].shape[0] * patch_len
365+
else:
366+
cur_num = patch_len
304367
valid_ids.append([valid_id, valid_id + cur_num])
305368
print(
306369
f"[debug] mineru2_visual valid_ids_append uuid={img.uuid}"
@@ -314,6 +377,9 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
314377
img = torch.cat(img_tensors, dim=0)
315378
img = img.cuda()
316379
all_img_embeds = self.forward(img)
317-
print(f"[debug] mineru2_visual all_img_embeds.shape={tuple(all_img_embeds.shape)} " f"total_K={img.shape[0]}")
380+
print(
381+
f"[debug] mineru2_visual all_img_embeds.shape={tuple(all_img_embeds.shape)}"
382+
f" total_tokens={img.shape[0] * patch_len}"
383+
)
318384

319385
return all_img_embeds, uuids, valid_ids

lightllm/models/mineru2_qwen/model.py

Lines changed: 17 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -63,28 +63,36 @@ def init_audioitem_extral_params(
6363
raise NotImplementedError
6464

6565
def get_image_token_length(self, img: ImageItem):
66-
# 对于 Mineru2 集成,视觉塔返回的是每个裁剪的一条 pooled 向量。
67-
# token 数应与裁剪数量一致:anyres 模式为 1(原图)+ 网格裁剪数,且每块含双视图(factor=2)。
66+
# 切回 patch 序列:总token数 = 视图数 × 每视图patch数
67+
# 每视图patch数 = self.image_length = (image_size // patch_size) ** 2
6868
aspect = getattr(self.image_processor, "image_aspect_ratio", None)
69+
patch_len = int(self.image_length)
6970
try:
7071
if aspect and (aspect == "anyres" or (isinstance(aspect, str) and "anyres_max" in aspect)):
7172
crop_size = self.image_processor.crop_size["height"]
7273
grid_w, grid_h = get_anyres_image_grid_shape(
7374
(img.image_w, img.image_h), self.image_processor.image_grid_pinpoints, crop_size
7475
)
75-
base = int(grid_w * grid_h + 1)
76-
token_num = base
76+
views = int(grid_w * grid_h + 1)
77+
token_num = views * patch_len
7778
print(
7879
f"[debug] mineru2_tokenizer anyres img_size=({img.image_w},{img.image_h}) "
79-
f"crop={crop_size} grid=({grid_w},{grid_h}) base={base} token_num={token_num}"
80+
f"crop={crop_size} grid=({grid_w},{grid_h}) views={views}"
81+
f" patch_len={patch_len} token_num={token_num}"
8082
)
8183
return token_num
8284
else:
83-
print(f"[debug] mineru2_tokenizer non-anyres token_num=1 aspect={aspect}")
84-
return 1
85+
token_num = patch_len
86+
print(
87+
f"[debug] mineru2_tokenizer non-anyres views=1 patch_len={patch_len}"
88+
f" token_num={token_num} aspect={aspect}"
89+
)
90+
return token_num
8591
except Exception as e:
86-
print(f"[debug] mineru2_tokenizer token_num_fallback due to {e}, return 1")
87-
return 1
92+
# 兜底:按单视图返回
93+
token_num = patch_len
94+
print(f"[debug] mineru2_tokenizer token_num_fallback due to {e}, return {token_num}")
95+
return token_num
8896

8997
def get_audio_token_length(self, audio: AudioItem):
9098
raise NotImplementedError

mm_test.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ def run(query, uris):
3939
You are a helpful assistant.<|im_end|>
4040
<|im_start|>user
4141
<image>
42-
这张图片中的文字是什么<|im_end|>
42+
这张图片中的文字是什么,告诉我<|im_end|>
4343
<|im_start|>assistant
4444
"""
4545

0 commit comments

Comments
 (0)