@@ -43,11 +43,18 @@ def _resolve_path(name):
4343 elif "siglip" in vision_tower .lower ():
4444 vt_path = _resolve_path (vision_tower )
4545 print (f"[debug] load siglip from { vt_path } " )
46- model = SiglipVisionModel .from_pretrained (vt_path )
47- if hasattr (model , "config" ) and hasattr (model .config , "num_hidden_layers" ):
48- model .config .num_hidden_layers = max (0 , model .config .num_hidden_layers - 1 )
49- if hasattr (model , "config" ) and hasattr (model .config , "vision_use_head" ):
50- model .config .vision_use_head = False
46+ # 方案A:使用配置减层并按该配置实例化模型,再加载权重(忽略不匹配尺寸)
47+ cfg = SiglipVisionConfig .from_pretrained (vt_path )
48+ old_layers = getattr (cfg , "num_hidden_layers" , None )
49+ cfg .num_hidden_layers = max (0 , cfg .num_hidden_layers - 1 )
50+ cfg .vision_use_head = False
51+ model = SiglipVisionModel .from_pretrained (vt_path , config = cfg , ignore_mismatched_sizes = True )
52+ try :
53+ actual_layers = len (model .vision_model .encoder .layers ) # type: ignore[attr-defined]
54+ except Exception :
55+ actual_layers = None
56+ new_cfg_layers = getattr (getattr (model , "config" , None ), "num_hidden_layers" , None )
57+ print (f"[debug] siglip_layers planA old={ old_layers } new_cfg={ new_cfg_layers } actual_module={ actual_layers } " )
5158 return model
5259 else :
5360 raise ValueError (f"Unknown vision tower: { vision_tower } " )
@@ -211,11 +218,46 @@ def cuda(self):
211218 return self
212219
def forward(self, x) -> torch.Tensor:
    """Encode a batch of image views into flattened patch embeddings.

    Args:
        x: image batch tensor; assumed shape (views, channels, H, W) —
           TODO confirm against the caller in ``encode``.

    Returns:
        Tensor of shape (views * patch_len, hidden): the vision tower's
        second-to-last hidden state (the "drop one layer" semantics of the
        reference implementation), with a leading CLS token removed if
        present, projected per patch and flattened over the view dimension.
    """
    # Best-effort runtime shape / dtype / device debug output.
    try:
        print(f"[debug] mineru2_visual.forward x.shape={tuple(x.shape)} dtype={x.dtype} device={x.device}")
    except Exception:
        pass
    vision_out = self.vision_tower(x, output_hidden_states=True)
    hiddens = vision_out.hidden_states
    # Relation between hidden_states count and configured layer count
    # (normally num_hidden_layers + 1, embeddings included).
    try:
        cfg_layers = getattr(getattr(self.vision_tower, "config", None), "num_hidden_layers", None)
        eff_layers = len(hiddens) - 1 if isinstance(hiddens, (list, tuple)) else None
        print(
            f"[debug] mineru2_visual.hidden_states len={len(hiddens)}"
            f" cfg_layers={cfg_layers} eff_layers={eff_layers}"
        )
    except Exception:
        pass
    # Align with the reference "minus one layer" semantics: prefer the
    # second-to-last hidden state, fall back to the last one. The selection
    # is computed directly — the original wrapped it in try/except whose
    # except clause recomputed the identical expression, which could never
    # recover from a real failure; only the debug print stays best-effort.
    chosen_idx = -2 if isinstance(hiddens, (list, tuple)) and len(hiddens) >= 2 else -1
    feat = hiddens[chosen_idx]
    try:
        print(f"[debug] mineru2_visual.select_layer idx={chosen_idx} feat.shape={tuple(feat.shape)}")
    except Exception:
        pass
    # Back to patch-sequence features: drop the CLS token if present, run
    # the projector over the sequence, then flatten to (views*patch, hidden).
    patch_side = self.vision_tower.config.image_size // self.vision_tower.config.patch_size
    patch_len = patch_side * patch_side
    if feat.shape[1] == patch_len + 1:
        feat = feat[:, 1:, :]
        print(f"[debug] mineru2_visual.drop_cls patch_len={patch_len} feat_no_cls.shape={tuple(feat.shape)}")
    proj_seq = self.projector(feat)
    try:
        print(f"[debug] mineru2_visual.projector_seq_out shape={tuple(proj_seq.shape)} (views, patch, hidden)")
    except Exception:
        pass
    proj = proj_seq.reshape(-1, proj_seq.shape[-1])
    try:
        print(f"[debug] mineru2_visual.projector_flat_out shape={tuple(proj.shape)} (views*patch, hidden)")
    except Exception:
        pass
    return proj
220262
221263 def encode (self , images : List [ImageItem ]) -> Tuple [torch .Tensor , List [str ], List [List [int ]]]:
@@ -226,6 +268,10 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
226268 valid_ids : List [List [int ]] = []
227269 image_aspect_ratio = getattr (self .image_processor , "image_aspect_ratio" , None )
228270 image_grid_pinpoints = getattr (self .image_processor , "image_grid_pinpoints" , None )
271+ # 每视图 patch_len(例如 384/14=27, 27^2=729)
272+ patch_side = self .vision_tower .config .image_size // self .vision_tower .config .patch_size
273+ patch_len = patch_side * patch_side
274+ print (f"[debug] mineru2_visual.patch_len={ patch_len } (side={ patch_side } )" )
229275 for i , img in enumerate (images ):
230276 if isinstance (img , ImageItem ):
231277 uuids .append (img .uuid )
@@ -247,60 +293,77 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
247293 elif t .ndim == 3 :
248294 print (f"[debug] mineru2_visual unsqueeze t.ndim: { t .ndim } , t.shape: { t .shape } " )
249295 t = t .unsqueeze (0 )
250- # 在修改前记录 manager 分配的 token_num
296+ # 在修改前记录 manager 分配的 token_num(可能是视图数或视图*patch数)
251297 try :
252298 print (f"[debug] mineru2_visual manager_token_num_before={ img .token_num } uuid={ img .uuid } " )
253299 except Exception :
254300 pass
255- # 对齐实际 K 与期望 token_num
256- expected_k = img .token_num if getattr (img , "token_num" , None ) is not None else None
301+ # 对齐实际视图数 K 与期望 token(可能是 K 或 K*patch_len)
302+ expected_token = img .token_num if getattr (img , "token_num" , None ) is not None else None
257303 actual_k = t .shape [0 ]
258- if expected_k is None or expected_k <= 0 :
259- expected_k = actual_k
260- print (f"[debug] mineru2_visual expected_k_from_actual uuid={ img .uuid } expected_k={ expected_k } " )
261- if actual_k != expected_k :
262- if actual_k % expected_k == 0 :
263- factor = actual_k // expected_k
304+ if expected_token is None or expected_token <= 0 :
305+ expected_views = actual_k
306+ print (
307+ f"[debug] mineru2_visual expected_views_from_actual uuid={ img .uuid } "
308+ f" expected_views={ expected_views } "
309+ )
310+ else :
311+ if expected_token >= patch_len and expected_token % patch_len == 0 :
312+ expected_views = expected_token // patch_len
313+ print (
314+ f"[debug] mineru2_visual expected_views_from_tokens uuid={ img .uuid } "
315+ f" expected_token={ expected_token } patch_len={ patch_len } expected_views={ expected_views } "
316+ )
317+ else :
318+ expected_views = expected_token
319+ print (
320+ f"[debug] mineru2_visual expected_views_interpret_as_views uuid={ img .uuid } "
321+ f" expected_views={ expected_views } "
322+ )
323+ if actual_k != expected_views :
324+ if actual_k % expected_views == 0 :
325+ factor = actual_k // expected_views
264326 print (
265327 f"[debug] mineru2_visual down_aggregate uuid={ img .uuid } "
266- f" actual_k={ actual_k } expected_k= { expected_k } factor={ factor } "
328+ f" actual_k={ actual_k } expected_views= { expected_views } factor={ factor } "
267329 )
268- t = t .view (expected_k , factor , t .shape [1 ], t .shape [2 ], t .shape [3 ]).mean (dim = 1 )
269- elif expected_k % actual_k == 0 :
270- factor = expected_k // actual_k
330+ t = t .view (expected_views , factor , t .shape [1 ], t .shape [2 ], t .shape [3 ]).mean (dim = 1 )
331+ elif expected_views % actual_k == 0 :
332+ factor = expected_views // actual_k
271333 print (
272334 f"[debug] mineru2_visual up_repeat uuid={ img .uuid } "
273- f" actual_k={ actual_k } expected_k= { expected_k } factor={ factor } "
335+ f" actual_k={ actual_k } expected_views= { expected_views } factor={ factor } "
274336 )
275337 t = t .repeat_interleave (repeats = factor , dim = 0 )
276338 else :
277- k = min (actual_k , expected_k )
339+ k = min (actual_k , expected_views )
278340 print (
279341 f"[debug] mineru2_visual fallback_slice uuid={ img .uuid } "
280- f" actual_k={ actual_k } expected_k= { expected_k } k={ k } "
342+ f" actual_k={ actual_k } expected_views= { expected_views } k={ k } "
281343 )
282- if actual_k >= expected_k :
283- t = t [:expected_k ]
344+ if actual_k >= expected_views :
345+ t = t [:expected_views ]
284346 else :
285347 # pad by repeating last
286- pad = t [- 1 :].repeat (expected_k - actual_k , 1 , 1 , 1 )
348+ pad = t [- 1 :].repeat (expected_views - actual_k , 1 , 1 , 1 )
287349 t = torch .cat ([t , pad ], dim = 0 )
288350 img_tensors .append (t )
289- # 最终 K
290- final_k = t .shape [0 ]
291- img .token_num = final_k
351+ # 最终视图数 K
352+ final_views = t .shape [0 ]
353+ # 对齐 patch 序列后的总 token 数
354+ img .token_num = final_views * patch_len
292355 print (
293- f"[debug] mineru2_visual actual_k={ actual_k } "
294- f"expected_k= { expected_k } final_k= { final_k } uuid={ img .uuid } "
356+ f"[debug] mineru2_visual actual_k={ actual_k } expected_views= { expected_views } "
357+ f" final_views= { final_views } final_token_num= { img . token_num } uuid={ img .uuid } "
295358 )
296359 else :
297360 raise Exception ("Unsupport input types: {} for {}" .format (type (img ), img ))
298361
299- cur_num = (
300- img_tensors [- 1 ].shape [ 0 ]
301- if isinstance ( img_tensors [- 1 ], torch . Tensor ) and img_tensors [ - 1 ]. dim () == 4
302- else 1
303- )
362+ # 本图对应的 token 数(视图 * patch_len)
363+ if isinstance ( img_tensors [ - 1 ], torch . Tensor ) and img_tensors [- 1 ].dim () == 4 :
364+ cur_num = img_tensors [- 1 ]. shape [ 0 ] * patch_len
365+ else :
366+ cur_num = patch_len
304367 valid_ids .append ([valid_id , valid_id + cur_num ])
305368 print (
306369 f"[debug] mineru2_visual valid_ids_append uuid={ img .uuid } "
@@ -314,6 +377,9 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
314377 img = torch .cat (img_tensors , dim = 0 )
315378 img = img .cuda ()
316379 all_img_embeds = self .forward (img )
317- print (f"[debug] mineru2_visual all_img_embeds.shape={ tuple (all_img_embeds .shape )} " f"total_K={ img .shape [0 ]} " )
380+ print (
381+ f"[debug] mineru2_visual all_img_embeds.shape={ tuple (all_img_embeds .shape )} "
382+ f" total_tokens={ img .shape [0 ] * patch_len } "
383+ )
318384
319385 return all_img_embeds , uuids , valid_ids
0 commit comments