feat: check image token num (keep anyres token count consistent between tokenizer and visual encoder)

zhhangBian · zhhangBian · commit abe4402552f5 · 2025-09-17T16:46:54.000+08:00
diff --git a/lightllm/models/mineru2_qwen/mineru2_visual.py b/lightllm/models/mineru2_qwen/mineru2_visual.py
@@ -6,6 +6,7 @@
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import numpy as np
 from transformers import (
     CLIPVisionModel,
@@ -14,7 +15,12 @@
 )
 
 from .configuration_mineru2 import Mineru2QwenConfig
-from .image_processing_mineru2 import Mineru2ImageProcessor, expand2square, process_anyres_image
+from .image_processing_mineru2 import (
+    Mineru2ImageProcessor,
+    expand2square,
+    process_anyres_image,
+    get_anyres_image_grid_shape,
+)
 
 from lightllm.server.multimodal_params import ImageItem
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
@@ -179,7 +185,9 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
                 uuids.append(img.uuid)
                 image_data = read_shm(get_shm_name_data(img.uuid))
                 image_data = Image.open(BytesIO(image_data)).convert("RGB")
-                if image_aspect_ratio == "pad":
+                # Multiple images/video frames force "pad"; anyres is only allowed for a single image
+                force_pad = len(images) > 1  # NOTE(review): not mirrored in get_image_token_length; verify
+                if image_aspect_ratio == "pad" or force_pad:
                     image_proc = expand2square(image_data, tuple(int(x * 255) for x in self.image_processor.image_mean))
                     t = self.image_processor.preprocess(image_proc, return_tensors="pt")["pixel_values"]
                 elif image_aspect_ratio and (image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio):
@@ -194,16 +202,18 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
                 elif t.ndim == 3:
                     t = t.unsqueeze(0)
 
-                # 对齐实际视图数 K 与期望 token（可能是 K 或 K*patch_len）
-                expected_token = img.token_num if getattr(img, "token_num", None) is not None else None
+                # Align the actual view count K with the expected one (anyres: Nx*Ny+1; otherwise: 1)
                 actual_k = t.shape[0]
-                if expected_token is None or expected_token <= 0:
-                    expected_views = actual_k
+                if (
+                    image_aspect_ratio and (image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio)
+                ) and not force_pad:
+                    crop_size = self.image_processor.crop_size["height"]
+                    grid_w, grid_h = get_anyres_image_grid_shape(
+                        (img.image_w, img.image_h), image_grid_pinpoints, crop_size
+                    )
+                    expected_views = int(grid_w * grid_h + 1)  # one view per grid cell plus one extra view
                 else:
-                    if expected_token >= patch_len and expected_token % patch_len == 0:
-                        expected_views = expected_token // patch_len
-                    else:
-                        expected_views = expected_token
+                    expected_views = 1
                 if actual_k != expected_views:
                     if actual_k % expected_views == 0:
                         factor = actual_k // expected_views
@@ -219,26 +229,86 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
                             pad = t[-1:].repeat(expected_views - actual_k, 1, 1, 1)
                             t = torch.cat([t, pad], dim=0)
                 img_tensors.append(t)
-                # 最终视图数 K
-                final_views = t.shape[0]
-                # 对齐 patch 序列后的总 token 数
-                img.token_num = final_views * patch_len
             else:
                 raise Exception("Unsupport input types: {} for {}".format(type(img), img))
 
-            # 本图对应的 token 数（视图 * patch_len）
-            if isinstance(img_tensors[-1], torch.Tensor) and img_tensors[-1].dim() == 4:
-                cur_num = img_tensors[-1].shape[0] * patch_len
-            else:
-                cur_num = patch_len
-            valid_ids.append([valid_id, valid_id + cur_num])
-            valid_id += cur_num
+            # 暂不累加 valid_ids，待完成重组后依据真实长度填写
 
         if len(img_tensors) <= 0:
             return None, [], []
         # 保证全部为4维后拼接
         img = torch.cat(img_tensors, dim=0)
         img = img.cuda()
+        # 提取所有视图的 patch 序列嵌入（views * patch_len, hidden）
         all_img_embeds = self.forward(img)
 
-        return all_img_embeds, uuids, valid_ids
+        # Regroup every image's view embeddings into its final token sequence and record its [start, end) range
+        new_embeds: List[torch.Tensor] = []
+        cur = 0
+        for i, img in enumerate(images):
+            # Number of views this image contributed to the batch
+            t = img_tensors[i]
+            K = t.shape[0]
+            # Slice out the patch embeddings of all of this image's views (K * patch_len rows)
+            tokens_len = K * patch_len
+            cur_views_embeds = all_img_embeds[cur : cur + tokens_len]
+            cur += tokens_len
+
+            # Non-anyres, forced pad (more than one image) or a single view: use the flat sequence as is
+            force_pad = len(images) > 1
+            aspect = getattr(self.image_processor, "image_aspect_ratio", None)
+            if not aspect or ("anyres" not in str(aspect)) or force_pad or K <= 1:
+                seq = cur_views_embeds
+                new_embeds.append(seq)
+                # Record this image's token range
+                valid_ids.append([valid_id, valid_id + seq.shape[0]])
+                valid_id += seq.shape[0]
+                continue
+
+            # anyres single-image path:
+            # split the base view (first patch_len rows) from the remaining crop views
+            base_feature = cur_views_embeds[:patch_len]
+            rest = cur_views_embeds[patch_len:]
+            # (K-1, patch_len, hidden)
+            hidden = rest.shape[-1]
+            rest = rest.view(K - 1, patch_len, hidden)
+
+            # Grid shape of the crops: Nx = grid_w, Ny = grid_h
+            crop_size = self.image_processor.crop_size["height"]
+            grid_w, grid_h = get_anyres_image_grid_shape((img.image_w, img.image_h), image_grid_pinpoints, crop_size)
+            # (Ny, Nx, patch_side, patch_side, hidden)
+            rest = rest.view(grid_w * grid_h, patch_side, patch_side, hidden)
+            rest = rest.view(grid_h, grid_w, patch_side, patch_side, hidden)
+            # (hidden, Ny, patch_side, Nx, patch_side) -> (hidden, H, W)
+            rest = rest.permute(4, 0, 2, 1, 3).contiguous()
+            H = grid_h * patch_side
+            W = grid_w * patch_side
+            rest = rest.view(hidden, H, W)
+
+            # anyres_max: shrink when sqrt(area ratio) > 1.1, the same test get_image_token_length applies
+            m = re.search(r"anyres_max_(\d+)", str(aspect))
+            if m is not None:
+                max_num_patches = int(m.group(1))
+                times = ((H * W) / (max_num_patches * patch_len)) ** 0.5
+                if times > 1.1:
+                    scale = (int(H // times), int(W // times))
+                    rest = F.interpolate(rest.unsqueeze(0), size=scale, mode="bilinear", align_corners=False)[0]
+                    H, W = rest.shape[1], rest.shape[2]
+
+            # Append one newline column (W + 1 columns); the newline slot is a zero-vector placeholder
+            newline_col = torch.zeros((hidden, H, 1), device=rest.device, dtype=rest.dtype)
+            rest = torch.cat([rest, newline_col], dim=2)  # (hidden, H, W+1)
+            # Flatten to (H * (W + 1), hidden)
+            rest = rest.flatten(1, 2).transpose(0, 1).contiguous()
+
+            # Concatenate base view + regrouped crops
+            seq = torch.cat([base_feature, rest], dim=0)
+            new_embeds.append(seq)
+
+            # Record this image's token range
+            valid_ids.append([valid_id, valid_id + seq.shape[0]])
+            valid_id += seq.shape[0]
+
+        # Concatenate the regrouped embeddings of all images
+        all_new = torch.cat(new_embeds, dim=0)
+        return all_new, uuids, valid_ids
diff --git a/lightllm/models/mineru2_qwen/model.py b/lightllm/models/mineru2_qwen/model.py
@@ -11,6 +11,7 @@
 
 from ..mineru2_qwen.image_processing_mineru2 import Mineru2ImageProcessor
 from .image_processing_mineru2 import get_anyres_image_grid_shape
+import math
 
 IMG_START_TOKEN = "<img>"
 IMG_END_TOKEN = "</img>"
@@ -61,22 +62,46 @@ def init_audioitem_extral_params(
         raise NotImplementedError
 
     def get_image_token_length(self, img: ImageItem):
-        # 切回 patch 序列：总token数 = 视图数 × 每视图patch数
-        # 每视图patch数 = self.image_length = (image_size // patch_size) ** 2
+        # Non-anyres: a single view, so only the base patch sequence is counted
         patch_len = int(self.image_length)
+        aspect_ratio = getattr(self.image_processor, "image_aspect_ratio", None)
+        if not aspect_ratio or ("anyres" not in str(aspect_ratio)):
+            return patch_len
 
+        # anyres: count tokens of the spatial layout (+ anyres_max); NOTE(review): no unpad step is applied here
         crop_size = self.image_processor.crop_size["height"]
         grid_w, grid_h = get_anyres_image_grid_shape(
             (img.image_w, img.image_h), self.image_processor.image_grid_pinpoints, crop_size
         )
-        views = int(grid_w * grid_h + 1)
-        token_num = views * patch_len
-        print(
-            f"[debug] mineru2_tokenizer anyres img_size=({img.image_w},{img.image_h}) "
-            f"crop={crop_size} grid=({grid_w},{grid_h}) views={views}"
-            f" patch_len={patch_len} token_num={token_num}"
-        )
-        return token_num
+        # Base view contributes one full patch sequence
+        base_tokens = patch_len
+        patch_side = int(math.sqrt(patch_len))  # assumes patch_len is a perfect square; TODO confirm
+        # h, w: size of the stitched crop grid, in patches
+        h = int(grid_h * patch_side)
+        w = int(grid_w * patch_side)
+
+        new_h, new_w = h, w
+        max_num_patches = None
+        m = re.search(r"anyres_max_(\d+)", str(aspect_ratio))  # NOTE(review): confirm "re" is imported here
+        if m:
+            max_num_patches = int(m.group(1))
+            times = math.sqrt((h * w) / (max_num_patches * patch_len))
+            if times > 1.1:
+                new_h = int(new_h // times)
+                new_w = int(new_w // times)
+        # One newline token is appended to every row, so new_h newline tokens in total
+        extra_tokens = int(new_h * (new_w + 1))
+        total_tokens = int(base_tokens + extra_tokens)
+
+        print(f"[debug][spatial] P={patch_side}, N={patch_len}, Nx={grid_w}, Ny={grid_h}, crops={grid_w*grid_h}")
+        if max_num_patches is not None:
+            times = math.sqrt((h * w) / (max_num_patches * patch_len))
+            print(
+                f"[debug][spatial+unpad+anyres_max] h={h}, w={w}, "
+                f"times={times:.4f}, h'={new_h}, w'={new_w}, newline={new_h}, extra_tokens~={extra_tokens}"
+            )
+        print(f"[debug][spatial] base_tokens={base_tokens}, extra_tokens={extra_tokens}, total_tokens={total_tokens}")
+        return total_tokens
 
     def get_audio_token_length(self, audio: AudioItem):
         raise NotImplementedError