File tree: 1 file changed, +6 −3 lines changed
lightllm/models/mineru2_qwen: 1 file changed, +6 −3 lines changed
Columns: original file line number | diff line number | diff line change
@@ -64,18 +64,21 @@ def init_audioitem_extral_params(
6464
6565 def get_image_token_length (self , img : ImageItem ):
6666 # 对于 Mineru2 集成,视觉塔返回的是每个裁剪的一条 pooled 向量。
67- # token 数应与裁剪数量一致:anyres 模式为 1(原图)+ 网格裁剪数,否则为 1 。
67+ # token 数应与裁剪数量一致:anyres 模式为 1(原图)+ 网格裁剪数,且每块含双视图(factor=2) 。
6868 aspect = getattr (self .image_processor , "image_aspect_ratio" , None )
6969 try :
7070 if aspect and (aspect == "anyres" or (isinstance (aspect , str ) and "anyres_max" in aspect )):
7171 crop_size = self .image_processor .crop_size ["height" ]
7272 grid_w , grid_h = get_anyres_image_grid_shape (
7373 (img .image_w , img .image_h ), self .image_processor .image_grid_pinpoints , crop_size
7474 )
75- token_num = int (grid_w * grid_h + 1 )
75+ base = int (grid_w * grid_h + 1 )
76+ view_factor = 2 # 与 encode 中观测到的 t.shape[1]==2 对齐
77+ token_num = base * view_factor
7678 print (
7779 f"[debug] mineru2_tokenizer anyres img_size=({ img .image_w } ,{ img .image_h } ) "
78- f"crop={ crop_size } grid=({ grid_w } ,{ grid_h } ) token_num={ token_num } "
80+ f"crop={ crop_size } grid=({ grid_w } ,{ grid_h } ) base={ base } view_factor={ view_factor } "
81+ f" token_num={ token_num } "
7982 )
8083 return token_num
8184 else :
You can’t perform that action at this time.
0 commit comments