@@ -43,11 +43,18 @@ def _resolve_path(name):
4343 elif "siglip" in vision_tower .lower ():
4444 vt_path = _resolve_path (vision_tower )
4545 print (f"[debug] load siglip from { vt_path } " )
46- model = SiglipVisionModel .from_pretrained (vt_path )
47- if hasattr (model , "config" ) and hasattr (model .config , "num_hidden_layers" ):
48- model .config .num_hidden_layers = max (0 , model .config .num_hidden_layers - 1 )
49- if hasattr (model , "config" ) and hasattr (model .config , "vision_use_head" ):
50- model .config .vision_use_head = False
46+ # 方案A:使用配置减层并按该配置实例化模型,再加载权重(忽略不匹配尺寸)
47+ cfg = SiglipVisionConfig .from_pretrained (vt_path )
48+ old_layers = getattr (cfg , "num_hidden_layers" , None )
49+ cfg .num_hidden_layers = max (0 , cfg .num_hidden_layers - 1 )
50+ cfg .vision_use_head = False
51+ model = SiglipVisionModel .from_pretrained (vt_path , config = cfg , ignore_mismatched_sizes = True )
52+ try :
53+ actual_layers = len (model .vision_model .encoder .layers ) # type: ignore[attr-defined]
54+ except Exception :
55+ actual_layers = None
56+ new_cfg_layers = getattr (getattr (model , "config" , None ), "num_hidden_layers" , None )
57+ print (f"[debug] siglip_layers planA old={ old_layers } new_cfg={ new_cfg_layers } actual_module={ actual_layers } " )
5158 return model
5259 else :
5360 raise ValueError (f"Unknown vision tower: { vision_tower } " )
@@ -211,11 +218,46 @@ def cuda(self):
211218 return self
212219
def forward(self, x) -> torch.Tensor:
    """Encode a batch of image views into flattened patch embeddings.

    Args:
        x: image batch tensor; assumed shape (views, channels, H, W) —
           TODO confirm against the caller in ``encode``.

    Returns:
        Tensor of shape (views * patch_len, hidden): the vision tower's
        second-to-last hidden state (the "drop one layer" semantics of the
        reference implementation), with a leading CLS token removed if
        present, projected per patch and flattened over the view dimension.
    """
    # Best-effort runtime shape / dtype / device debug output.
    try:
        print(f"[debug] mineru2_visual.forward x.shape={tuple(x.shape)} dtype={x.dtype} device={x.device}")
    except Exception:
        pass
    vision_out = self.vision_tower(x, output_hidden_states=True)
    hiddens = vision_out.hidden_states
    # Relation between hidden_states count and configured layer count
    # (normally num_hidden_layers + 1, embeddings included).
    try:
        cfg_layers = getattr(getattr(self.vision_tower, "config", None), "num_hidden_layers", None)
        eff_layers = len(hiddens) - 1 if isinstance(hiddens, (list, tuple)) else None
        print(
            f"[debug] mineru2_visual.hidden_states len={len(hiddens)}"
            f" cfg_layers={cfg_layers} eff_layers={eff_layers}"
        )
    except Exception:
        pass
    # Align with the reference "minus one layer" semantics: prefer the
    # second-to-last hidden state, fall back to the last one. The selection
    # is computed directly — the original wrapped it in try/except whose
    # except clause recomputed the identical expression, which could never
    # recover from a real failure; only the debug print stays best-effort.
    chosen_idx = -2 if isinstance(hiddens, (list, tuple)) and len(hiddens) >= 2 else -1
    feat = hiddens[chosen_idx]
    try:
        print(f"[debug] mineru2_visual.select_layer idx={chosen_idx} feat.shape={tuple(feat.shape)}")
    except Exception:
        pass
    # Back to patch-sequence features: drop the CLS token if present, run
    # the projector over the sequence, then flatten to (views*patch, hidden).
    patch_side = self.vision_tower.config.image_size // self.vision_tower.config.patch_size
    patch_len = patch_side * patch_side
    if feat.shape[1] == patch_len + 1:
        feat = feat[:, 1:, :]
        print(f"[debug] mineru2_visual.drop_cls patch_len={patch_len} feat_no_cls.shape={tuple(feat.shape)}")
    proj_seq = self.projector(feat)
    try:
        print(f"[debug] mineru2_visual.projector_seq_out shape={tuple(proj_seq.shape)} (views, patch, hidden)")
    except Exception:
        pass
    proj = proj_seq.reshape(-1, proj_seq.shape[-1])
    try:
        print(f"[debug] mineru2_visual.projector_flat_out shape={tuple(proj.shape)} (views*patch, hidden)")
    except Exception:
        pass
    return proj
220262
221263 def encode (self , images : List [ImageItem ]) -> Tuple [torch .Tensor , List [str ], List [List [int ]]]:
@@ -226,6 +268,10 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
226268 valid_ids : List [List [int ]] = []
227269 image_aspect_ratio = getattr (self .image_processor , "image_aspect_ratio" , None )
228270 image_grid_pinpoints = getattr (self .image_processor , "image_grid_pinpoints" , None )
271+ # 每视图 patch_len(例如 384/14=27, 27^2=729)
272+ patch_side = self .vision_tower .config .image_size // self .vision_tower .config .patch_size
273+ patch_len = patch_side * patch_side
274+ print (f"[debug] mineru2_visual.patch_len={ patch_len } (side={ patch_side } )" )
229275 for i , img in enumerate (images ):
230276 if isinstance (img , ImageItem ):
231277 uuids .append (img .uuid )
@@ -247,60 +293,77 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
247293 elif t .ndim == 3 :
248294 print (f"[debug] mineru2_visual unsqueeze t.ndim: { t .ndim } , t.shape: { t .shape } " )
249295 t = t .unsqueeze (0 )
250- # 在修改前记录 manager 分配的 token_num
296+ # 在修改前记录 manager 分配的 token_num(可能是视图数或视图*patch数)
251297 try :
252298 print (f"[debug] mineru2_visual manager_token_num_before={ img .token_num } uuid={ img .uuid } " )
253299 except Exception :
254300 pass
255- # 对齐实际 K 与期望 token_num
256- expected_k = img .token_num if getattr (img , "token_num" , None ) is not None else None
301+ # 对齐实际视图数 K 与期望 token(可能是 K 或 K*patch_len)
302+ expected_token = img .token_num if getattr (img , "token_num" , None ) is not None else None
257303 actual_k = t .shape [0 ]
258- if expected_k is None or expected_k <= 0 :
259- expected_k = actual_k
260- print (f"[debug] mineru2_visual expected_k_from_actual uuid={ img .uuid } expected_k={ expected_k } " )
261- if actual_k != expected_k :
262- if actual_k % expected_k == 0 :
263- factor = actual_k // expected_k
304+ if expected_token is None or expected_token <= 0 :
305+ expected_views = actual_k
306+ print (
307+ f"[debug] mineru2_visual expected_views_from_actual uuid={ img .uuid } "
308+ f" expected_views={ expected_views } "
309+ )
310+ else :
311+ if expected_token >= patch_len and expected_token % patch_len == 0 :
312+ expected_views = expected_token // patch_len
313+ print (
314+ f"[debug] mineru2_visual expected_views_from_tokens uuid={ img .uuid } "
315+ f" expected_token={ expected_token } patch_len={ patch_len } expected_views={ expected_views } "
316+ )
317+ else :
318+ expected_views = expected_token
319+ print (
320+ f"[debug] mineru2_visual expected_views_interpret_as_views uuid={ img .uuid } "
321+ f" expected_views={ expected_views } "
322+ )
323+ if actual_k != expected_views :
324+ if actual_k % expected_views == 0 :
325+ factor = actual_k // expected_views
264326 print (
265327 f"[debug] mineru2_visual down_aggregate uuid={ img .uuid } "
266- f" actual_k={ actual_k } expected_k= { expected_k } factor={ factor } "
328+ f" actual_k={ actual_k } expected_views= { expected_views } factor={ factor } "
267329 )
268- t = t .view (expected_k , factor , t .shape [1 ], t .shape [2 ], t .shape [3 ]).mean (dim = 1 )
269- elif expected_k % actual_k == 0 :
270- factor = expected_k // actual_k
330+ t = t .view (expected_views , factor , t .shape [1 ], t .shape [2 ], t .shape [3 ]).mean (dim = 1 )
331+ elif expected_views % actual_k == 0 :
332+ factor = expected_views // actual_k
271333 print (
272334 f"[debug] mineru2_visual up_repeat uuid={ img .uuid } "
273- f" actual_k={ actual_k } expected_k= { expected_k } factor={ factor } "
335+ f" actual_k={ actual_k } expected_views= { expected_views } factor={ factor } "
274336 )
275337 t = t .repeat_interleave (repeats = factor , dim = 0 )
276338 else :
277- k = min (actual_k , expected_k )
339+ k = min (actual_k , expected_views )
278340 print (
279341 f"[debug] mineru2_visual fallback_slice uuid={ img .uuid } "
280- f" actual_k={ actual_k } expected_k= { expected_k } k={ k } "
342+ f" actual_k={ actual_k } expected_views= { expected_views } k={ k } "
281343 )
282- if actual_k >= expected_k :
283- t = t [:expected_k ]
344+ if actual_k >= expected_views :
345+ t = t [:expected_views ]
284346 else :
285347 # pad by repeating last
286- pad = t [- 1 :].repeat (expected_k - actual_k , 1 , 1 , 1 )
348+ pad = t [- 1 :].repeat (expected_views - actual_k , 1 , 1 , 1 )
287349 t = torch .cat ([t , pad ], dim = 0 )
288350 img_tensors .append (t )
289- # 最终 K
290- final_k = t .shape [0 ]
291- img .token_num = final_k
351+ # 最终视图数 K
352+ final_views = t .shape [0 ]
353+ # 对齐 patch 序列后的总 token 数
354+ img .token_num = final_views * patch_len
292355 print (
293- f"[debug] mineru2_visual actual_k={ actual_k } "
294- f"expected_k= { expected_k } final_k= { final_k } uuid={ img .uuid } "
356+ f"[debug] mineru2_visual actual_k={ actual_k } expected_views= { expected_views } "
357+ f" final_views= { final_views } final_token_num= { img . token_num } uuid={ img .uuid } "
295358 )
296359 else :
297360 raise Exception ("Unsupport input types: {} for {}" .format (type (img ), img ))
298361
299- cur_num = (
300- img_tensors [- 1 ].shape [ 0 ]
301- if isinstance ( img_tensors [- 1 ], torch . Tensor ) and img_tensors [ - 1 ]. dim () == 4
302- else 1
303- )
362+ # 本图对应的 token 数(视图 * patch_len)
363+ if isinstance ( img_tensors [ - 1 ], torch . Tensor ) and img_tensors [- 1 ].dim () == 4 :
364+ cur_num = img_tensors [- 1 ]. shape [ 0 ] * patch_len
365+ else :
366+ cur_num = patch_len
304367 valid_ids .append ([valid_id , valid_id + cur_num ])
305368 print (
306369 f"[debug] mineru2_visual valid_ids_append uuid={ img .uuid } "
@@ -314,6 +377,9 @@ def encode(self, images: List[ImageItem]) -> Tuple[torch.Tensor, List[str], List
314377 img = torch .cat (img_tensors , dim = 0 )
315378 img = img .cuda ()
316379 all_img_embeds = self .forward (img )
317- print (f"[debug] mineru2_visual all_img_embeds.shape={ tuple (all_img_embeds .shape )} " f"total_K={ img .shape [0 ]} " )
380+ print (
381+ f"[debug] mineru2_visual all_img_embeds.shape={ tuple (all_img_embeds .shape )} "
382+ f" total_tokens={ img .shape [0 ] * patch_len } "
383+ )
318384
319385 return all_img_embeds , uuids , valid_ids
0 commit comments