@@ -301,6 +301,7 @@ def qwen_prompt_path_encoder(
301301def deepseekvl2_patch_hf_runner (hf_model : HfRunner ) -> HfRunner :
302302 """Patches and returns an instance of the HfRunner to use for DeepSeek-VL2."""
303303 hf_processor = hf_model .processor
304+ assert hf_processor is not None
304305
305306 def processor (* args , text = "" , images = None , ** kwargs ):
306307 if isinstance (images , Image ):
@@ -320,15 +321,16 @@ def processor(*args, text="", images=None, **kwargs):
320321 return BatchFeature (data = inputs , tensor_type = "pt" )
321322
322323 hf_model .processor = processor
323- hf_model .model .get_output_embeddings = (
324- lambda : hf_model .model .language .model .embed_tokens
324+ hf_model .model .get_output_embeddings = lambda : (
325+ hf_model .model .language .model .embed_tokens
325326 )
326327 return hf_model
327328
328329
329330def gemma3_patch_hf_runner (hf_model : HfRunner ) -> HfRunner :
330331 """Patches and returns an instance of the HfRunner to use for Gemma 3."""
331332 hf_processor = hf_model .processor
333+ assert hf_processor is not None
332334
333335 def processor (* args , ** kwargs ):
334336 return hf_processor (* args , do_pan_and_scan = True , ** kwargs )
@@ -408,6 +410,7 @@ def patched_forward(*args, **kwargs):
408410 hf_model .model .forward = patched_forward
409411
410412 hf_processor = hf_model .processor
413+ assert hf_processor is not None
411414
412415 def processor (* args , text = "" , images = None , ** kwargs ):
413416 if images is None :
@@ -433,15 +436,16 @@ def processor(*args, text="", images=None, **kwargs):
433436 )
434437
435438 hf_model .processor = processor
436- hf_model .model .get_output_embeddings = (
437- lambda : hf_model .model .transformer .output_layer
439+ hf_model .model .get_output_embeddings = lambda : (
440+ hf_model .model .transformer .output_layer
438441 )
439442 return hf_model
440443
441444
442445def glm4_1v_patch_hf_runner (hf_model : HfRunner ) -> HfRunner :
443446 """Patches and returns an instance of the HfRunner to use for GLM4.1V."""
444447 hf_processor = hf_model .processor
448+ assert hf_processor is not None
445449
446450 def processor (* args , videos = None , ** kwargs ):
447451 if videos is not None and is_list_of (videos , tuple ):
@@ -521,8 +525,8 @@ def __call__(self, text: str, images: Image | list[Image], **kwargs):
521525 img_context_token_id = hf_model .tokenizer .convert_tokens_to_ids ("<IMG_CONTEXT>" )
522526 hf_model .model .img_context_token_id = img_context_token_id
523527 hf_model .processor = H2OVLProcessor (hf_model )
524- hf_model .model .get_output_embeddings = (
525- lambda : hf_model .model .language_model .get_output_embeddings ()
528+ hf_model .model .get_output_embeddings = lambda : (
529+ hf_model .model .language_model .get_output_embeddings ()
526530 )
527531 hf_model .model .generate = types .MethodType (_internvl_generate , hf_model .model )
528532 return hf_model
@@ -555,6 +559,7 @@ def compute_position_ids_input_ids(input_ids: torch.Tensor) -> torch.Tensor:
555559 # 1) Patch processor: move BatchFeature input_ids and TensorStream to model device
556560 # ----------------------------
557561 original_processor = hf_model .processor
562+ assert original_processor is not None
558563
559564 def patched_processor (* args , ** kwargs ):
560565 result = original_processor (* args , ** kwargs )
@@ -782,8 +787,8 @@ def __call__(self, text: str, images: Image | list[Image], **kwargs):
782787 img_context_token_id = hf_model .tokenizer .convert_tokens_to_ids ("<IMG_CONTEXT>" )
783788 hf_model .model .img_context_token_id = img_context_token_id
784789 hf_model .processor = SkyworkR1VProcessor (hf_model )
785- hf_model .model .get_output_embeddings = (
786- lambda : hf_model .model .language_model .get_output_embeddings ()
790+ hf_model .model .get_output_embeddings = lambda : (
791+ hf_model .model .language_model .get_output_embeddings ()
787792 )
788793 hf_model .model .generate = types .MethodType (_internvl_generate , hf_model .model )
789794 return hf_model
@@ -890,8 +895,8 @@ def __call__(
890895 img_context_token_id = hf_model .tokenizer .convert_tokens_to_ids ("<IMG_CONTEXT>" )
891896 hf_model .model .img_context_token_id = img_context_token_id
892897 hf_model .processor = InternVLProcessor (hf_model )
893- hf_model .model .get_output_embeddings = (
894- lambda : hf_model .model .language_model .get_output_embeddings ()
898+ hf_model .model .get_output_embeddings = lambda : (
899+ hf_model .model .language_model .get_output_embeddings ()
895900 )
896901 hf_model .model .generate = types .MethodType (_internvl_generate , hf_model .model )
897902 return hf_model
@@ -1029,6 +1034,7 @@ def _generate(self, *args, image_sizes=None, **kwargs):
10291034def molmo_patch_hf_runner (hf_model : HfRunner ) -> HfRunner :
10301035 """Patches and returns an instance of the HfRunner to use for Molmo."""
10311036 hf_processor = hf_model .processor
1037+ assert hf_processor is not None
10321038
10331039 def _processor (* args , ** kwargs ):
10341040 return hf_processor .process (* args , ** kwargs )
@@ -1060,8 +1066,8 @@ def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
10601066
10611067def ovis_patch_hf_runner (hf_model : HfRunner ) -> HfRunner :
10621068 """Patches and returns an instance of the HfRunner to use for Ovis2."""
1063- hf_model .model .get_output_embeddings = (
1064- lambda : hf_model .model .llm .get_output_embeddings ()
1069+ hf_model .model .get_output_embeddings = lambda : (
1070+ hf_model .model .llm .get_output_embeddings ()
10651071 )
10661072
10671073 def processor (* args , text = "" , images = None , ** kwargs ):
@@ -1096,8 +1102,8 @@ def processor(*args, text="", images=None, **kwargs):
10961102
10971103def ovis2_5_patch_hf_runner (hf_model : HfRunner ) -> HfRunner :
10981104 """Patches and returns an instance of the HfRunner to use for Ovis2.5."""
1099- hf_model .model .get_output_embeddings = (
1100- lambda : hf_model .model .llm .get_output_embeddings ()
1105+ hf_model .model .get_output_embeddings = lambda : (
1106+ hf_model .model .llm .get_output_embeddings ()
11011107 )
11021108
11031109 def processor (* args , text = "" , images = None , videos = None , ** kwargs ):
@@ -1160,6 +1166,7 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
11601166def qwen3_vl_patch_hf_runner (hf_model : HfRunner ) -> HfRunner :
11611167 """Patches and returns an instance of the HfRunner to use for Qwen3-VL."""
11621168 hf_processor = hf_model .processor
1169+ assert hf_processor is not None
11631170
11641171 def processor (* args , videos = None , ** kwargs ):
11651172 if videos is not None and is_list_of (videos , tuple ):
@@ -1211,6 +1218,7 @@ def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
12111218 vision_encoder_info = get_vision_encoder_info (hf_model .config )
12121219
12131220 hf_processor = hf_model .processor
1221+ assert hf_processor is not None
12141222 if hf_processor .patch_size is None :
12151223 hf_processor .patch_size = vision_encoder_info .get_patch_size ()
12161224
@@ -1287,18 +1295,17 @@ def _encode_vision(pixel_values, tilings):
12871295 pv = pv .to (device = device , dtype = dtype )
12881296
12891297 features = native_model ._vis_enc (pv )
1290- grid_size = (
1291- config .vision .crop_size // config .vision .enc_patch_size
1292- )
1298+ grid_size = config .vision .crop_size // config .vision .enc_patch_size
12931299 global_feat = features [0 ]
12941300
12951301 if features .shape [0 ] > 1 and tilings is not None :
12961302 tiling = _normalize_tiling (tilings )
1297- local = features [1 :].view (
1298- - 1 , grid_size , grid_size , config .vision .enc_dim
1299- )
1303+ local = features [1 :].view (- 1 , grid_size , grid_size , config .vision .enc_dim )
13001304 reconstructed = reconstruct_from_crops (
1301- local , tiling , config .vision .overlap_margin , patch_size = 1 ,
1305+ local ,
1306+ tiling ,
1307+ config .vision .overlap_margin ,
1308+ patch_size = 1 ,
13021309 )
13031310 else :
13041311 reconstructed = global_feat .view (
@@ -1362,20 +1369,14 @@ def _generate(
13621369
13631370 # --- Prefill BOS + vision embeddings ---
13641371 bos_emb = F .embedding (
1365- torch .tensor (
1366- [[config .tokenizer .bos_id ]], device = device
1367- ),
1372+ torch .tensor ([[config .tokenizer .bos_id ]], device = device ),
13681373 native_model .text .wte ,
13691374 )
1370- img_input = torch .cat (
1371- [bos_emb , img_emb .unsqueeze (0 )], dim = 1
1372- )
1375+ img_input = torch .cat ([bos_emb , img_emb .unsqueeze (0 )], dim = 1 )
13731376 prefix_len = img_input .size (1 ) # 730
13741377
13751378 mask = native_model .attn_mask [:, :, :prefix_len , :]
1376- pos_ids = torch .arange (
1377- prefix_len , dtype = torch .long , device = device
1378- )
1379+ pos_ids = torch .arange (prefix_len , dtype = torch .long , device = device )
13791380 native_model ._prefill (img_input , mask , pos_ids , None )
13801381
13811382 # --- Extract prompt tokens after BOS + <image> ---
@@ -1391,7 +1392,7 @@ def _generate(
13911392 )
13921393 return sequences
13931394
1394- prompt_tokens = ids [img_start + len (image_placeholder_ids ):]
1395+ prompt_tokens = ids [img_start + len (image_placeholder_ids ) :]
13951396
13961397 # --- Prefill prompt tokens and get first logits ---
13971398 if not prompt_tokens :
@@ -1403,35 +1404,23 @@ def _generate(
14031404 )
14041405 return sequences
14051406
1406- prompt_tensor = torch .tensor (
1407- [prompt_tokens ], device = device
1408- )
1409- prompt_emb = F .embedding (
1410- prompt_tensor , native_model .text .wte
1411- )
1407+ prompt_tensor = torch .tensor ([prompt_tokens ], device = device )
1408+ prompt_emb = F .embedding (prompt_tensor , native_model .text .wte )
14121409 prompt_len = prompt_emb .size (1 )
14131410
1414- mask = native_model .attn_mask [
1415- :, :, prefix_len : prefix_len + prompt_len , :
1416- ]
1411+ mask = native_model .attn_mask [:, :, prefix_len : prefix_len + prompt_len , :]
14171412 pos_ids = torch .arange (
14181413 prefix_len ,
14191414 prefix_len + prompt_len ,
14201415 dtype = torch .long ,
14211416 device = device ,
14221417 )
1423- hidden = native_model ._prefill (
1424- prompt_emb , mask , pos_ids , None
1425- )
1418+ hidden = native_model ._prefill (prompt_emb , mask , pos_ids , None )
14261419 pos = prefix_len + prompt_len
14271420
14281421 # Compute logits from last hidden state
1429- hidden_last = native_model .text .post_ln (
1430- hidden [:, - 1 :, :]
1431- )
1432- logits = native_model .text .lm_head (
1433- hidden_last .squeeze (1 )
1434- )
1422+ hidden_last = native_model .text .post_ln (hidden [:, - 1 :, :])
1423+ logits = native_model .text .lm_head (hidden_last .squeeze (1 ))
14351424
14361425 # --- Greedy decode ---
14371426 generated = []
@@ -1456,22 +1445,12 @@ def _generate(
14561445 torch .tensor ([[next_token ]], device = device ),
14571446 native_model .text .wte ,
14581447 )
1459- mask = native_model .attn_mask [
1460- :, :, pos : pos + 1 , :
1461- ]
1462- pos_ids_step = torch .tensor (
1463- [pos ], dtype = torch .long , device = device
1464- )
1465- hidden = native_model ._prefill (
1466- next_emb , mask , pos_ids_step , None
1467- )
1468- hidden_last = native_model .text .post_ln (
1469- hidden [:, - 1 :, :]
1470- )
1448+ mask = native_model .attn_mask [:, :, pos : pos + 1 , :]
1449+ pos_ids_step = torch .tensor ([pos ], dtype = torch .long , device = device )
1450+ hidden = native_model ._prefill (next_emb , mask , pos_ids_step , None )
1451+ hidden_last = native_model .text .post_ln (hidden [:, - 1 :, :])
14711452 prev_hs = hidden_last
1472- logits = native_model .text .lm_head (
1473- hidden_last .squeeze (1 )
1474- )
1453+ logits = native_model .text .lm_head (hidden_last .squeeze (1 ))
14751454 pos += 1
14761455
14771456 result_ids = ids + generated
@@ -1480,8 +1459,7 @@ def _generate(
14801459 if return_dict :
14811460 return types .SimpleNamespace (
14821461 sequences = sequences ,
1483- hidden_states = tuple (all_hidden_states )
1484- if output_hs else None ,
1462+ hidden_states = tuple (all_hidden_states ) if output_hs else None ,
14851463 )
14861464 return sequences
14871465
@@ -1514,6 +1492,7 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner":
15141492 import soundfile as sf
15151493
15161494 processor = hf_model .processor
1495+ assert processor is not None
15171496
15181497 def _audio_to_base64 (audio_array , sample_rate : int ) -> str :
15191498 """Encode a numpy audio array as a base64 WAV string."""
0 commit comments