ruff

sniper35 · sniper35 · commit 5aee5401cbdc · 2026-02-23T12:55:36.000Z
Signed-off-by: Dong Wang <dongw2019@gmail.com>
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -573,6 +573,10 @@ def generate(
                 use_cache=True,
                 **kwargs,
             )
+            if self.processor is None:
+                raise RuntimeError(
+                    "HfRunner.processor is not initialized; cannot decode output."
+                )
             output_str = self.processor.batch_decode(
                 output_ids,
                 skip_special_tokens=True,
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -301,6 +301,7 @@ def qwen_prompt_path_encoder(
 def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for GLM4."""
     hf_processor = hf_model.processor
+    assert hf_processor is not None
 
     def processor(*args, text="", images=None, **kwargs):
         if isinstance(images, Image):
@@ -320,15 +321,16 @@ def processor(*args, text="", images=None, **kwargs):
         return BatchFeature(data=inputs, tensor_type="pt")
 
     hf_model.processor = processor
-    hf_model.model.get_output_embeddings = (
-        lambda: hf_model.model.language.model.embed_tokens
+    hf_model.model.get_output_embeddings = lambda: (
+        hf_model.model.language.model.embed_tokens
     )
     return hf_model
 
 
 def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for Gemma 3."""
     hf_processor = hf_model.processor
+    assert hf_processor is not None
 
     def processor(*args, **kwargs):
         return hf_processor(*args, do_pan_and_scan=True, **kwargs)
@@ -408,6 +410,7 @@ def patched_forward(*args, **kwargs):
         hf_model.model.forward = patched_forward
 
     hf_processor = hf_model.processor
+    assert hf_processor is not None
 
     def processor(*args, text="", images=None, **kwargs):
         if images is None:
@@ -433,15 +436,16 @@ def processor(*args, text="", images=None, **kwargs):
         )
 
     hf_model.processor = processor
-    hf_model.model.get_output_embeddings = (
-        lambda: hf_model.model.transformer.output_layer
+    hf_model.model.get_output_embeddings = lambda: (
+        hf_model.model.transformer.output_layer
     )
     return hf_model
 
 
 def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for GLM4.1V."""
     hf_processor = hf_model.processor
+    assert hf_processor is not None
 
     def processor(*args, videos=None, **kwargs):
         if videos is not None and is_list_of(videos, tuple):
@@ -521,8 +525,8 @@ def __call__(self, text: str, images: Image | list[Image], **kwargs):
     img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     hf_model.model.img_context_token_id = img_context_token_id
     hf_model.processor = H2OVLProcessor(hf_model)
-    hf_model.model.get_output_embeddings = (
-        lambda: hf_model.model.language_model.get_output_embeddings()
+    hf_model.model.get_output_embeddings = lambda: (
+        hf_model.model.language_model.get_output_embeddings()
     )
     hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
     return hf_model
@@ -555,6 +559,7 @@ def compute_position_ids_input_ids(input_ids: torch.Tensor) -> torch.Tensor:
     # 1) Patch processor: move BatchFeature input_ids and TensorStream to model device
     # ----------------------------
     original_processor = hf_model.processor
+    assert original_processor is not None
 
     def patched_processor(*args, **kwargs):
         result = original_processor(*args, **kwargs)
@@ -782,8 +787,8 @@ def __call__(self, text: str, images: Image | list[Image], **kwargs):
     img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     hf_model.model.img_context_token_id = img_context_token_id
     hf_model.processor = SkyworkR1VProcessor(hf_model)
-    hf_model.model.get_output_embeddings = (
-        lambda: hf_model.model.language_model.get_output_embeddings()
+    hf_model.model.get_output_embeddings = lambda: (
+        hf_model.model.language_model.get_output_embeddings()
     )
     hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
     return hf_model
@@ -890,8 +895,8 @@ def __call__(
     img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     hf_model.model.img_context_token_id = img_context_token_id
     hf_model.processor = InternVLProcessor(hf_model)
-    hf_model.model.get_output_embeddings = (
-        lambda: hf_model.model.language_model.get_output_embeddings()
+    hf_model.model.get_output_embeddings = lambda: (
+        hf_model.model.language_model.get_output_embeddings()
     )
     hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
     return hf_model
@@ -1029,6 +1034,7 @@ def _generate(self, *args, image_sizes=None, **kwargs):
 def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for Molmo."""
     hf_processor = hf_model.processor
+    assert hf_processor is not None
 
     def _processor(*args, **kwargs):
         return hf_processor.process(*args, **kwargs)
@@ -1060,8 +1066,8 @@ def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
 
 def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for Ovis2."""
-    hf_model.model.get_output_embeddings = (
-        lambda: hf_model.model.llm.get_output_embeddings()
+    hf_model.model.get_output_embeddings = lambda: (
+        hf_model.model.llm.get_output_embeddings()
     )
 
     def processor(*args, text="", images=None, **kwargs):
@@ -1096,8 +1102,8 @@ def processor(*args, text="", images=None, **kwargs):
 
 def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for Ovis2."""
-    hf_model.model.get_output_embeddings = (
-        lambda: hf_model.model.llm.get_output_embeddings()
+    hf_model.model.get_output_embeddings = lambda: (
+        hf_model.model.llm.get_output_embeddings()
     )
 
     def processor(*args, text="", images=None, videos=None, **kwargs):
@@ -1160,6 +1166,7 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
 def qwen3_vl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for GLM4.1V."""
     hf_processor = hf_model.processor
+    assert hf_processor is not None
 
     def processor(*args, videos=None, **kwargs):
         if videos is not None and is_list_of(videos, tuple):
@@ -1211,6 +1218,7 @@ def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     vision_encoder_info = get_vision_encoder_info(hf_model.config)
 
     hf_processor = hf_model.processor
+    assert hf_processor is not None
     if hf_processor.patch_size is None:
         hf_processor.patch_size = vision_encoder_info.get_patch_size()
 
@@ -1287,18 +1295,17 @@ def _encode_vision(pixel_values, tilings):
         pv = pv.to(device=device, dtype=dtype)
 
         features = native_model._vis_enc(pv)
-        grid_size = (
-            config.vision.crop_size // config.vision.enc_patch_size
-        )
+        grid_size = config.vision.crop_size // config.vision.enc_patch_size
         global_feat = features[0]
 
         if features.shape[0] > 1 and tilings is not None:
             tiling = _normalize_tiling(tilings)
-            local = features[1:].view(
-                -1, grid_size, grid_size, config.vision.enc_dim
-            )
+            local = features[1:].view(-1, grid_size, grid_size, config.vision.enc_dim)
             reconstructed = reconstruct_from_crops(
-                local, tiling, config.vision.overlap_margin, patch_size=1,
+                local,
+                tiling,
+                config.vision.overlap_margin,
+                patch_size=1,
             )
         else:
             reconstructed = global_feat.view(
@@ -1362,20 +1369,14 @@ def _generate(
 
             # --- Prefill BOS + vision embeddings ---
             bos_emb = F.embedding(
-                torch.tensor(
-                    [[config.tokenizer.bos_id]], device=device
-                ),
+                torch.tensor([[config.tokenizer.bos_id]], device=device),
                 native_model.text.wte,
             )
-            img_input = torch.cat(
-                [bos_emb, img_emb.unsqueeze(0)], dim=1
-            )
+            img_input = torch.cat([bos_emb, img_emb.unsqueeze(0)], dim=1)
             prefix_len = img_input.size(1)  # 730
 
             mask = native_model.attn_mask[:, :, :prefix_len, :]
-            pos_ids = torch.arange(
-                prefix_len, dtype=torch.long, device=device
-            )
+            pos_ids = torch.arange(prefix_len, dtype=torch.long, device=device)
             native_model._prefill(img_input, mask, pos_ids, None)
 
             # --- Extract prompt tokens after BOS + <image> ---
@@ -1391,7 +1392,7 @@ def _generate(
                     )
                 return sequences
 
-            prompt_tokens = ids[img_start + len(image_placeholder_ids):]
+            prompt_tokens = ids[img_start + len(image_placeholder_ids) :]
 
             # --- Prefill prompt tokens and get first logits ---
             if not prompt_tokens:
@@ -1403,35 +1404,23 @@ def _generate(
                     )
                 return sequences
 
-            prompt_tensor = torch.tensor(
-                [prompt_tokens], device=device
-            )
-            prompt_emb = F.embedding(
-                prompt_tensor, native_model.text.wte
-            )
+            prompt_tensor = torch.tensor([prompt_tokens], device=device)
+            prompt_emb = F.embedding(prompt_tensor, native_model.text.wte)
             prompt_len = prompt_emb.size(1)
 
-            mask = native_model.attn_mask[
-                :, :, prefix_len : prefix_len + prompt_len, :
-            ]
+            mask = native_model.attn_mask[:, :, prefix_len : prefix_len + prompt_len, :]
             pos_ids = torch.arange(
                 prefix_len,
                 prefix_len + prompt_len,
                 dtype=torch.long,
                 device=device,
             )
-            hidden = native_model._prefill(
-                prompt_emb, mask, pos_ids, None
-            )
+            hidden = native_model._prefill(prompt_emb, mask, pos_ids, None)
             pos = prefix_len + prompt_len
 
             # Compute logits from last hidden state
-            hidden_last = native_model.text.post_ln(
-                hidden[:, -1:, :]
-            )
-            logits = native_model.text.lm_head(
-                hidden_last.squeeze(1)
-            )
+            hidden_last = native_model.text.post_ln(hidden[:, -1:, :])
+            logits = native_model.text.lm_head(hidden_last.squeeze(1))
 
             # --- Greedy decode ---
             generated = []
@@ -1456,22 +1445,12 @@ def _generate(
                     torch.tensor([[next_token]], device=device),
                     native_model.text.wte,
                 )
-                mask = native_model.attn_mask[
-                    :, :, pos : pos + 1, :
-                ]
-                pos_ids_step = torch.tensor(
-                    [pos], dtype=torch.long, device=device
-                )
-                hidden = native_model._prefill(
-                    next_emb, mask, pos_ids_step, None
-                )
-                hidden_last = native_model.text.post_ln(
-                    hidden[:, -1:, :]
-                )
+                mask = native_model.attn_mask[:, :, pos : pos + 1, :]
+                pos_ids_step = torch.tensor([pos], dtype=torch.long, device=device)
+                hidden = native_model._prefill(next_emb, mask, pos_ids_step, None)
+                hidden_last = native_model.text.post_ln(hidden[:, -1:, :])
                 prev_hs = hidden_last
-                logits = native_model.text.lm_head(
-                    hidden_last.squeeze(1)
-                )
+                logits = native_model.text.lm_head(hidden_last.squeeze(1))
                 pos += 1
 
             result_ids = ids + generated
@@ -1480,8 +1459,7 @@ def _generate(
             if return_dict:
                 return types.SimpleNamespace(
                     sequences=sequences,
-                    hidden_states=tuple(all_hidden_states)
-                    if output_hs else None,
+                    hidden_states=tuple(all_hidden_states) if output_hs else None,
                 )
             return sequences
 
@@ -1514,6 +1492,7 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner":
     import soundfile as sf
 
     processor = hf_model.processor
+    assert processor is not None
 
     def _audio_to_base64(audio_array, sample_rate: int) -> str:
         """Encode a numpy audio array as a base64 WAV string."""
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
@@ -661,12 +661,11 @@ def process_outputs(
                 if (
                     isinstance(text_override, str)
                     and finish_reason is not None
-                    and hasattr(request_output, "outputs")
+                    and isinstance(request_output, RequestOutput)
                     and request_output.outputs
                 ):
                     for comp_output in request_output.outputs:
-                        if isinstance(comp_output, CompletionOutput):
-                            comp_output.text = text_override
+                        comp_output.text = text_override
 
                 if req_state.streaming_input:
                     request_output.finished = False