# Token ID constants (Qwen2.5-VL base tokenizer + MammothModa2 gen vocab)
# ---------------------------------------------------------------------------
# Qwen2.5-VL special vision token IDs (match Mammothmoda2Qwen2_5_VLTextConfig defaults)
_IMAGE_TOKEN_ID = 151655  # "<|image_pad|>"
_VIDEO_TOKEN_ID = 151656  # "<|video_pad|>"
_VISION_START_TOKEN_ID = 151652  # "<|vision_start|>"
_VISION_END_TOKEN_ID = 151653  # "<|vision_end|>"

# MammothModa2 generation vocab (from t2i_generation_config.json)
_BASE_VOCAB_SIZE = 152064  # Qwen2.5 base vocab size; also used as eol_token_id
_VISUAL_TOKEN_START_ID = 152072  # first visual generation token
_VISUAL_TOKEN_END_ID = 168456  # last visual generation token
_GEN_VOCAB_SIZE = 32800  # size of the visual generation vocabulary

# AR stage image grid: each token covers _AR_PATCH_SIZE x _AR_PATCH_SIZE pixels
_AR_PATCH_SIZE = 16
# AR sampling top-k covers the full visual generation vocabulary
# Paths to the bundled example assets; each is overridable via an environment
# variable so the suite can point at a locally downloaded model/config.
# NOTE(review): parents[3] assumes this file sits four levels below the repo
# root — confirm if the test file is ever moved.
_EXAMPLE_DIR = Path(__file__).resolve().parents[3] / "examples" / "offline_inference" / "mammothmodal2_preview"
MODEL_PATH = os.environ.get("MAMMOTHMODA2_MODEL_PATH", str(_EXAMPLE_DIR / "MammothModa2-Preview"))
T2I_STAGE_CONFIG = os.environ.get("MAMMOTHMODA2_T2I_STAGE_CONFIG", str(_EXAMPLE_DIR / "mammoth_moda2_t2i.yaml"))
SUMMARIZE_STAGE_CONFIG = os.environ.get(
    "MAMMOTHMODA2_SUMMARIZE_STAGE_CONFIG", str(_EXAMPLE_DIR / "mammoth_moda2_image_summarize.yaml")
)
5355
5456
5557def _load_t2i_gen_config (model_dir : str ) -> dict :
@@ -78,7 +80,9 @@ class TestConfigParsing:
7880 def test_autoconfig_registration (self ):
7981 """AutoConfig should resolve 'mammothmoda2' model_type."""
8082 from transformers import AutoConfig
83+
8184 from vllm_omni .model_executor .models .mammoth_moda2 .config import Mammothmoda2Config # noqa: F401
85+
8286 cfg = AutoConfig .for_model (
8387 model_type = "mammothmoda2" ,
8488 llm_config = {"model_type" : "mammothmoda2_qwen2_5_vl" },
@@ -88,24 +92,33 @@ def test_autoconfig_registration(self):
8892 def test_dual_vocab_size_computation (self ):
8993 """With extra_gen_vocab=True: vocab_size == gen_vocab_start_index + gen_vocab_size."""
9094 from vllm_omni .model_executor .models .mammoth_moda2 .config import Mammothmoda2Qwen2_5_VLTextConfig
91- tc = Mammothmoda2Qwen2_5_VLTextConfig (vocab_size = _BASE_VOCAB_SIZE , extra_gen_vocab = True , gen_vocab_size = _GEN_VOCAB_SIZE )
95+
96+ tc = Mammothmoda2Qwen2_5_VLTextConfig (
97+ vocab_size = _BASE_VOCAB_SIZE , extra_gen_vocab = True , gen_vocab_size = _GEN_VOCAB_SIZE
98+ )
9299 assert tc .gen_vocab_start_index == _BASE_VOCAB_SIZE
93100 assert tc .vocab_size == _BASE_VOCAB_SIZE + _GEN_VOCAB_SIZE
94101
95102 def test_proxy_properties (self ):
96103 """Top-level config should proxy token IDs from llm_config."""
97104 from vllm_omni .model_executor .models .mammoth_moda2 .config import Mammothmoda2Config
98- cfg = Mammothmoda2Config (llm_config = {
99- "model_type" : "mammothmoda2_qwen2_5_vl" ,
100- "image_token_id" : _IMAGE_TOKEN_ID , "video_token_id" : _VIDEO_TOKEN_ID ,
101- "vision_start_token_id" : _VISION_START_TOKEN_ID , "vision_end_token_id" : _VISION_END_TOKEN_ID ,
102- })
105+
106+ cfg = Mammothmoda2Config (
107+ llm_config = {
108+ "model_type" : "mammothmoda2_qwen2_5_vl" ,
109+ "image_token_id" : _IMAGE_TOKEN_ID ,
110+ "video_token_id" : _VIDEO_TOKEN_ID ,
111+ "vision_start_token_id" : _VISION_START_TOKEN_ID ,
112+ "vision_end_token_id" : _VISION_END_TOKEN_ID ,
113+ }
114+ )
103115 assert cfg .image_token_id == _IMAGE_TOKEN_ID
104116 assert cfg .video_token_id == _VIDEO_TOKEN_ID
105117
106118 def test_missing_llm_config_raises (self ):
107119 """Proxy property access with llm_config=None should raise AttributeError."""
108120 from vllm_omni .model_executor .models .mammoth_moda2 .config import Mammothmoda2Config
121+
109122 with pytest .raises (AttributeError , match = "llm_config is None" ):
110123 _ = Mammothmoda2Config (llm_config = None ).image_token_id
111124
@@ -153,18 +166,22 @@ def _stage(ar_outputs: list) -> list:
def _p(image_height: int = 512, image_width: int = 512, visual_ids: list[int] | None = None, **kw) -> dict:
    """Build a minimal T2I prompt dict carrying `additional_information` for the AR->DiT bridge.

    All scalar fields are wrapped in single-element lists (the processor's per-request
    batching convention). Keyword overrides in ``kw`` replace the documented defaults.
    """
    if visual_ids is None:
        # Default to the four Qwen2.5-VL special vision tokens.
        visual_ids = [_IMAGE_TOKEN_ID, _VIDEO_TOKEN_ID, _VISION_START_TOKEN_ID, _VISION_END_TOKEN_ID]
    return {
        "additional_information": {
            "omni_task": ["t2i"],
            # AR grid dims: one token per _AR_PATCH_SIZE x _AR_PATCH_SIZE pixel patch.
            "ar_width": [image_width // _AR_PATCH_SIZE],
            "ar_height": [image_height // _AR_PATCH_SIZE],
            "eol_token_id": [kw.get("eol_token_id", _BASE_VOCAB_SIZE)],
            "visual_token_start_id": [kw.get("visual_token_start_id", _VISUAL_TOKEN_START_ID)],
            "visual_token_end_id": [kw.get("visual_token_end_id", _VISUAL_TOKEN_END_ID)],
            "image_height": [image_height],
            "image_width": [image_width],
            "num_inference_steps": [kw.get("num_inference_steps", 50)],
            "text_guidance_scale": [kw.get("text_guidance_scale", 9.0)],
            # Copy so callers' cfg_range lists are never aliased into the prompt.
            "cfg_range": list(kw.get("cfg_range", [0.0, 1.0])),
            "visual_ids": visual_ids,
        }
    }
168185
169186
170187class TestAR2DitProcessor :
@@ -184,7 +201,9 @@ def test_basic_output_structure(self):
184201 def test_embed_shapes_and_dtype (self ):
185202 """text/image condition embeds must be 2D float32 with correct leading dim."""
186203 n_gen = 30
187- ar_out = _mock_ar (list (range (15 )), list (range (_VISUAL_TOKEN_START_ID , _VISUAL_TOKEN_START_ID + n_gen )) + [0 ], hidden_dim = 128 )
204+ ar_out = _mock_ar (
205+ list (range (15 )), list (range (_VISUAL_TOKEN_START_ID , _VISUAL_TOKEN_START_ID + n_gen )) + [0 ], hidden_dim = 128
206+ )
188207 info = ar2dit (_stage ([ar_out ]), engine_input_source = [0 ], prompts = [_p ()])[0 ]["additional_information" ]
189208 assert info ["image_prompt_embeds" ].shape == (n_gen , 128 )
190209 assert info ["text_prompt_embeds" ].dtype == torch .float32
@@ -227,8 +246,9 @@ def test_visual_ids_excluded_from_text_embeds(self):
227246 ar .prompt_token_ids = prompt_ids
228247 ar .outputs = [c ]
229248 ar .request_id = "req-visual"
230- info = ar2dit (_stage ([ar ]), engine_input_source = [0 ],
231- prompts = [_p (visual_ids = visual_ids )])[0 ]["additional_information" ]
249+ info = ar2dit (_stage ([ar ]), engine_input_source = [0 ], prompts = [_p (visual_ids = visual_ids )])[0 ][
250+ "additional_information"
251+ ]
232252 assert info ["text_prompt_embeds" ].shape [0 ] == 3
233253
234254
@@ -331,9 +351,7 @@ def test_mammothmoda2_t2i_e2e():
331351 f"Expected image tensor, got { type (img_tensor )} "
332352 )
333353 # DiT output is (C, H*2, W*2) or (1, C, H*2, W*2)
334- assert img_tensor .ndim in (3 , 4 ), (
335- f"Expected 3D or 4D image tensor, got { img_tensor .ndim } D"
336- )
354+ assert img_tensor .ndim in (3 , 4 ), f"Expected 3D or 4D image tensor, got { img_tensor .ndim } D"
337355 found_image = True
338356
339357 assert found_image , "No image tensor found in pipeline output"
@@ -350,6 +368,7 @@ class TestStageConfigValidation:
350368 def test_t2i_config_two_stages (self ):
351369 """T2I YAML must define exactly 2 stages (AR->latent, DiT->image) with correct wiring."""
352370 import yaml
371+
353372 if not Path (T2I_STAGE_CONFIG ).exists ():
354373 pytest .skip (f"Not found: { T2I_STAGE_CONFIG } " )
355374 with open (T2I_STAGE_CONFIG ) as f :
@@ -366,6 +385,7 @@ def test_t2i_config_two_stages(self):
366385 def test_summarize_config_single_ar_stage (self ):
367386 """Image-summarisation YAML must be a single AR stage outputting text."""
368387 import yaml
388+
369389 if not Path (SUMMARIZE_STAGE_CONFIG ).exists ():
370390 pytest .skip (f"Not found: { SUMMARIZE_STAGE_CONFIG } " )
371391 with open (SUMMARIZE_STAGE_CONFIG ) as f :
0 commit comments