[core] add torch compile for diffusion (vllm-project#684)

ZJY0516 · web-flow · commit c784b3e07d9f · 2026-01-09T04:05:44.000+13:00
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
diff --git a/vllm_omni/diffusion/compile.py b/vllm_omni/diffusion/compile.py
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+import torch.nn as nn
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def regionally_compile(model: nn.Module, *compile_args: Any, **compile_kwargs: Any) -> nn.Module:
+    """
+    Compile only the repeated sub-blocks of a model instead of the whole module.
+
+    Class names are read from ``model._repeated_blocks``; each module yielded by
+    ``model.modules()`` whose class name is listed there has ``compile`` called on it.
+
+    Args:
+        model: Module whose matching submodules are compiled
+        *compile_args: Positional arguments passed to each ``compile`` call
+        **compile_kwargs: Keyword arguments passed to each ``compile`` call
+
+    Returns:
+        The ``model`` object that was passed in
+    """
+    repeated_blocks = getattr(model, "_repeated_blocks", None)
+    if not repeated_blocks:
+        logger.warning("Regional compilation skipped because the model does not define `_repeated_blocks`.")
+        return model
+
+    # Compile every submodule whose class name is listed, counting the matches.
+    compiled_count = 0
+    for module in model.modules():
+        if module.__class__.__name__ not in repeated_blocks:
+            continue
+        module.compile(*compile_args, **compile_kwargs)
+        compiled_count += 1
+
+    if compiled_count == 0:
+        logger.warning(f"Regional compilation skipped because {repeated_blocks} classes are not found in the model.")
+    return model
diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py
@@ -303,7 +303,7 @@ class OmniDiffusionConfig:
     skip_time_steps: int = 15
 
     # Compilation
-    enable_torch_compile: bool = False
+    enforce_eager: bool = False
 
     # Enable sleep mode
     enable_sleep_mode: bool = False
diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py
@@ -8,10 +8,15 @@
 from dataclasses import dataclass
 from typing import Any
 
+import PIL.Image
 from vllm.logger import init_logger
 
 from vllm_omni.diffusion.data import SHUTDOWN_MESSAGE, OmniDiffusionConfig
-from vllm_omni.diffusion.registry import get_diffusion_post_process_func, get_diffusion_pre_process_func
+from vllm_omni.diffusion.registry import (
+    DiffusionModelRegistry,
+    get_diffusion_post_process_func,
+    get_diffusion_pre_process_func,
+)
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.diffusion.scheduler import Scheduler, scheduler
 from vllm_omni.outputs import OmniRequestOutput
@@ -20,6 +25,13 @@
 logger = init_logger(__name__)
 
 
+def supports_image_input(model_class_name: str) -> bool:
+    """Return True if the registered class sets a truthy ``support_image_input`` attribute."""
+    resolved = DiffusionModelRegistry._try_load_model_cls(model_class_name)
+    # A name that does not resolve to a class is reported as not supporting images.
+    return resolved is not None and bool(getattr(resolved, "support_image_input", False))
+
+
 @dataclass
 class BackgroundResources:
     """
@@ -70,6 +82,12 @@ def __init__(self, od_config: OmniDiffusionConfig):
         self._processes: list[mp.Process] = []
         self._closed = False
         self._make_client()
+        try:
+            self._dummy_run()
+        except Exception as e:
+            logger.error(f"Dummy run failed: {e}")
+            self.close()
+            raise e
 
     def step(self, requests: list[OmniDiffusionRequest]):
         try:
@@ -272,6 +290,30 @@ def _launch_workers(self, broadcast_handle):
     def add_req_and_wait_for_response(self, requests: list[OmniDiffusionRequest]):
         return scheduler.add_req(requests)
 
+    def _dummy_run(self):
+        """Send one single-step 1024x1024 request through the engine as a warm-up.
+
+        A black RGB image is attached when ``supports_image_input`` reports that the
+        configured model class accepts image input; otherwise no image is passed.
+        """
+        side = 1024
+        # Models without image support receive ``pil_image=None``.
+        warmup_image = None
+        if supports_image_input(self.od_config.model_class_name):
+            warmup_image = PIL.Image.new("RGB", (side, side), color=(0, 0, 0))
+        warmup_req = OmniDiffusionRequest(
+            prompt="dummy run",
+            height=side,
+            width=side,
+            pil_image=warmup_image,
+            num_inference_steps=1,
+            num_outputs_per_prompt=1,
+        )
+        logger.info("dummy run to warm up the model")
+        # Apply the engine's pre-processing hook when one is configured.
+        batch = [warmup_req] if self.pre_process_func is None else self.pre_process_func([warmup_req])
+        self.add_req_and_wait_for_response(batch)
+
     def collective_rpc(
         self,
         method: str | Callable,
@@ -343,22 +385,6 @@ def collective_rpc(
             logger.error(f"RPC call failed: {e}")
             raise
 
-    def _dummy_run(self):
-        """A dummy run to warm up the model."""
-        prompt = "dummy run"
-        num_inference_steps = 1
-        height = 1024
-        width = 1024
-        req = OmniDiffusionRequest(
-            prompt=prompt,
-            height=height,
-            width=width,
-            num_inference_steps=num_inference_steps,
-            num_outputs_per_prompt=1,
-        )
-        logger.info("dummy run to warm up the model")
-        self.add_req_and_wait_for_response([req])
-
     def close(self) -> None:
         self._finalizer()
 
diff --git a/vllm_omni/diffusion/models/interface.py b/vllm_omni/diffusion/models/interface.py
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import (
+    ClassVar,
+    Protocol,
+    runtime_checkable,
+)
+
+
+@runtime_checkable
+class SupportImageInput(Protocol):  # marker protocol; pipeline classes opt in by listing it as a base
+    support_image_input: ClassVar[bool] = True  # class-level flag that callers can look up with getattr
diff --git a/vllm_omni/diffusion/models/longcat_image/longcat_image_transformer.py b/vllm_omni/diffusion/models/longcat_image/longcat_image_transformer.py
@@ -353,6 +353,11 @@ class LongCatImageTransformer2DModel(nn.Module):
     The Transformer model introduced in Flux.
     """
 
+    _repeated_blocks = [
+        "LongCatImageTransformerBlock",
+        "LongCatImageSingleTransformerBlock",
+    ]
+
     def __init__(
         self,
         od_config: OmniDiffusionConfig,
diff --git a/vllm_omni/diffusion/models/longcat_image/pipeline_longcat_image_edit.py b/vllm_omni/diffusion/models/longcat_image/pipeline_longcat_image_edit.py
@@ -28,6 +28,7 @@
 from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
+from vllm_omni.diffusion.models.interface import SupportImageInput
 from vllm_omni.diffusion.models.longcat_image.longcat_image_transformer import (
     LongCatImageTransformer2DModel,
 )
@@ -196,7 +197,7 @@ def split_quotation(prompt, quote_pairs=None):
     return result
 
 
-class LongCatImageEditPipeline(nn.Module):
+class LongCatImageEditPipeline(nn.Module, SupportImageInput):
     def __init__(
         self,
         *,
diff --git a/vllm_omni/diffusion/models/ovis_image/ovis_image_transformer.py b/vllm_omni/diffusion/models/ovis_image/ovis_image_transformer.py
@@ -365,6 +365,8 @@ class OvisImageTransformer2DModel(nn.Module):
             The dimensions to use for the rotary positional embeddings.
     """
 
+    _repeated_blocks = ["OvisImageTransformerBlock", "OvisImageSingleTransformerBlock"]
+
     def __init__(
         self,
         od_config: OmniDiffusionConfig,
diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py
@@ -32,6 +32,7 @@
 )
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
+from vllm_omni.diffusion.models.interface import SupportImageInput
 from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image import calculate_shift
 from vllm_omni.diffusion.models.qwen_image.qwen_image_transformer import (
     QwenImageTransformer2DModel,
@@ -195,6 +196,7 @@ def retrieve_latents(
 
 class QwenImageEditPipeline(
     nn.Module,
+    SupportImageInput,
 ):
     def __init__(
         self,
diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py
@@ -30,6 +30,7 @@
 )
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
+from vllm_omni.diffusion.models.interface import SupportImageInput
 from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image import calculate_shift
 from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image_edit import (
     calculate_dimensions,
@@ -156,9 +157,7 @@ def post_process_func(
     return post_process_func
 
 
-class QwenImageEditPlusPipeline(
-    nn.Module,
-):
+class QwenImageEditPlusPipeline(nn.Module, SupportImageInput):
     def __init__(
         self,
         *,
diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py
@@ -29,6 +29,7 @@
 )
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
+from vllm_omni.diffusion.models.interface import SupportImageInput
 from vllm_omni.diffusion.models.qwen_image.autoencoder_kl_qwenimage import (
     AutoencoderKLQwenImage,
 )
@@ -170,9 +171,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class QwenImageLayeredPipeline(
-    nn.Module,
-):
+class QwenImageLayeredPipeline(nn.Module, SupportImageInput):
     def __init__(
         self,
         *,
diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py
@@ -677,6 +677,11 @@ class QwenImageTransformer2DModel(CachedTransformer):
             The dimensions to use for the rotary positional embeddings.
     """
 
+    # the small and frequently-repeated block(s) of a model
+    # -- typically a transformer layer
+    # used for torch compile optimizations
+    _repeated_blocks = ["QwenImageTransformerBlock"]
+
     def __init__(
         self,
         od_config: OmniDiffusionConfig,
diff --git a/vllm_omni/diffusion/models/sd3/sd3_transformer.py b/vllm_omni/diffusion/models/sd3/sd3_transformer.py
@@ -321,6 +321,8 @@ class SD3Transformer2DModel(nn.Module):
     The Transformer model introduced in [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).
     """
 
+    _repeated_blocks = ["SD3TransformerBlock"]
+
     def __init__(
         self,
         od_config: OmniDiffusionConfig,
diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py
@@ -515,6 +515,8 @@ class WanTransformer3DModel(nn.Module):
         pos_embed_seq_len: Optional position embedding sequence length
     """
 
+    _repeated_blocks = ["WanTransformerBlock"]
+
     def __init__(
         self,
         patch_size: tuple[int, int, int] = (1, 2, 2),
diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py
@@ -344,6 +344,8 @@ def __call__(self, ids: torch.Tensor):
 
 
 class ZImageTransformer2DModel(nn.Module):
+    _repeated_blocks = ["ZImageTransformerBlock"]
+
     def __init__(
         self,
         all_patch_size=(2,),
diff --git a/vllm_omni/diffusion/worker/gpu_worker.py b/vllm_omni/diffusion/worker/gpu_worker.py
@@ -14,6 +14,7 @@
 from vllm.utils.mem_utils import DeviceMemoryProfiler, GiB_bytes
 
 from vllm_omni.diffusion.cache.selector import get_cache_backend
+from vllm_omni.diffusion.compile import regionally_compile
 from vllm_omni.diffusion.data import (
     DiffusionOutput,
     OmniDiffusionConfig,
@@ -100,6 +101,16 @@ def init_device_and_model(self) -> None:
         )
         logger.info(f"Worker {self.rank}: Model loaded successfully.")
 
+        if not self.od_config.enforce_eager:
+            try:
+                self.pipeline.transformer = regionally_compile(
+                    self.pipeline.transformer,
+                    dynamic=True,
+                )
+                logger.info(f"Worker {self.rank}: Model compiled with torch.compile.")
+            except Exception as e:
+                logger.warning(f"Worker {self.rank}: torch.compile failed with error: {e}. Using eager mode.")
+
         # Setup cache backend based on type (both backends use enable()/reset() interface)
         self.cache_backend = get_cache_backend(self.od_config.cache_backend, self.od_config.cache_config)