
Commit 220cd59

princepride authored and hsliuustc0106 committed
[TeaCache]: Add Coefficient Estimation (#940)
Signed-off-by: princepride <wangzhipeng628@gmail.com>
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
1 parent 7fb15a1 commit 220cd59

File tree

2 files changed

+342
-1
lines changed


docs/contributing/model/adding_diffusion_model.md

Lines changed: 145 additions & 1 deletion
@@ -204,7 +204,151 @@ Key point for writing the example:

Save or display the generated results so users can validate the integration.

## Step 5: TeaCache Coefficient Estimation (Optional)

If your model supports TeaCache acceleration, you need to estimate the polynomial coefficients for optimal caching performance.

### 5.1 Add Extractor Function

First, implement an extractor function in `vllm_omni/diffusion/cache/teacache/extractors.py`. The extractor computes the modulated input (used for the cache decision) and defines how to run the transformer blocks:

```python
def extract_your_model_context(
    module: nn.Module,
    hidden_states: torch.Tensor,
    timestep: torch.Tensor,
    **kwargs: Any,
) -> CacheContext:
    # 1. Preprocessing
    temb = module.time_embed(timestep)

    # 2. Extract modulated input (for cache decision)
    modulated_input = module.transformer_blocks[0].norm1(hidden_states, temb)

    # 3. Define transformer execution
    def run_transformer_blocks():
        h = hidden_states
        for block in module.transformer_blocks:
            h = block(h, temb=temb)
        return (h,)

    # 4. Define postprocessing
    def postprocess(h):
        return module.proj_out(module.norm_out(h, temb))

    return CacheContext(
        modulated_input=modulated_input,
        hidden_states=hidden_states,
        encoder_hidden_states=None,
        temb=temb,
        run_transformer_blocks=run_transformer_blocks,
        postprocess=postprocess,
    )
```

Register it in `EXTRACTOR_REGISTRY`:

```python
EXTRACTOR_REGISTRY = {
    ...
    "YourTransformer2DModel": extract_your_model_context,
}
```
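For orientation, the `CacheContext` consumed above can be pictured as a small container type. This is an illustrative sketch only; the real class lives in the TeaCache module and may carry additional fields:

```python
from dataclasses import dataclass
from typing import Any, Callable, Optional


@dataclass
class CacheContext:
    """Sketch of the bundle an extractor hands to the caching machinery."""

    modulated_input: Any                         # tensor driving the cache decision
    hidden_states: Any                           # main transformer input
    encoder_hidden_states: Optional[Any]         # cross-attention stream, if any
    temb: Any                                    # timestep embedding
    run_transformer_blocks: Callable[[], tuple]  # executes the full block stack
    postprocess: Callable[[Any], Any]            # maps hidden states to model output
```

Broadly, the caching layer compares `modulated_input` across timesteps; when the estimated change is small it can skip `run_transformer_blocks` and reuse cached results before calling `postprocess`.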
### 5.2 Add Adapter for Coefficient Estimation

Add an adapter in `vllm_omni/diffusion/cache/teacache/coefficient_estimator.py`:

```python
class YourModelAdapter:
    @staticmethod
    def load_pipeline(model_path: str, device: str, dtype: torch.dtype) -> Any:
        # Load your pipeline
        ...

    @staticmethod
    def get_transformer(pipeline: Any) -> tuple[Any, str]:
        return pipeline.transformer, "YourTransformer2DModel"

    @staticmethod
    def install_hook(transformer: Any, hook: DataCollectionHook) -> None:
        registry = HookRegistry.get_or_create(transformer)
        registry.register_hook(hook._HOOK_NAME, hook)


_MODEL_ADAPTERS["YourModel"] = YourModelAdapter
```
### 5.3 Run Coefficient Estimation

Use the provided script to estimate coefficients:

```python
from datasets import load_dataset
from tqdm import tqdm

from vllm_omni.diffusion.cache.teacache.coefficient_estimator import (
    TeaCacheCoefficientEstimator,
)

# Load model
estimator = TeaCacheCoefficientEstimator(
    model_path="/path/to/model",
    model_type="Bagel",  # Your model type
    device="cuda",
)

# Load prompts (the TeaCache paper suggests ~70 prompts)
dataset = load_dataset("nateraw/parti-prompts", split="train")
prompts = dataset["Prompt"][:70]

# Collect data
for prompt in tqdm(prompts):
    estimator.collect_from_prompt(prompt, num_inference_steps=50)

# Estimate coefficients
coeffs = estimator.estimate(poly_order=4)
print(f"Coefficients: {coeffs}")
```
### 5.4 Interpreting Coefficient Estimation Results

The estimator prints data statistics and returns the polynomial coefficients. Here is how to interpret them:

**Example Output:**

```
Data statistics:
  Count: 48
  Input Diffs (x): min=1.1089e-02, max=5.2555e-02, mean=2.8435e-02
  Output Diffs (y): min=2.8242e-02, max=2.9792e-01, mean=7.0312e-02
Coefficients: [1333131.29, -168644.23, 7950.51, -163.75, 1.26]
```

**What to Check:**

- **Count**: Number of timestep pairs analyzed. It should be at least 30-50 for a reliable estimate; a low count suggests too few prompts or inference steps.
- **Input/Output Ranges**: Verify that output differences correlate with input differences. If the ranges look unusual, check your prompt diversity.
- **Coefficient Magnitude**: Extremely large values (>1e8) may indicate numerical instability; try collecting more diverse data.

**Troubleshooting:** If the results seem unreliable, try:

- Increasing the number of prompts (100+ recommended)
- Using more diverse prompts from multiple datasets
- Adjusting `num_inference_steps` (try 20, 50, 100)
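The Input/Output Diffs above are relative L1 distances between consecutive timesteps (Eq. 4 of the TeaCache paper). A self-contained sketch of how one `(x, y)` data point is produced, with dummy arrays standing in for the real modulated inputs and model outputs:

```python
import numpy as np

_EPSILON = 1e-6


def calculate_relative_l1(tensor_current: np.ndarray, tensor_next: np.ndarray) -> float:
    """Relative L1 distance (Eq. 4 from the TeaCache paper)."""
    diff = np.abs(tensor_current - tensor_next).sum()
    norm = np.abs(tensor_current).sum() + _EPSILON
    return diff / norm


# Modulated inputs at steps t and t+1 (dummy values).
mod_t = np.array([1.0, 2.0, 3.0])
mod_t1 = np.array([1.1, 2.2, 2.7])

# Model outputs at steps t and t+1 (dummy values).
out_t = np.array([0.5, -0.5, 1.0])
out_t1 = np.array([0.7, -0.4, 0.6])

x = calculate_relative_l1(mod_t, mod_t1)  # one input diff  (~0.1)
y = calculate_relative_l1(out_t, out_t1)  # one output diff (~0.35)
```

Each prompt contributes one such pair per adjacent timestep, and the polynomial is fit over all pairs pooled across prompts.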
### 5.5 Add Coefficients to Config

Add the estimated coefficients to `vllm_omni/diffusion/cache/teacache/config.py`:

```python
_MODEL_COEFFICIENTS = {
    ...
    "YourTransformer2DModel": [
        1.04730573e+06,   # a4
        -1.34150749e+05,  # a3
        6.51517806e+03,   # a2
        -1.41209108e+02,  # a1
        1.17241808e+00,   # a0
    ],
}
```
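At inference time, coefficients in this order (highest power first) can be fed directly to `np.polyval` on the relative L1 change of the modulated input, and the estimated output change accumulated against a threshold. The sketch below shows that decision rule in the accumulate-and-compare style of TeaCache; the threshold value and function name here are illustrative, not the tuned defaults:

```python
import numpy as np

# Estimated coefficients [a4, a3, a2, a1, a0], as stored in _MODEL_COEFFICIENTS.
coeffs = [1.04730573e+06, -1.34150749e+05, 6.51517806e+03, -1.41209108e+02, 1.17241808e+00]


def should_reuse_cache(rel_l1: float, accumulated: float, threshold: float = 0.2):
    """Map the input change to an estimated output change, accumulate, compare."""
    est_output_change = abs(float(np.polyval(coeffs, rel_l1)))
    accumulated += est_output_change
    if accumulated < threshold:
        return True, accumulated  # small estimated change: reuse cached result
    return False, 0.0             # recompute the blocks and reset the accumulator


reuse, acc = should_reuse_cache(rel_l1=0.02, accumulated=0.0)
```

The accumulator prevents many individually small changes from compounding into a large unnoticed drift.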
## Step 6: Open a Pull Request

When submitting a pull request to add support for a new model, please include the following information in the PR description:

Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import types
from typing import Any

import numpy as np
import torch
from vllm.config import LoadConfig

from vllm_omni.diffusion.cache.teacache.extractors import get_extractor
from vllm_omni.diffusion.data import OmniDiffusionConfig
from vllm_omni.diffusion.hooks import HookRegistry, ModelHook
from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
from vllm_omni.diffusion.models.bagel.pipeline_bagel import BagelPipeline


class DataCollectionHook(ModelHook):
    """Hook to collect modulated inputs and model outputs for TeaCache coefficient estimation."""

    _HOOK_NAME = "teacache_collector"

    def __init__(self, transformer_type: str):
        super().__init__()
        self.transformer_type = transformer_type
        self.extractor_fn = None
        self.current_trajectory: list[tuple[np.ndarray, np.ndarray]] = []

    def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
        self.extractor_fn = get_extractor(self.transformer_type)
        return module

    def new_forward(self, module: torch.nn.Module, *args: Any, **kwargs: Any) -> Any:
        ctx = self.extractor_fn(module, *args, **kwargs)
        modulated_input_cpu = ctx.modulated_input.detach().cpu().numpy()

        outputs = ctx.run_transformer_blocks()
        ctx.hidden_states = outputs[0]
        if len(outputs) > 1 and ctx.encoder_hidden_states is not None:
            ctx.encoder_hidden_states = outputs[1]

        model_output_cpu = ctx.hidden_states.detach().cpu().numpy()
        self.current_trajectory.append((modulated_input_cpu, model_output_cpu))

        return ctx.postprocess(ctx.hidden_states)

    def start_collection(self):
        self.current_trajectory = []

    def stop_collection(self) -> list[tuple[np.ndarray, np.ndarray]]:
        return list(self.current_trajectory)
class BagelAdapter:
    """Adapter for the Bagel model."""

    @staticmethod
    def load_pipeline(model_path: str, device: str = "cuda", dtype: torch.dtype = torch.bfloat16) -> BagelPipeline:
        od_config = OmniDiffusionConfig.from_kwargs(model=model_path, dtype=dtype)
        od_config.model_class_name = "BagelPipeline"

        pipeline = BagelPipeline(od_config=od_config)
        loader = DiffusersPipelineLoader(LoadConfig())
        loader.load_weights(pipeline)
        pipeline.to(device)
        return pipeline

    @staticmethod
    def get_transformer(pipeline: Any) -> tuple[Any, str]:
        return pipeline.bagel, "Bagel"

    @staticmethod
    def install_hook(transformer: Any, hook: DataCollectionHook) -> None:
        # Expose _forward_flow as `forward` so the hook registry can wrap it,
        # then point _forward_flow at the hooked forward.
        original_forward_flow = transformer._forward_flow

        def forward_alias(self, *args, **kwargs):
            return original_forward_flow(*args, **kwargs)

        transformer.forward = types.MethodType(forward_alias, transformer)
        registry = HookRegistry.get_or_create(transformer)
        registry.register_hook(hook._HOOK_NAME, hook)
        transformer._forward_flow = transformer.forward
class DefaultAdapter:
    """Default adapter for standard diffusers pipelines."""

    @staticmethod
    def load_pipeline(model_path: str, device: str, dtype: torch.dtype) -> Any:
        raise NotImplementedError("DefaultAdapter.load_pipeline not implemented")

    @staticmethod
    def get_transformer(pipeline: Any) -> tuple[Any, str]:
        return pipeline.transformer, pipeline.transformer.__class__.__name__

    @staticmethod
    def install_hook(transformer: Any, hook: DataCollectionHook) -> None:
        registry = HookRegistry.get_or_create(transformer)
        registry.register_hook(hook._HOOK_NAME, hook)


_MODEL_ADAPTERS: dict[str, type] = {
    "Bagel": BagelAdapter,
}
_EPSILON = 1e-6


def calculate_relative_l1(tensor_current: np.ndarray, tensor_next: np.ndarray) -> float:
    """Calculate relative L1 distance (Eq. 4 from TeaCache paper)."""
    diff = np.abs(tensor_current - tensor_next).sum()
    norm = np.abs(tensor_current).sum() + _EPSILON
    return diff / norm


def estimate_teacache_coefficients(
    collected_data: list[list[tuple[np.ndarray, np.ndarray]]], poly_order: int = 4
) -> list[float]:
    """Estimate polynomial coefficients for TeaCache using np.polyfit."""
    input_diffs, output_diffs = [], []

    for sample in collected_data:
        for t in range(len(sample) - 1):
            feat_in_curr, feat_out_curr = sample[t]
            feat_in_next, feat_out_next = sample[t + 1]
            input_diffs.append(calculate_relative_l1(feat_in_curr, feat_in_next))
            output_diffs.append(calculate_relative_l1(feat_out_curr, feat_out_next))

    x = np.array(input_diffs, dtype=np.float64)
    y = np.array(output_diffs, dtype=np.float64)

    print("Data statistics:")
    print(f"  Count: {len(x)}")
    print(f"  Input Diffs (x): min={x.min():.4e}, max={x.max():.4e}, mean={x.mean():.4e}")
    print(f"  Output Diffs (y): min={y.min():.4e}, max={y.max():.4e}, mean={y.mean():.4e}")

    return np.polyfit(x, y, poly_order).tolist()
class TeaCacheCoefficientEstimator:
    """Model-agnostic helper class to collect data and estimate TeaCache coefficients."""

    def __init__(
        self,
        model_path: str,
        model_type: str = "Bagel",
        device: str = "cuda",
        dtype: torch.dtype = torch.bfloat16,
    ):
        # Validate model_type up front so unsupported models fail with a clear error.
        if model_type not in _MODEL_ADAPTERS:
            available_types = list(_MODEL_ADAPTERS.keys())
            raise ValueError(
                f"Unsupported model_type: '{model_type}'. "
                f"Available types: {available_types}. "
                f"To add support for a new model, add an entry to _MODEL_ADAPTERS."
            )

        adapter = _MODEL_ADAPTERS.get(model_type, DefaultAdapter)
        self.pipeline = adapter.load_pipeline(model_path, device, dtype)
        self.transformer, self.transformer_type = adapter.get_transformer(self.pipeline)
        self.hook = DataCollectionHook(self.transformer_type)
        self.collected_data: list[list[tuple[np.ndarray, np.ndarray]]] = []
        adapter.install_hook(self.transformer, self.hook)

    def collect_from_prompt(self, prompt: str, **generate_kwargs):
        self.hook.start_collection()
        from vllm_omni.diffusion.request import OmniDiffusionRequest

        req = OmniDiffusionRequest(
            prompt=prompt,
            num_inference_steps=generate_kwargs.get("num_inference_steps", 20),
            seed=generate_kwargs.get("seed", 42),
        )
        self.pipeline.forward(req)
        trajectory = self.hook.stop_collection()
        if trajectory:
            self.collected_data.append(trajectory)

    def estimate(self, poly_order: int = 4) -> list[float]:
        """Estimate polynomial coefficients from collected data.

        Args:
            poly_order: Order of polynomial fit (default: 4)

        Returns:
            List of polynomial coefficients [a_n, a_{n-1}, ..., a_1, a_0]

        Raises:
            RuntimeError: If no data has been collected
        """
        if not self.collected_data:
            raise RuntimeError(
                "No data collected for coefficient estimation. "
                "Call collect_from_prompt() at least once before calling estimate()."
            )
        return estimate_teacache_coefficients(self.collected_data, poly_order)
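Note that `np.polyfit` returns coefficients highest power first, which is exactly the `[a_n, ..., a1, a0]` ordering stored in `_MODEL_COEFFICIENTS` and consumed by `np.polyval`. A self-contained sanity check of that convention on synthetic data (an order-2 polynomial for brevity):

```python
import numpy as np

# Ground-truth relationship y = 2*x^2 + 0.5*x + 0.01 over a typical
# input-diff range (compare the x statistics printed by the estimator).
rng = np.random.default_rng(0)
x = rng.uniform(0.01, 0.06, size=200)
y = 2.0 * x**2 + 0.5 * x + 0.01

# polyfit recovers [a2, a1, a0], highest power first.
coeffs = np.polyfit(x, y, 2)  # ≈ [2.0, 0.5, 0.01]

# polyval consumes the same ordering, as the cache does at inference time.
assert np.isclose(np.polyval(coeffs, 0.03), 2.0 * 0.03**2 + 0.5 * 0.03 + 0.01)
```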
