
Commit 0c1bb01
Merge branch 'main' into feat/vllmomni_profiling
2 parents: 4aa67aa + 1e86404

10 files changed: +16 −47 lines

docs/.nav.yml
5 additions, 6 deletions

@@ -58,12 +58,6 @@ nav:
       - contributing/model/README.md
       - contributing/model/adding_omni_model.md
       - contributing/model/adding_diffusion_model.md
-      - Advanced Features:
-          - contributing/features/cfg_parallel.md
-          - contributing/features/sequence_parallel.md
-          - contributing/features/tensor_parallel.md
-          - contributing/features/cache_dit.md
-          - contributing/features/teacache.md
     - CI: contributing/ci
 - Design Documents:
     - design/index.md
@@ -72,6 +66,11 @@ nav:
     - design/feature/disaggregated_inference.md
     - design/feature/ray_based_execution.md
     - design/feature/omni_connectors/
+    - design/feature/cfg_parallel.md
+    - design/feature/sequence_parallel.md
+    - design/feature/tensor_parallel.md
+    - design/feature/cache_dit.md
+    - design/feature/teacache.md
 - Module Design:
     - design/module/ar_module.md
     - design/module/dit_module.md

docs/assets/WeChat.jpg
Binary file changed (3.55 KB)

docs/contributing/model/adding_diffusion_model.md
6 additions, 6 deletions

@@ -1,4 +1,4 @@
-# Adding a Diffusion Model to vLLM-Omni
+# Adding a Diffusion Model

 This guide walks you through adding a new diffusion model to vLLM-Omni. We use **Qwen-Image** as the primary example, with references to other models (LongCat, Flux, Wan2.2) to illustrate different patterns.

@@ -680,7 +680,7 @@ vLLM-Omni automatically compiles blocks in `_repeated_blocks` when `torch.compil
 ### Tensor Parallelism

-See detailed guide: [How to add Tensor Parallel support](../features/tensor_parallel.md)
+See detailed guide: [How to add Tensor Parallel support](../../design/feature/tensor_parallel.md)

 **Quick setup:**

@@ -694,7 +694,7 @@ omni = Omni(model="your-model", tensor_parallel_size=2)
 ### CFG Parallelism

-See detailed guide: [How to add CFG-Parallel support](../features/cfg_parallel.md)
+See detailed guide: [How to add CFG-Parallel support](../../design/feature/cfg_parallel.md)

 **Quick setup:**

@@ -708,7 +708,7 @@ omni = Omni(model="your-model", cfg_parallel_size=2)
 ### Sequence Parallelism

-See detailed guide: [How to add Sequence Parallel support](../features/sequence_parallel.md)
+See detailed guide: [How to add Sequence Parallel support](../../design/feature/sequence_parallel.md)

 **Quick setup:**

@@ -724,7 +724,7 @@ omni = Omni(model="your-model", ulysses_degree=2, ring_degree=2)
 #### TeaCache

-See detailed guide: [How to add TeaCache support](../features/teacache.md)
+See detailed guide: [How to add TeaCache support](../../design/feature/teacache.md)

 **Quick setup:**

@@ -744,7 +744,7 @@ omni = Omni(model="your-model",
 #### Cache-DiT

-See detailed guide: [How to add Cache-DiT support](../features/cache_dit.md)
+See detailed guide: [How to add Cache-DiT support](../../design/feature/cache_dit.md)

 **Quick setup:**
docs/contributing/features/cache_dit.md renamed to docs/design/feature/cache_dit.md
1 addition, 1 deletion

@@ -1,4 +1,4 @@
-# Support Cache-DiT
+# Cache-DiT

 This section describes how to add cache-dit acceleration to a new diffusion pipeline. We use the Qwen-Image pipeline and LongCat-Image pipeline as reference implementations.

docs/contributing/features/cfg_parallel.md renamed to docs/design/feature/cfg_parallel.md
1 addition, 1 deletion

@@ -1,4 +1,4 @@
-# Support CFG-Parallel
+# CFG-Parallel

 This section describes how to add CFG-Parallel (Classifier-Free Guidance Parallel) to a diffusion pipeline. We use the Qwen-Image pipeline as the reference implementation.

docs/contributing/features/sequence_parallel.md renamed to docs/design/feature/sequence_parallel.md
1 addition, 1 deletion

@@ -1,4 +1,4 @@
-# Support Sequence Parallel
+# Sequence Parallel

 This section describes how to add Sequence Parallel (SP) to a diffusion transformer model. We use the Qwen-Image transformer and Wan2.2 transformer as reference implementations.

docs/contributing/features/teacache.md renamed to docs/design/feature/teacache.md
1 addition, 1 deletion

@@ -1,4 +1,4 @@
-# Support TeaCache
+# TeaCache

 This section describes how to add TeaCache to a diffusion transformer model. We use the Qwen-Image transformer as the reference implementation.

docs/contributing/features/tensor_parallel.md renamed to docs/design/feature/tensor_parallel.md
1 addition, 1 deletion

@@ -1,4 +1,4 @@
-# Support Tensor Parallel
+# Tensor Parallel

 This section describes how to add Tensor Parallel (TP) to a diffusion transformer model. We use the Z-Image transformer as the reference implementation.

vllm_omni/worker/gpu_ar_model_runner.py
0 additions, 24 deletions

@@ -192,15 +192,6 @@ def execute_model(
             num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
         )

-        logger.debug(
-            "Running batch with cudagraph_mode: %s, batch_descriptor: %s, "
-            "should_ubatch: %s, num_tokens_across_dp: %s",
-            cudagraph_mode,
-            batch_desc,
-            should_ubatch,
-            num_tokens_across_dp,
-        )
-
        num_tokens_padded = batch_desc.num_tokens
        num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
        ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
@@ -211,12 +202,6 @@ def execute_model(
            self.parallel_config.num_ubatches,
        )

-        logger.debug(
-            "ubatch_slices: %s, ubatch_slices_padded: %s",
-            ubatch_slices,
-            ubatch_slices_padded,
-        )
-
        pad_attn = cudagraph_mode == CUDAGraphMode.FULL

        use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
@@ -308,15 +293,6 @@ def execute_model(
            aux_hidden_states = None

        hidden_states, multimodal_outputs = self.extract_multimodal_outputs(model_output)
-        if multimodal_outputs is not None:
-            keys_or_type = (
-                list(multimodal_outputs.keys())
-                if isinstance(multimodal_outputs, dict)
-                else type(multimodal_outputs)
-            )
-            logger.debug(f"[AR] execute_model: multimodal_outputs keys = {keys_or_type}")
-        else:
-            logger.debug("[AR] execute_model: multimodal_outputs is None")

        if not self.broadcast_pp_output:
            # Common case.
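The removed blocks mixed two logging styles: lazy `%s` formatting, whose arguments are only interpolated when DEBUG is enabled, and eager f-strings, which pay the formatting cost on every call even when the record is dropped. A minimal sketch of the difference (illustrative only, not code from this repository):

```python
import logging

logger = logging.getLogger("demo")
logger.setLevel(logging.INFO)  # DEBUG is disabled, as in production

class Costly:
    """Tracks how often its string form is computed."""
    calls = 0
    def __str__(self):
        Costly.calls += 1
        return "costly"

c = Costly()

# Lazy %-style: the logger checks the level first and never calls str(c).
logger.debug("value: %s", c)
lazy_calls = Costly.calls   # still 0

# Eager f-string: str(c) runs before logger.debug is even entered.
logger.debug(f"value: {c}")
eager_calls = Costly.calls  # now 1
```

This is one reason deleting per-batch debug logs from a hot path like `execute_model` saves work even when debug logging is switched off.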

vllm_omni/worker/gpu_model_runner.py
0 additions, 6 deletions

@@ -1151,12 +1151,6 @@ def _model_forward(
        """Inject omni-specific kwargs into forward and cache model output"""
        model_kwargs_extra = self._build_model_kwargs_extra()

-        runtime_info = model_kwargs_extra.get("runtime_additional_information", [])
-        if runtime_info:
-            for i, info in enumerate(runtime_info):
-                if info:
-                    logger.debug(f"[OMNI] req[{i}] runtime_additional_information keys: {list(info.keys())}")
-
        model_output = super()._model_forward(
            input_ids=input_ids,
            positions=positions,
