[Minor][Refactor] Pass seq_token_counts explicitly (#1425)

gcanlin · hsliuustc0106 · web-flow · commit af11b02081fa · 2026-02-24T14:10:00.000+08:00
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py
@@ -15,7 +15,6 @@
     Qwen3OmniMoeThinkerConfig,
 )
 from vllm.config import VllmConfig
-from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal, SupportsPP
@@ -355,13 +354,14 @@ def forward(
 
         # ========== Stage 3: Code2Wav ==========
         elif self.model_stage == "code2wav":
+            seq_token_counts: list[int] | None = kwargs.get("seq_token_counts")
+
             # Extract codec codes from input
             if input_ids.shape[0] % 16 == 0:
-                ubatch_slices = get_forward_context().ubatch_slices
-                if ubatch_slices is not None:
-                    max_seq_len = max(ubatch_slices) // 16
-                    batch_size = len(ubatch_slices)
-                    split_codes = torch.split(input_ids, ubatch_slices, dim=0)
+                if seq_token_counts is not None:
+                    max_seq_len = max(seq_token_counts) // 16
+                    batch_size = len(seq_token_counts)
+                    split_codes = torch.split(input_ids, seq_token_counts, dim=0)
                     codes = torch.zeros((batch_size, 16, max_seq_len), device=input_ids.device, dtype=input_ids.dtype)
                     for idx, code in enumerate(split_codes):
                         seq_len = code.shape[0] // 16
@@ -386,7 +386,7 @@ def forward(
                 codes = input_ids_flatten.reshape(1, 16, -1)
 
             # Generate audio from codec codes
-            audio_tensors = self.generate_audio(codes, voice_type)
+            audio_tensors = self.generate_audio(codes, voice_type, seq_token_counts)
 
             return audio_tensors
 
@@ -458,16 +458,22 @@ def make_omni_output(self, model_outputs: torch.Tensor | OmniOutput, **kwargs) -
 
     # ==================== Audio Generation ====================
 
-    def generate_audio(self, code: torch.Tensor, voice_type: str) -> list[torch.Tensor]:
+    def generate_audio(
+        self,
+        code: torch.Tensor,
+        voice_type: str,
+        seq_token_counts: list[int] | None = None,
+    ) -> list[torch.Tensor]:
         """
         Generate audio waveform from codec codes.
 
         Args:
-            code: [8, T] - 8-layer RVQ codec codes
+            code: [batch, num_quantizers, T] - RVQ codec codes
             voice_type: Voice type (not used in Qwen3, kept for compatibility)
+            seq_token_counts: Token count for each request in batch
 
         Returns:
-            audio_tensor: [1, waveform_len] - Audio waveform
+            list of audio waveforms
         """
         code2wav_dev = self._module_device(self.code2wav)
 
@@ -491,13 +497,15 @@ def generate_audio(self, code: torch.Tensor, voice_type: str) -> list[torch.Tens
                 talker_codes,
                 chunk_size=25,
                 left_context_size=25,
+                seq_token_counts=seq_token_counts,
             )
         else:
             # Use chunked decode for memory efficiency
             audio_tensors = self.code2wav.chunked_decode(
                 talker_codes,
                 chunk_size=300,
                 left_context_size=25,
+                seq_token_counts=seq_token_counts,
             )
 
         return audio_tensors
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_code2wav.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_code2wav.py
@@ -22,7 +22,6 @@
     SnakeBeta,
 )
 from vllm.config import VllmConfig  # type: ignore
-from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger  # type: ignore
 from vllm.model_executor.models.utils import (  # type: ignore
     AutoWeightsLoader,
@@ -163,6 +162,7 @@ def chunked_decode(
         codes: torch.Tensor,
         chunk_size: int = 300,
         left_context_size: int = 25,
+        seq_token_counts: list[int] | None = None,
     ) -> list[torch.Tensor]:
         """
         Decode long sequences in chunks to avoid OOM.
@@ -173,6 +173,7 @@ def chunked_decode(
             codes: [batch, num_quantizers, seq_len] - num_quantizers-layer RVQ codes
             chunk_size: Number of codec frames per chunk
             left_context_size: Number of overlapping frames for context
+            seq_token_counts: Token count for each request in batch
 
         Returns:
             list[torch.Tensor]: Complete waveform decoded from the input
@@ -197,12 +198,10 @@ def chunked_decode(
 
             start_index = end_index
 
-        ubatch_slices = get_forward_context().ubatch_slices
-        if ubatch_slices is not None:
-            code_seq_lens = [seq_len // self.config.num_quantizers for seq_len in ubatch_slices]
+        if seq_token_counts is not None:
+            code_seq_lens = [seq_len // self.config.num_quantizers for seq_len in seq_token_counts]
         else:
             # Fallback: assume all batch elements share the same sequence length.
-            # Create one entry per batch so that each element is processed.
             code_seq_lens = [codes.shape[-1]] * codes.shape[0]
         batch_wav = torch.cat(wavs, dim=-1)
         wavs = []
@@ -216,6 +215,7 @@ def chunked_decode_streaming(
         codes: torch.Tensor,
         chunk_size: int = 25,
         left_context_size: int = 25,
+        seq_token_counts: list[int] | None = None,
     ) -> list[torch.Tensor]:
         """
         Decode long sequences in chunks to avoid OOM.
@@ -226,21 +226,19 @@ def chunked_decode_streaming(
             codes: [batch, num_quantizers, seq_len] - num_quantizers-layer RVQ codes
             chunk_size: Number of codec frames per chunk
             left_context_size: Number of overlapping frames for context
+            seq_token_counts: Token count for each request in batch
 
         Returns:
             list[torch.Tensor]: Complete waveform decoded from the input
                 codes. For ``batch_size == 1``, this is a list containing a
                 single tensor with shape ``[1, waveform_len]``.
         """
-        # Decode chunk
         wavs = []
         batch_wav = self(codes)
-        ubatch_slices = get_forward_context().ubatch_slices
-        if ubatch_slices is not None:
-            code_seq_lens = [seq_len // self.config.num_quantizers for seq_len in ubatch_slices]
+        if seq_token_counts is not None:
+            code_seq_lens = [n // self.config.num_quantizers for n in seq_token_counts]
         else:
             # Fallback: assume all batch elements share the same sequence length.
-            # Create one entry per batch so that each element is processed.
             code_seq_lens = [codes.shape[-1]] * codes.shape[0]
         for idx, code_seq_len in enumerate(code_seq_lens):
             # TODO: need to optimize algorithms, current only support
diff --git a/vllm_omni/worker/gpu_generation_model_runner.py b/vllm_omni/worker/gpu_generation_model_runner.py
@@ -250,6 +250,9 @@ def execute_model(
                 intermediate_tensors,
             )
 
+            # [Omni] Pass token counts per request for code2wav output slicing
+            model_kwargs["seq_token_counts"] = tokens
+
         # Set cudagraph mode to none if calc_kv_scales is true.
         # KV scales calculation involves dynamic operations that are incompatible
         # with CUDA graph capture.
@@ -258,10 +261,6 @@ def execute_model(
             # Mark KV scales as calculated after the first forward pass
             self.calculate_kv_scales = False
 
-        if ubatch_slices_padded is None:
-            # reuse ubatch_slices_padded for code2wav batching
-            ubatch_slices_padded = tokens
-
         # Run the model.
         # Use persistent buffers for CUDA graphs.
         with (