Skip to content

Commit f8ac20a

Browse files
[Whisper] Enable CUDA graph support for encoder-decoder models
Replace manual BMM cross-attention with RadixAttention to enable CUDA graph capture/replay for the Whisper decode path. The encoder KV cache is now stored in the standard KV pool via the attention backend's encoder_out_cache_loc mechanism.

Key changes:
- Cross-attention uses RadixAttention with k=None, v=None during decode to read cached encoder KV from the pool
- pad_input_ids prepends dummy encoder tokens and sets num_image_tokens so prepare_encoder_info_extend allocates encoder KV cache locations
- Auto-select flashinfer backend for encoder-decoder models
- Auto-disable radix cache to avoid prefix matching conflicts
- Set encoder_len_fill_value to the actual encoder length during CUDA graph capture so cross-attention kernels are properly recorded
- Fix cross-attention seq_lens_cpu in the FlashInfer decode updater: use encoder_lens instead of decoder seq_lens to prevent global_override_indptr_cpu from overriding the correct KV length
- Add encoder_out_cache_loc support in the trtllm_mha backend
- Clamp decoder position_ids to max_target_positions

Benchmark (earnings22, 511 samples, concurrency=1):
- WER: 12.77% (identical with/without CUDA graph)
- Throughput: 3.26 req/s (+36% vs 2.40 without CUDA graph)
- Avg latency: 0.297s (-27% vs 0.406s)
1 parent 32a85ef commit f8ac20a

File tree

7 files changed

+233
-103
lines changed

7 files changed

+233
-103
lines changed

python/sglang/srt/layers/attention/flashinfer_backend.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,16 +1048,19 @@ def update_cross_attention(
10481048
fixed_split_size: Optional[int] = None,
10491049
disable_split_kv: Optional[bool] = None,
10501050
):
1051+
# Cache encoder_lens on CPU to avoid GPU→CPU transfer per call
1052+
encoder_lens_cpu = encoder_lens.cpu() if encoder_lens is not None else None
10511053
for wrapper_id in range(2):
10521054
if wrapper_id == 0:
1053-
# Normal attention
10541055
paged_kernel_lens = seq_lens
10551056
kv_start_idx = encoder_lens
1057+
kv_lens_cpu = seq_lens_cpu
10561058
else:
1057-
# Cross attention
1059+
# Cross-attention: attend to encoder tokens only
10581060
paged_kernel_lens = encoder_lens
10591061
kv_start_idx = torch.zeros_like(encoder_lens)
10601062
seq_lens_sum = encoder_lens.sum().item()
1063+
kv_lens_cpu = encoder_lens_cpu
10611064

10621065
self.call_begin_forward(
10631066
decode_wrappers[wrapper_id],
@@ -1067,7 +1070,7 @@ def update_cross_attention(
10671070
self.kv_indptr[wrapper_id],
10681071
kv_start_idx,
10691072
spec_info,
1070-
seq_lens_cpu=seq_lens_cpu,
1073+
seq_lens_cpu=kv_lens_cpu,
10711074
)
10721075

10731076
def call_begin_forward(

python/sglang/srt/layers/attention/trtllm_mha_backend.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -703,7 +703,11 @@ def forward_decode(
703703
**kwargs,
704704
) -> torch.Tensor:
705705
"""Run forward for decode using TRTLLM MHA kernel."""
706-
cache_loc = forward_batch.out_cache_loc
706+
cache_loc = (
707+
forward_batch.out_cache_loc
708+
if not layer.is_cross_attention
709+
else forward_batch.encoder_out_cache_loc
710+
)
707711

708712
use_fused_fp8_path = self._should_use_fused_fp8_path(save_kv_cache, k)
709713

@@ -788,7 +792,11 @@ def forward_extend(
788792
save_kv_cache=True,
789793
**kwargs,
790794
):
791-
cache_loc = forward_batch.out_cache_loc
795+
cache_loc = (
796+
forward_batch.out_cache_loc
797+
if not layer.is_cross_attention
798+
else forward_batch.encoder_out_cache_loc
799+
)
792800

793801
use_fused_fp8_path = self._should_use_fused_fp8_path(save_kv_cache, k)
794802

python/sglang/srt/model_executor/cuda_graph_runner.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,12 @@ def __init__(self, model_runner: ModelRunner):
590590
else self.dllm_config.block_size
591591
)
592592

593-
self.encoder_len_fill_value = 0
593+
# Non-zero encoder length ensures cross-attention kernels are captured in the graph.
594+
self.encoder_len_fill_value = (
595+
getattr(model_runner.model_config.hf_config, "max_source_positions", 0)
596+
if self.is_encoder_decoder
597+
else 0
598+
)
594599

595600
if self.enable_torch_compile:
596601
set_torch_compile_config()

python/sglang/srt/model_executor/model_runner.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2044,7 +2044,11 @@ def _dummy_run(self, batch_size: int, run_ctx=None):
20442044
is_encoder_decoder=self.model_config.is_encoder_decoder,
20452045
require_mlp_tp_gather=require_mlp_tp_gather_,
20462046
seq_len_fill_value=seq_len_fill_value,
2047-
encoder_len_fill_value=0,
2047+
encoder_len_fill_value=(
2048+
getattr(self.model_config.hf_config, "max_source_positions", 0)
2049+
if self.model_config.is_encoder_decoder
2050+
else 0
2051+
),
20482052
num_tokens_per_bs=num_tokens_per_bs,
20492053
cache_loc_dtype=torch.int64,
20502054
enable_mamba_track=False,

python/sglang/srt/models/whisper.py

Lines changed: 32 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -94,70 +94,16 @@ def forward(
9494
"""Input shape: Batch x Time x Channel"""
9595

9696
if self.is_cross_attention:
97+
# Cross-attention: KV cached during prefill, read from pool during decode.
9798
q, _ = self.q_proj(hidden_states)
99+
q = q * self.scaling
98100
if cross_hidden_states is not None:
99101
kv, _ = self.kv_proj(cross_hidden_states)
100102
k, v = kv.split([self.kv_size, self.kv_size], dim=-1)
101103
else:
102-
k = torch.zeros_like(q)
103-
v = torch.zeros_like(q)
104-
105-
q = q * self.scaling
106-
num_heads = self.attn.tp_q_head_num
107-
head_dim = self.attn.head_dim
108-
109-
q = q.view(-1, num_heads, head_dim)
110-
k = k.view(-1, num_heads, head_dim)
111-
v = v.view(-1, num_heads, head_dim)
112-
113-
q_len = q.shape[0]
114-
kv_len = k.shape[0]
115-
116-
q = q.transpose(0, 1)
117-
k = k.transpose(0, 1)
118-
v = v.transpose(0, 1)
119-
120-
attn_weights = torch.bmm(q, k.transpose(1, 2))
121-
122-
# Apply block-diagonal mask for batched cross-attention
123-
batch_size = forward_batch.batch_size if forward_batch else 1
124-
if batch_size > 1 and kv_len > 0:
125-
encoder_len_per_request = kv_len // batch_size
126-
if encoder_len_per_request * batch_size == kv_len:
127-
is_decode = forward_batch.forward_mode.is_decode()
128-
if is_decode:
129-
mask = torch.zeros(
130-
(q_len, kv_len), device=q.device, dtype=torch.bool
131-
)
132-
for i in range(batch_size):
133-
enc_start = i * encoder_len_per_request
134-
enc_end = (i + 1) * encoder_len_per_request
135-
mask[i, enc_start:enc_end] = True
136-
attn_weights = attn_weights.masked_fill(
137-
~mask.unsqueeze(0), float("-inf")
138-
)
139-
else:
140-
seq_lens = forward_batch.seq_lens
141-
if seq_lens is not None and len(seq_lens) == batch_size:
142-
seq_lens_list = seq_lens.tolist()
143-
mask = torch.zeros(
144-
(q_len, kv_len), device=q.device, dtype=torch.bool
145-
)
146-
q_start = 0
147-
for i, dec_len in enumerate(seq_lens_list):
148-
enc_start = i * encoder_len_per_request
149-
enc_end = (i + 1) * encoder_len_per_request
150-
q_end = q_start + dec_len
151-
mask[q_start:q_end, enc_start:enc_end] = True
152-
q_start = q_end
153-
attn_weights = attn_weights.masked_fill(
154-
~mask.unsqueeze(0), float("-inf")
155-
)
156-
157-
attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
158-
attn_output = torch.bmm(attn_weights, v)
159-
attn_output = attn_output.transpose(0, 1)
160-
attn_output = attn_output.reshape(q_len, num_heads * head_dim)
104+
k = None
105+
v = None
106+
attn_output = self.attn(q, k, v, forward_batch)
161107
else:
162108
qkv, _ = self.qkv_proj(hidden_states)
163109
q, k, v = qkv.chunk(chunks=3, dim=-1)
@@ -394,6 +340,7 @@ def forward(
394340
position_ids=None,
395341
):
396342
inputs_embeds = self.embed_tokens(input_ids)
343+
position_ids = position_ids.clamp(max=self.max_target_positions - 1)
397344
positions = self.embed_positions(position_ids)
398345
hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
399346

@@ -420,7 +367,6 @@ def __init__(
420367
)
421368
self.logits_processor = LogitsProcessor(config)
422369
self.config = config
423-
self._encoder_cache = {}
424370

425371
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
426372
stacked_params_mapping = [
@@ -468,8 +414,14 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
468414
weight_loader = getattr(param, "weight_loader", default_weight_loader)
469415
weight_loader(param, loaded_weight)
470416

471-
def pad_input_ids(self, input_ids: List[int], _mm_inputs: MultimodalInputs):
472-
return input_ids
417+
def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
418+
# Prepend dummy encoder tokens so that prepare_encoder_info_extend
419+
# correctly allocates encoder KV cache locations in the KV pool.
420+
# These dummy tokens are stripped before the model forward receives input_ids.
421+
encoder_len = self.config.max_source_positions
422+
mm_inputs.num_image_tokens = encoder_len
423+
pad_ids = [0] * encoder_len
424+
return pad_ids + input_ids
473425

474426
def forward(
475427
self,
@@ -479,29 +431,22 @@ def forward(
479431
**kwargs: Any,
480432
) -> LogitsProcessorOutput:
481433
dtype = self.encoder.conv1.weight.dtype
482-
is_decode = forward_batch.forward_mode.is_decode()
483-
484-
if is_decode:
485-
encoder_outputs = None
486-
if forward_batch.req_pool_indices is not None:
487-
req_indices = forward_batch.req_pool_indices.tolist()
488-
encoder_list = []
489-
for req_idx in req_indices:
490-
if req_idx in self._encoder_cache:
491-
encoder_list.append(self._encoder_cache[req_idx])
492-
if encoder_list:
493-
encoder_outputs = torch.cat(encoder_list, dim=0)
494-
else:
495-
encoder_list = []
434+
435+
# Run encoder for requests that haven't cached encoder output yet.
436+
# During decode or when encoder is already cached, encoder_hidden_states
437+
# is None and cross-attention reads KV from the pool via RadixAttention.
438+
encoder_hidden_states = None
439+
if not forward_batch.forward_mode.is_decode():
496440
mm_inputs_list = forward_batch.mm_inputs if forward_batch.mm_inputs else []
497-
req_indices = (
498-
forward_batch.req_pool_indices.tolist()
499-
if forward_batch.req_pool_indices is not None
500-
else []
441+
encoder_cached_list = (
442+
forward_batch.encoder_cached if forward_batch.encoder_cached else []
501443
)
502444

503-
for req_idx, mm_input in zip(req_indices, mm_inputs_list):
504-
if mm_input is None or not mm_input.mm_items:
445+
encoder_list = []
446+
for i, (mm_input, cached) in enumerate(
447+
zip(mm_inputs_list, encoder_cached_list)
448+
):
449+
if cached or mm_input is None or not mm_input.mm_items:
505450
continue
506451

507452
features = mm_input.mm_items[0].feature
@@ -513,21 +458,17 @@ def forward(
513458
features.device, non_blocking=True
514459
)
515460

516-
req_encoder_outputs = self.encoder(
461+
req_encoder_output = self.encoder(
517462
features.to(dtype), encoder_position_ids, forward_batch
518463
)
519-
req_encoder_outputs = req_encoder_outputs.squeeze(0)
520-
521-
self._encoder_cache[req_idx] = req_encoder_outputs
522-
encoder_list.append(req_encoder_outputs)
464+
req_encoder_output = req_encoder_output.squeeze(0)
465+
encoder_list.append(req_encoder_output)
523466

524467
if encoder_list:
525-
encoder_outputs = torch.cat(encoder_list, dim=0)
526-
else:
527-
encoder_outputs = None
468+
encoder_hidden_states = torch.cat(encoder_list, dim=0)
528469

529470
decoder_outputs = self.decoder(
530-
input_ids, encoder_outputs, forward_batch, positions
471+
input_ids, encoder_hidden_states, forward_batch, positions
531472
)
532473

533474
logits = self.logits_processor(

python/sglang/srt/server_args.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2181,6 +2181,10 @@ def _get_default_attn_backend(self, use_mla_backend: bool, model_config):
21812181
2.2 We will use Flashinfer backend on blackwell.
21822182
2.3 Otherwise, we will use triton backend.
21832183
"""
2184+
# Encoder-decoder models (e.g., Whisper) require flashinfer for cross-attention support
2185+
if model_config.is_encoder_decoder:
2186+
return "flashinfer"
2187+
21842188
if not use_mla_backend:
21852189
# MHA architecture
21862190
if is_hopper_with_cuda_12_3() and is_no_spec_infer_or_topk_one(self):
@@ -2256,12 +2260,13 @@ def _handle_attention_backend_compatibility(self):
22562260
self.speculative_algorithm is None
22572261
), "Speculative decoding is currently not supported with Flex Attention backend"
22582262

2259-
# Encoder-decoder models (e.g., Whisper)
2260-
if model_config.is_encoder_decoder:
2261-
logger.warning(
2262-
"Cuda graph is disabled for encoder-decoder models (e.g., Whisper)"
2263+
# Encoder-decoder models (e.g., Whisper) require radix cache disabled
2264+
# because encoder token padding conflicts with prefix caching.
2265+
if model_config.is_encoder_decoder and not self.disable_radix_cache:
2266+
logger.info(
2267+
"Radix cache is disabled for encoder-decoder models (e.g., Whisper)"
22632268
)
2264-
self.disable_cuda_graph = True
2269+
self.disable_radix_cache = True
22652270

22662271
# Major NVIDIA platforms backends
22672272
if (

0 commit comments

Comments (0)