     Constant,
     MHACallable,
     PrepareMetadataCallable,
+    PrepareMetadataHostCallable,
     SequenceInfo,
 )

@@ -183,7 +184,6 @@ class PlanParams:
     n_kv_heads: int
     head_dim: int
     num_seq: int
-    is_generate: bool
     page_size: int
     q_dtype: torch.dtype
     kv_dtype: torch.dtype
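For context on this removal: `PlanParams` is used further down as the lookup key for `cached_cuda_graph_decode_wrappers`, so dropping `is_generate` simply removes one component from that cache key. A minimal sketch of the pattern with a reduced, partly assumed field list and made-up values:

# Sketch only; not the actual PlanParams definition from this file.
from dataclasses import dataclass
import torch

@dataclass(frozen=True)  # frozen -> hashable, so instances can key a dict
class PlanParamsSketch:
    n_kv_heads: int
    head_dim: int
    num_seq: int
    page_size: int
    q_dtype: torch.dtype
    kv_dtype: torch.dtype

cached_wrappers: dict[PlanParamsSketch, object] = {}
pp = PlanParamsSketch(8, 128, 4, 16, torch.float16, torch.float16)
if pp not in cached_wrappers:
    cached_wrappers[pp] = object()  # stand-in for a planned decode wrapper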
@@ -289,12 +289,17 @@ def plan_prefill(
         kv_page_indices: torch.Tensor,
         kv_last_page_len_host: torch.Tensor,
         kv_lens_arr_host: torch.Tensor,
-        seq_len_host: torch.Tensor,
         plan_params: PlanParams,
     ) -> None:
         # check for re-planning
         if plan_params != self.plan_params_prefill:
             # plan prefill
+            # NOTE (lucaslie): we use host versions here. the plan actually needs both (host+device)
+            # version. Unfortunately, there is no good way to access the plan API and provide both
+            # although we have both available. I have decided to use the host versions here to
+            # ensure non-blocking invocation of plan, whereas the other way around would trigger a
+            # blocking copy to cpu. This way we trigger a non-blocking copy to device (note that
+            # this is safe since we do have pinned CPU memory for all our host-side arguments).
             self.prefill_wrapper.plan(
                 qo_indptr_host,
                 kv_page_indptr_host,
@@ -308,7 +313,6 @@ def plan_prefill(
                 q_data_type=plan_params.q_dtype,
                 kv_data_type=plan_params.kv_dtype,
                 sm_scale=plan_params.sm_scale,
-                # max_token_per_sequence=max(seq_len_host).item(),
                 seq_lens=kv_lens_arr_host,
             )
             self.plan_params_prefill = plan_params
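The NOTE above argues for passing the host-side tensors so that `plan` can push them to the device asynchronously, rather than passing device tensors and forcing a blocking device-to-CPU copy. A small illustration of that trade-off, assuming pinned host buffers as stated in the comment:

# Illustration of the copy-direction trade-off described in the NOTE (sketch only).
import torch

# Host-side metadata kept in pinned (page-locked) CPU memory.
qo_indptr_host = torch.zeros(9, dtype=torch.int32, pin_memory=True)

# Host -> device from pinned memory can be issued without blocking the CPU:
qo_indptr_dev = qo_indptr_host.to("cuda", non_blocking=True)

# Device -> host, by contrast, synchronizes with the producing stream:
# qo_indptr_back = qo_indptr_dev.cpu()  # blocking copy back to CPU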
@@ -359,7 +363,6 @@ def _plan_decode(
             _plan_decode(self.cached_cuda_graph_decode_wrappers[plan_params])
         # check if we are in cuda graph capture and just return the pre-cached decode wrapper
         if torch.cuda.is_current_stream_capturing() or cuda_graph_state.in_warm_up():
-            assert plan_params.is_generate, "Only generate is supported during cuda graph capture."
             wrapper = self.cached_cuda_graph_decode_wrappers[plan_params]
             return wrapper

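With `is_generate` gone, the assert has nothing to check; the code relies on the wrapper cache being keyed by `PlanParams` alone. During CUDA graph capture or warm-up no (re-)planning can happen, so the pre-planned wrapper for the current `PlanParams` is simply looked up and returned. A hedged sketch of that lookup pattern; the factory callback is hypothetical:

# Sketch: only pre-planned wrappers may be used while a CUDA graph is being captured.
import torch

def lookup_or_plan(cache: dict, plan_params, make_and_plan):
    if torch.cuda.is_current_stream_capturing():
        # Capture in progress: re-planning is not allowed, the key must already exist.
        return cache[plan_params]
    if plan_params not in cache:
        cache[plan_params] = make_and_plan(plan_params)  # hypothetical factory that calls plan()
    return cache[plan_params]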
@@ -423,6 +426,23 @@ def prepare_flashinfer_metadata_fake(
     )


+def prepare_flashinfer_metadata_host(
+    batch_info_host: torch.Tensor,
+    cu_num_pages_host: torch.Tensor,
+    cache_loc_host: torch.Tensor,
+    last_page_len_host: torch.Tensor,
+) -> None:
+    num_prefill, num_prefill_tokens, num_decode = batch_info_host.tolist()
+
+    if num_prefill == 0:
+        _GlobalFlashInferPlanner.plan_generate_only(
+            num_decode,
+            cu_num_pages_host[: num_decode + 1],
+            cache_loc_host,
+            last_page_len_host[:num_decode],
+        )
+
+
 @torch.library.custom_op("auto_deploy::flashinfer_attention_mha_with_cache", mutates_args=())
 def flashinfer_mha_with_cache(
     # Q, K, V
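The new `prepare_flashinfer_metadata_host` carries the logic that previously lived in `host_prepare_for_forward` (removed further down), but takes the host views as explicit arguments and only plans decode for generate-only batches (`num_prefill == 0`). A hypothetical call with made-up shapes and values, assuming pinned host tensors:

# prepare_flashinfer_metadata_host is the function added above (import path omitted).
import torch

num_decode = 3
batch_info_host = torch.tensor([0, 0, num_decode], dtype=torch.int32)  # num_prefill == 0
cu_num_pages_host = torch.tensor([0, 2, 4, 5], dtype=torch.int32, pin_memory=True)
cache_loc_host = torch.arange(5, dtype=torch.int32, pin_memory=True)
last_page_len_host = torch.tensor([7, 3, 16], dtype=torch.int32, pin_memory=True)

# Plans FlashInfer decode for the generate-only batch; a no-op when num_prefill > 0.
prepare_flashinfer_metadata_host(
    batch_info_host, cu_num_pages_host, cache_loc_host, last_page_len_host
)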
@@ -438,7 +458,6 @@ def flashinfer_mha_with_cache(
     last_page_len: torch.Tensor,
     last_page_len_host: torch.Tensor,
     seq_len_with_cache_host: torch.Tensor,
-    seq_len_host: torch.Tensor,
     # EXTRA METADATA
     flashinfer_batch_indices: torch.Tensor,
     flashinfer_positions: torch.Tensor,
@@ -502,7 +521,6 @@ def flashinfer_mha_with_cache(
         n_kv_heads=n_kv_heads,
         head_dim=head_dim,
         num_seq=num_prefill,
-        is_generate=False,
         page_size=k_cache.shape[1],
         q_dtype=q_prefill.dtype,
         kv_dtype=k_cache.dtype,
@@ -515,7 +533,6 @@ def flashinfer_mha_with_cache(
         kv_page_indices=cache_loc,
         kv_last_page_len_host=last_page_len_host[:num_prefill],
         kv_lens_arr_host=seq_len_with_cache_host[:num_prefill],
-        seq_len_host=seq_len_host[:num_prefill],
         plan_params=pp_prefill,
     )

@@ -539,7 +556,6 @@ def flashinfer_mha_with_cache(
         n_kv_heads=n_kv_heads,
         head_dim=head_dim,
         num_seq=num_decode,
-        is_generate=True,
         page_size=k_cache.shape[1],
         q_dtype=q_decode.dtype,
         kv_dtype=k_cache.dtype,
@@ -584,7 +600,6 @@ def flashinfer_mha_with_cache_fake(
     last_page_len: torch.Tensor,
     last_page_len_host: torch.Tensor,
     seq_len_with_cache_host: torch.Tensor,
-    seq_len_host: torch.Tensor,
     # EXTRA METADATA
     flashinfer_batch_indices: torch.Tensor,
     flashinfer_positions: torch.Tensor,
@@ -642,7 +657,6 @@ def get_standard_metadata_args(cls) -> List[str]:
             "last_page_len",
             "last_page_len_host",
             "seq_len_with_cache_host",
-            "seq_len_host",
         ]

     @classmethod
@@ -684,18 +698,8 @@ def _init_workspace(si: SequenceInfo) -> torch.Tensor:
         return {"workspace_buffer": _init_workspace}

     @classmethod
-    def host_prepare_for_forward(cls, sequence_info: SequenceInfo):
-        batch_info = sequence_info._input_buffer.get_host_view("batch_info")
-        num_prefill, num_prefill_tokens, num_decode = batch_info.tolist()
-        # Call plan for generate-only batches.
-        if num_prefill == 0:
-            _GlobalFlashInferPlanner.plan_generate_only(
-                num_decode,
-                sequence_info._input_buffer.get_host_view("cu_num_pages")[: num_decode + 1],
-                sequence_info._input_buffer.get_host_view("cache_loc"),
-                sequence_info._input_buffer.get_host_view("last_page_len")[:num_decode],
-            )
-        return
+    def get_host_prepare_metadata_function(cls) -> Optional[PrepareMetadataHostCallable]:
+        return prepare_flashinfer_metadata_host

     @classmethod
     def get_constants(cls, source_attn_node: Node) -> List[Constant]:
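With `host_prepare_for_forward` removed, the backend now only exposes the host-side prepare function through `get_host_prepare_metadata_function`, and the caller is expected to pass in the host views itself. A rough sketch of such a call site; the `FlashInferAttention` name and the `input_buffer` object are assumptions, only the hook itself comes from this diff:

# Hypothetical caller-side wiring of the new hook (names other than the hook are assumed).
host_prepare = FlashInferAttention.get_host_prepare_metadata_function()
if host_prepare is not None:
    host_prepare(
        input_buffer.get_host_view("batch_info"),
        input_buffer.get_host_view("cu_num_pages"),
        input_buffer.get_host_view("cache_loc"),
        input_buffer.get_host_view("last_page_len"),
    )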