
Commit fb3a82b

sequence interface revisited
Signed-off-by: Lucas Liebenwein <[email protected]>
1 parent 913695f

10 files changed, +456 -186 lines changed

examples/auto_deploy/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -2,3 +2,5 @@
 !.vscode
 benchmark_results.json
 *.png
+# ignore config files that users might put here for debugging
+*.yaml

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 281 additions & 130 deletions
Large diffs are not rendered by default.
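
The attention_interface.py diff itself is not rendered, but the call sites touched elsewhere in this commit (add_extra_arg, the keyword-argument form of nest_sequences, named_standard_args, extra_args_for_prepare_metadata, set_example_sequence, set_max_num_tokens_sample) suggest roughly the following SequenceInfo surface. This is only an inferred sketch; signatures, defaults, and the DynamicShapeCallback alias below are assumptions, not the actual diff.

# Inferred sketch of the revised SequenceInfo interface; reconstructed from call sites in this
# commit, not from the rendered diff. Treat all names and signatures as assumptions.
from typing import Any, Callable, Dict, List, Optional, Tuple

import torch

DynamicShapeCallback = Callable[[], Dict[int, Any]]  # assumed alias for a dynamic-shape spec


class SequenceInfo:
    def add_extra_arg(
        self, name: str, none_input: torch.Tensor, dynamic_shape_callback: DynamicShapeCallback
    ) -> None:
        """Register a factory-defined extra input (e.g., multimodal tensors)."""

    def nest_sequences(
        self,
        input_ids: List[torch.Tensor],
        input_pos: Optional[List[int]] = None,
        page_assignments: Optional[List[List[int]]] = None,
        **extra_args: List[torch.Tensor],
    ) -> None:
        """Single update entry point replacing separate update_pos()/assign_cache_loc() calls."""

    def set_example_sequence(self, **example_inputs: torch.Tensor) -> None:
        """Set a dummy sequence (plus factory example inputs) used for export."""

    def set_max_num_tokens_sample(self) -> None:
        """Set a maximum-size dummy sequence (now public) used to probe memory usage."""

    @property
    def named_standard_args(self) -> Dict[str, Any]:
        """Standard graph inputs (input_ids, position_ids, ...) keyed by argument name."""

    @property
    def extra_args_for_prepare_metadata(self) -> Tuple[Any, ...]:
        """Extra positional args (e.g., page_size) appended to the prepare_metadata op."""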

tensorrt_llm/_torch/auto_deploy/llm.py

Lines changed: 82 additions & 11 deletions
@@ -1,19 +1,83 @@
 import types
-from typing import List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from ...executor.result import CompletionOutput
-from ...inputs.registry import create_input_processor
+from ...inputs.registry import DefaultInputProcessor, ExtraProcessedInputs
 from ...llmapi.llm import RequestOutput, _TorchLLM
-from ...llmapi.tokenizer import TokenizerBase, tokenizer_factory
+from ...llmapi.tokenizer import TokenizerBase, TransformersTokenizer, tokenizer_factory
+from ...sampling_params import SamplingParams
 from .distributed import common as dist_ad
 from .llm_args import LlmArgs
+from .models.factory import ModelFactory
 from .shim.demollm import DemoGenerationExecutor
 
 
+class ADInputProcessor(DefaultInputProcessor):
+    """Input processor for AutoDeploy backend.
+
+    This is a wrapper to either support standard TRT-LLM text-only input processing or use HF's
+    message chat template system to process multimodal inputs.
+    """
+
+    def __init__(self, tokenizer: TokenizerBase, processor: Optional[Any] = None):
+        super().__init__(None, None, tokenizer)
+        # NOTE: HF's tokenizer/processor that has the apply_chat_template method
+        self.processor = processor or tokenizer.tokenizer
+
+    def __call__(
+        self, inputs: Dict[str, Any], sampling_params: SamplingParams
+    ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]:
+        # construct kwargs to reflect DefaultInputProcessor
+        kwargs = {
+            "add_special_tokens": sampling_params.add_special_tokens,
+        }
+        if sampling_params.truncate_prompt_tokens is not None:
+            kwargs = {
+                "truncation": True,
+                "max_length": sampling_params.truncate_prompt_tokens,
+            }
+        # check for messages field and if yes, use the apply_chat_template method
+        if "messages" in inputs:
+            # TODO: we don't really need this but it makes for a good sanity check. Consider
+            # removing this in the future if we need to speed things up.
+            prompt = self.processor.apply_chat_template(
+                inputs["messages"],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            inputs["prompt"] = prompt
+
+            all_args = self.processor.apply_chat_template(
+                inputs["messages"],
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+                padding=False,  # there shouldn't be a need for padding ever...
+                return_attention_mask=False,
+                **kwargs,
+            )
+            # TODO: is there a more reliable way to avoid the attention_mask here?
+            all_args.pop("attention_mask", None)
+
+            # TODO: can we avoid the extra tolist() here eventually?
+            token_ids = all_args.pop("input_ids")
+            assert token_ids.shape[0] == 1, "messages should be unbatched at this point."
+            return token_ids[0].tolist(), {"multimodal_data": all_args} if all_args else None
+        else:
+            token_ids = self.tokenizer.encode(inputs["prompt"], **kwargs)
+            return token_ids, None
+
+
 class LLM(_TorchLLM):
     """LLM class is the main class for running an LLM model using AutoDeploy backend."""
 
     args: LlmArgs
+    _factory: ModelFactory
+
+    @property
+    def factory(self) -> ModelFactory:
+        return self._factory
 
     def __init__(self, *args, **kwargs):
         kwargs["backend"] = "_autodeploy"
@@ -23,30 +87,36 @@ def _try_load_tokenizer(self) -> Optional[TokenizerBase]:
         if self.args.skip_tokenizer_init:
             return None
 
-        factory = self.args.create_factory()
-        return tokenizer_factory(factory.init_tokenizer())
+        return tokenizer_factory(self._factory.init_tokenizer())
 
     def _validate_args_for_torch_backend(self, kwargs: dict) -> None:
         """We don't need to validate args for AutoDeploy backend for now."""
         pass
 
-    def _prefetch_model(self):
-        """Prefetch the model for the LLM."""
-        self.args.create_factory().prefetch_checkpoint()
+    def _create_input_processor(self) -> ADInputProcessor:
+        return ADInputProcessor(self.tokenizer, self._factory.init_processor())
 
     def _build_model(self):
         """Build the model for the LLM.
 
         This is a wrapper around the regular build model method that prefetches the model with the
         factory.
         """
+        # create and store a factory
+        self._factory = self.args.create_factory()
+
         # prefetch model with factory
-        self._prefetch_model()
+        self._factory.prefetch_checkpoint()
 
         # NOTE (lucaslie): do regular build model, we bypass the regular LLM CachedModelLoader in
         # _autodeploy backend.
         super()._build_model()
 
+        # now correct input processor
+        assert isinstance(self.input_processor, DefaultInputProcessor)
+        assert isinstance(self.tokenizer, TransformersTokenizer)
+        self.input_processor = self._create_input_processor()
+
 
 class DemoLLM(LLM):
     """A simple LLM class to demo the LLM interface while debugging the e2e workflow.
@@ -61,9 +131,10 @@ def __init__(self, **kwargs):
         self.runtime_context = None
 
         # prefetch model and load tokenizer
-        self._prefetch_model()
+        self._factory = self.args.create_factory()
+        self._factory.prefetch_checkpoint()
         self._tokenizer = self._try_load_tokenizer()
-        self.input_processor = create_input_processor(None, self.tokenizer)
+        self.input_processor = self._create_input_processor()
 
         # construct demo executor + engine
         self._executor = DemoGenerationExecutor(
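
For context, here is a minimal usage sketch of the new ADInputProcessor (not part of the diff). The model name, the SamplingParams fields, and the assumption that tokenizer_factory accepts a model name or path are all illustrative.

# Hypothetical usage of ADInputProcessor; model name, SamplingParams fields, and the
# tokenizer_factory call are assumptions for illustration only.
from tensorrt_llm.llmapi.tokenizer import tokenizer_factory
from tensorrt_llm.sampling_params import SamplingParams

tokenizer = tokenizer_factory("meta-llama/Llama-3.1-8B-Instruct")
processor = ADInputProcessor(tokenizer)  # falls back to tokenizer.tokenizer for chat templates
params = SamplingParams(max_tokens=32)

# text-only path: routed through tokenizer.encode(), extra inputs are None
token_ids, extra = processor({"prompt": "Hello, world!"}, params)
assert extra is None

# chat path: routed through apply_chat_template(); extra may carry "multimodal_data"
token_ids, extra = processor({"messages": [{"role": "user", "content": "Hello!"}]}, params)

The else branch keeps the existing TRT-LLM text-only behavior, so plain-prompt requests are unaffected by this change.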

tensorrt_llm/_torch/auto_deploy/models/factory.py

Lines changed: 38 additions & 2 deletions
@@ -2,13 +2,13 @@
 
 import copy
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Optional, Type
+from typing import Any, Callable, Dict, Optional, Tuple, Type
 
 import torch
 import torch.nn as nn
 from torch._prims_common import DeviceLikeType
 
-from ..custom_ops.attention_interface import CacheConfig
+from ..custom_ops.attention_interface import CacheConfig, DynamicShapeCallback
 from ..utils.logger import ad_logger
 
 
@@ -113,6 +113,15 @@ def init_tokenizer(self) -> Optional[Any]:
         """
         return None
 
+    def init_processor(self) -> Optional[Any]:
+        """Initialize the (multi-modal) processor for the model.
+
+        Returns:
+            The initialized processor for the model. If the processor is not available, then this
+            method should return None.
+        """
+        return None
+
     def prefetch_checkpoint(self, force: bool = False):
         """Try or skip prefetching the checkpoint for the model and tokenizer.
 
@@ -206,6 +215,33 @@ def _load_checkpoint(self, model: nn.Module, device: DeviceLikeType):
             device: The device to load the model on.
         """
 
+    def get_example_inputs(self) -> Dict[str, torch.Tensor]:
+        """Return a dictionary of example inputs for the model.
+
+        This function can be overwritten by a factory when it requires a specific example input
+        in order to run through export.
+
+        Returns:
+            A dictionary of example inputs for the model where the key corresponds to the argument
+            name and the value corresponds to the example input.
+        """
+        return {}
+
+    def get_extra_inputs(self) -> Dict[str, Tuple[torch.Tensor, DynamicShapeCallback]]:
+        """Return a dictionary of extra inputs for the model.
+
+        Returns:
+            A dictionary of extra inputs for the model where the key corresponds to the argument
+            name and the value corresponds to a tuple of (none_input, dynamic_shape_callback):
+                - `none_input`: The none input value of the extra input indicating the tensor
+                  value corresponding to the equivalent of the None input. `None` is not supported
+                  as we require the input to be a tensor. Hence, this none_input acts as a
+                  placeholder for the None input.
+                - `dynamic_shape_callback`: A function that returns the dynamic shape of the extra
+                  input.
+        """
+        return {}
+
 
 class ModelFactoryRegistry:
     _registry: Dict[str, Type[ModelFactory]] = {}
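
To illustrate the two new hooks, here is a hypothetical factory override (not from this commit). The "pixel_values" name, the tensor shapes, and the assumed return format of the dynamic-shape callback are placeholders.

# Hypothetical ModelFactory subclass showing get_example_inputs/get_extra_inputs; names,
# shapes, and the dynamic-shape callback contract are assumptions for illustration only.
import torch
from torch.export import Dim


class MyMultimodalFactory(ModelFactory):  # other abstract methods omitted for brevity
    def get_example_inputs(self) -> Dict[str, torch.Tensor]:
        # consumed by the export transform via cm.info.set_example_sequence(**...)
        return {"pixel_values": torch.rand(1, 3, 336, 336)}

    def get_extra_inputs(self) -> Dict[str, Tuple[torch.Tensor, DynamicShapeCallback]]:
        # none_input: tensor placeholder standing in for "no image provided" (None not allowed)
        none_pixel_values = torch.zeros(0, 3, 336, 336)

        def _dynamic_shape():
            # assumed format: dynamic dim spec for the leading (image-count) dimension
            return {0: Dim("num_images", max=8)}

        return {"pixel_values": (none_pixel_values, _dynamic_shape)}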

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 29 additions & 11 deletions
@@ -1,6 +1,6 @@
-from itertools import chain
+from collections import defaultdict
 from types import SimpleNamespace
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import torch
 from torch._prims_common import DeviceLikeType
@@ -102,16 +102,24 @@ def build_from_config(cls, ad_config: AutoDeployConfig):
             max_num_tokens=max_num_tokens,
         )
 
+        # get factory
+        factory = ad_config.create_factory()
+
         # update device to contain the current default device if it's in cuda
         device = torch.device(ad_config.device)
         if device.type == "cuda" and device.index is None:
            device = torch.device(f"cuda:{torch.cuda.current_device()}")
         device = str(device)
 
+        # pass in extra arguments defined by the model factory
+        for name, (none_input, dynamic_shape_callback) in factory.get_extra_inputs().items():
+            seq_info.add_extra_arg(name, none_input, dynamic_shape_callback)
+
+        # TODO (lucaslie): consider how we move args around InferenceOptimizer.__init__,
+        # ADEngine.__init__, and ADEngine.build_from_config. Seems a bit unnatural atm.
+
         # construct inference optimizer
-        build_and_optimize = InferenceOptimizer(
-            factory=ad_config.create_factory(), ad_config=ad_config
-        )
+        build_and_optimize = InferenceOptimizer(factory=factory, ad_config=ad_config)
 
         # construct engine
         return cls(build_and_optimize, seq_info, device, max_beam_width)
@@ -176,6 +184,7 @@ def _prepare_inputs(
         input_pos: List[int] = []
         last_logit_only: List[bool] = []
         page_assignments: List[List[int]] = []
+        extra_args: Dict[str, List[torch.Tensor]] = defaultdict(list)
 
         # look at context requests first
         for request in context_requests:
@@ -186,6 +195,15 @@
             request.py_batch_idx = request.seq_slot
             last_logit_only.append(True)
 
+            # get cache indices
+            cache_indices = kv_cache_manager.get_cache_indices(request)
+            page_assignments.append(cache_indices)
+
+            # store extra arguments
+            if request.py_multimodal_data is not None:
+                for k, v in request.py_multimodal_data.items():
+                    extra_args[k].append(v)
+
         # look at generate requests next
         # TODO: we should also handle extend requests (for speculative decoding) here
         for request in gen_requests:
@@ -202,17 +220,17 @@
             # return all logits
             last_logit_only.append(False)
 
-        # extract cache information for all requests
-        for request in chain(context_requests, gen_requests):
             # get cache indices
             cache_indices = kv_cache_manager.get_cache_indices(request)
             page_assignments.append(cache_indices)
 
         # update the sequence info object now
-        si = self.cache_seq_interface.info
-        si.nest_sequences(input_ids)
-        si.update_pos(input_pos, reset=True)
-        si.assign_cache_loc(page_assignments)
+        self.cache_seq_interface.info.nest_sequences(
+            input_ids,
+            input_pos=input_pos,
+            page_assignments=page_assignments,
+            **extra_args,
+        )
        return last_logit_only
 
    def _compute_logits(self) -> List[torch.Tensor]:
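
The per-request handling above boils down to grouping each request's multimodal tensors by argument name before splatting them into nest_sequences(). A self-contained illustration (request objects replaced by plain dicts) is:

# Standalone illustration of the extra-args grouping; real code reads request.py_multimodal_data
# from LlmRequest objects, which are replaced here by plain dicts/None.
from collections import defaultdict

import torch

per_request_multimodal_data = [
    {"pixel_values": torch.rand(1, 3, 336, 336)},  # request carrying an image
    None,                                           # text-only request contributes nothing
]

extra_args = defaultdict(list)
for mm_data in per_request_multimodal_data:
    if mm_data is not None:
        for k, v in mm_data.items():
            extra_args[k].append(v)

# afterwards: info.nest_sequences(input_ids, input_pos=..., page_assignments=..., **extra_args)
print({k: len(v) for k, v in extra_args.items()})  # {'pixel_values': 1}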

tensorrt_llm/_torch/auto_deploy/transform/library/export_to_gm.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ def _apply(
         model = gm.get_submodule("factory_model")
 
         # set the example sequence
-        cm.info.set_example_sequence()
+        cm.info.set_example_sequence(**factory.get_example_inputs())
 
         # export the model to a graph module
         gm = torch_export_to_gm(

tensorrt_llm/_torch/auto_deploy/transformations/library/kvcache.py

Lines changed: 6 additions & 8 deletions
@@ -26,9 +26,6 @@ def update_in_out_nodes(egm: GraphModule, cm: CachedSequenceInterface) -> None:
     # loop through nodes to get input, output, and get_attr nodes
     input_nodes, output_nodes = get_all_input_output_nodes(egm.graph)
 
-    # we only expect one input node
-    assert len(input_nodes) == 2, "Expected exactly two input nodes (input_ids, position_ids)."
-
     # NOTE: for now, we wanna make sure we *only* return the final output and no hidden states.
     # Later on, we can revisit how to support returning hidden states.
     assert len(output_nodes) == 1, "Expected exactly one output node!"
@@ -73,16 +70,17 @@
 
     # retrieve input nodes
     input_nodes, _ = get_all_input_output_nodes(egm.graph)
+    input_nodes_mapping = {n.target: n for n in input_nodes}
+
+    # filtered and sorted for SequenceInfo arguments (input_ids, position_ids, etc.)
+    input_nodes_from_info = [input_nodes_mapping[k] for k in cm.info.named_standard_args.keys()]
 
     # insert metadata computation and extract each argument as a node
     get_metadata, num_metadata = attn_descriptor.get_prepare_metadata_op()
     with graph.inserting_before(input_nodes[-1].next):
         ret_node = graph.call_function(
             get_metadata,
-            args=(
-                *input_nodes,
-                cm.info.page_size,
-            ),
+            args=(*input_nodes_from_info, *cm.info.extra_args_for_prepare_metadata),
         )
         metadata_nodes = [
             graph.call_function(operator.getitem, args=(ret_node, idx))
@@ -162,7 +160,7 @@ def _get_mem_info_in_mb():
 
     try:
         # Let's run a forward pass to get the memory usage
-        cm.info._set_max_num_tokens_sample()
+        cm.info.set_max_num_tokens_sample()
         free_mem_pre, _ = _get_mem_info_in_mb()
         ad_logger.info(f"Free memory before forward pass (MB): {free_mem_pre}")
 
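The target-name lookup used above can be reproduced on any FX graph. A small self-contained example (toy module, not the AutoDeploy code) follows:

# Standalone FX example of mapping placeholder nodes by target name and selecting them in a
# required order, mirroring what insert_cached_attention now does with SequenceInfo arguments.
import torch
from torch.fx import symbolic_trace


class Toy(torch.nn.Module):
    def forward(self, input_ids, position_ids, pixel_values):
        return input_ids + position_ids + pixel_values.sum()


gm = symbolic_trace(Toy())
input_nodes = [n for n in gm.graph.nodes if n.op == "placeholder"]
input_nodes_mapping = {n.target: n for n in input_nodes}

# select placeholders in a fixed order (illustrative stand-in for cm.info.named_standard_args)
ordered = [input_nodes_mapping[k] for k in ("input_ids", "position_ids")]
print([n.target for n in ordered])  # ['input_ids', 'position_ids']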
tensorrt_llm/executor/worker.py

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ def __init__(
             self)  # TODO: make it weakref
         self._executor_config = executor_config
         self._is_pytorch_backend = getattr(self._executor_config, "backend",
-                                           None) == "pytorch"
+                                           None) in ["pytorch", "_autodeploy"]
 
         if global_mpi_size() > 1:
             logger.set_rank(self.global_rank)

tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_engine.py

Lines changed: 0 additions & 2 deletions
@@ -71,7 +71,6 @@ def test_engine(engine_cls: Type[ADEngine], attn_backend: str, attn_page_size: i
     input_ids = [torch.tensor([0, 1, 2], device=device)]
     sequence_info.reset()
     sequence_info.nest_sequences(input_ids)
-    engine.cache_seq_interface.info.sync(sequence_info)
     logits = engine._compute_logits()
     logits = torch.stack(logits)
     assert logits is not None, "Logits are None"
@@ -106,7 +105,6 @@ def test_demo_engine_sampling(attn_page_size: int):
     input_ids = [torch.tensor([1, 2, 3, 4], device=device)]
     sequence_info.reset()
     sequence_info.nest_sequences(input_ids)
-    engine.cache_seq_interface.info.sync(sequence_info)
     logits = engine._compute_logits()
     logits = torch.stack(logits)
