
Commit 6eff0da

full scale hf chat template
Signed-off-by: Lucas Liebenwein <[email protected]>
1 parent b64ad52 commit 6eff0da

File tree

7 files changed: 154 additions & 152 deletions

examples/auto_deploy/build_and_run_ad.py

Lines changed: 43 additions & 55 deletions

@@ -26,6 +26,9 @@
 # Global torch config, set the torch compile cache to fix up to llama 405B
 torch._dynamo.config.cache_size_limit = 20
 
+# simple string, TRT-LLM style text-only prompt or full-scale HF message template
+PromptInput = Union[str, Dict, List[Dict]]
+
 
 class PromptConfig(BaseModel):
     """Prompt configuration.
@@ -35,17 +38,27 @@ class PromptConfig(BaseModel):
     """
 
     batch_size: int = Field(default=2, description="Number of queries")
-    queries: Union[str, List[str]] = Field(
+    queries: Union[PromptInput, List[PromptInput]] = Field(
         default_factory=lambda: [
+            # OPTION 1: simple text prompt
             "How big is the universe? ",
-            "In simple words and in a single sentence, explain the concept of gravity: ",
-            "How to fix slicing in golf? ",
-            "Where is the capital of Iceland? ",
-            "How big is the universe? ",
-            "In simple words and in a single sentence, explain the concept of gravity: ",
-            "How to fix slicing in golf? ",
-            "Where is the capital of Iceland? ",
-        ]
+            # OPTION 2: wrapped text prompt for TRT-LLM
+            {"prompt": "In simple words and a single sentence, explain the concept of gravity: "},
+            # OPTION 3: a full-scale HF message template (this one works for text-only models!)
+            # Learn more about chat templates: https://huggingface.co/docs/transformers/en/chat_templating
+            # and multi-modal templates: https://huggingface.co/docs/transformers/en/chat_templating_multimodal
+            [
+                {
+                    "role": "user",
+                    "content": "How to fix slicing in golf?",
+                }
+            ],
+            # More prompts...
+            {"prompt": "Where is the capital of Iceland? "},
+        ],
+        description="Example queries to prompt the model with. We support both TRT-LLM text-only "
+        "queries via the 'prompt' key and full-scale HF message template called via "
+        "apply_chat_template.",
     )
     sp_kwargs: Dict[str, Any] = Field(
         default_factory=lambda: {"max_tokens": 100, "top_k": 200, "temperature": 1.0},
@@ -59,10 +72,28 @@ def model_post_init(self, __context: Any):
         NOTE (lucaslie): has to be done with model_post_init to ensure it's always run. field
         validators are only run if a value is provided.
         """
-        queries = [self.queries] if isinstance(self.queries, str) else self.queries
+        queries = self.queries if isinstance(self.queries, list) else [self.queries]
         batch_size = self.batch_size
         queries = queries * (batch_size // len(queries) + 1)
-        self.queries = queries[:batch_size]
+        queries = queries[:batch_size]
+
+        # now let's standardize the queries for the LLM api to understand them
+        queries_processed = []
+        for query in queries:
+            if isinstance(query, str):
+                queries_processed.append({"prompt": query})
+            elif isinstance(query, dict):
+                queries_processed.append(query)
+            elif isinstance(query, list):
+                queries_processed.append(
+                    {
+                        "prompt": "Fake prompt. Check out messages field for the HF chat template.",
+                        "messages": query,  # contains the actual HF chat template
+                    }
+                )
+            else:
+                raise ValueError(f"Invalid query type: {type(query)}")
+        self.queries = queries_processed
 
     @field_validator("sp_kwargs", mode="after")
     @classmethod
@@ -237,56 +268,13 @@ def main(config: Optional[ExperimentConfig] = None):
 
     llm = build_llm_from_config(config)
 
-    # just run config.prompt.queries with our special token sequence including special image tokens
-    # fmt: off
-    input_ids = [[
-        200000, 200005, 1556, 200006, 368, 200080, 200090, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200081, 200080,
-        200090, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092, 200092,
-        200092, 200081, 51212, 1780, 650, 2556, 310, 290, 1472,
-        8392, 341, 1357, 13492, 26, 200008, 200005, 140680, 200006,
-        368
-    ] for _ in range(2)]
-    # fmt: on
-
     # prompt the model and print its output
     ad_logger.info("Running example prompts...")
 
     # now let's try piping through multimodal data
 
     outs = llm.generate(
-        input_ids,
-        # config.prompt.queries,
+        config.prompt.queries,
         sampling_params=SamplingParams(**config.prompt.sp_kwargs),
     )
     results = {"prompts_and_outputs": print_outputs(outs)}
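For reference, the query normalization that the new model_post_init performs can be exercised in isolation with the stand-alone sketch below. It mirrors the logic added above but is not part of the commit; the helper name normalize_queries is purely illustrative.

from typing import Dict, List, Union

# Same alias as introduced in build_and_run_ad.py above.
PromptInput = Union[str, Dict, List[Dict]]


def normalize_queries(
    queries: Union[PromptInput, List[PromptInput]], batch_size: int
) -> List[Dict]:
    """Illustrative helper mirroring PromptConfig.model_post_init."""
    queries = queries if isinstance(queries, list) else [queries]
    # repeat the provided queries until the batch is full, then truncate
    queries = (queries * (batch_size // len(queries) + 1))[:batch_size]

    processed = []
    for query in queries:
        if isinstance(query, str):
            # OPTION 1: plain string -> wrap it into a TRT-LLM style prompt dict
            processed.append({"prompt": query})
        elif isinstance(query, dict):
            # OPTION 2: already a TRT-LLM style {"prompt": ...} dict
            processed.append(query)
        elif isinstance(query, list):
            # OPTION 3: HF chat-template messages; keep them under "messages"
            processed.append({"prompt": "placeholder", "messages": query})
        else:
            raise ValueError(f"Invalid query type: {type(query)}")
    return processed


if __name__ == "__main__":
    examples = [
        "How big is the universe? ",
        {"prompt": "Where is the capital of Iceland? "},
        [{"role": "user", "content": "How to fix slicing in golf?"}],
    ]
    for q in normalize_queries(examples, batch_size=4):
        print(q)

Running it prints one {"prompt": ...} dict per batch slot, with chat-template queries carrying their original message list under the "messages" key.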

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 1 addition & 0 deletions

@@ -577,6 +577,7 @@ def nest_sequences(
         else:
             self._extra_args[name] = none_input
 
+        # TODO (lucaslie): how strict do we wanna be here? Should we just warn/ignore instead?
         assert not extra_args, f"Extra arguments {extra_args.keys()} not found"
 
     def unnest_sequences(self, t_nested: torch.Tensor) -> List[torch.Tensor]:
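The new TODO above asks whether unknown extra arguments should keep failing hard or merely be warned about. As a hedged sketch of the warn-and-ignore alternative (not what the commit implements), a stand-alone check could look like the helper below; the function name check_extra_args is hypothetical.

import logging

logger = logging.getLogger(__name__)


def check_extra_args(extra_args: dict, strict: bool = True) -> None:
    """Hypothetical variant of the assert above: fail hard or warn and drop."""
    if not extra_args:
        return
    msg = f"Extra arguments {list(extra_args.keys())} not found"
    if strict:
        raise AssertionError(msg)
    logger.warning("%s; ignoring them.", msg)
    extra_args.clear()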

tensorrt_llm/_torch/auto_deploy/llm.py

Lines changed: 82 additions & 11 deletions

@@ -1,19 +1,83 @@
 import types
-from typing import List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from ...executor.result import CompletionOutput
-from ...inputs.registry import create_input_processor
+from ...inputs.registry import DefaultInputProcessor, ExtraProcessedInputs
 from ...llmapi.llm import RequestOutput, _TorchLLM
-from ...llmapi.tokenizer import TokenizerBase, tokenizer_factory
+from ...llmapi.tokenizer import TokenizerBase, TransformersTokenizer, tokenizer_factory
+from ...sampling_params import SamplingParams
 from .distributed import common as dist_ad
 from .llm_args import LlmArgs
+from .models.factory import ModelFactory
 from .shim.demollm import DemoGenerationExecutor
 
 
+class ADInputProcessor(DefaultInputProcessor):
+    """Input processor for AutoDeploy backend.
+
+    This is a wrapper to either support standard TRT-LLM text-only input processing or use HF's
+    message chat template system to process multimodal inputs.
+    """
+
+    def __init__(self, tokenizer: TokenizerBase, processor: Optional[Any] = None):
+        super().__init__(None, None, tokenizer)
+        # NOTE: HF's tokenizer/processor that has the apply_chat_template method
+        self.processor = processor or tokenizer.tokenizer
+
+    def __call__(
+        self, inputs: Dict[str, Any], sampling_params: SamplingParams
+    ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]:
+        # construct kwargs to reflect DefaultInputProcessor
+        kwargs = {
+            "add_special_tokens": sampling_params.add_special_tokens,
+        }
+        if sampling_params.truncate_prompt_tokens is not None:
+            kwargs = {
+                "truncation": True,
+                "max_length": sampling_params.truncate_prompt_tokens,
+            }
+        # check for messages field and if yes, use the apply_chat_template method
+        if "messages" in inputs:
+            # TODO: we don't really need this but it makes for a good sanity check. Consider
+            # removing this in the future if we need to speed things up.
+            prompt = self.processor.apply_chat_template(
+                inputs["messages"],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            inputs["prompt"] = prompt
+
+            all_args = self.processor.apply_chat_template(
+                inputs["messages"],
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+                padding=False,  # there shouldn't be a need for padding ever...
+                return_attention_mask=False,
+                **kwargs,
+            )
+            # TODO: is there a more reliable way to avoid the attention_mask here?
+            all_args.pop("attention_mask", None)
+
+            # TODO: can we avoid the extra tolist() here eventually?
+            token_ids = all_args.pop("input_ids")
+            assert token_ids.shape[0] == 1, "messages should be unbatched at this point."
+            return token_ids[0].tolist(), {"multimodal_data": all_args} if all_args else None
+        else:
+            token_ids = self.tokenizer.encode(inputs["prompt"], **kwargs)
+            return token_ids, None
+
+
 class LLM(_TorchLLM):
     """LLM class is the main class for running an LLM model using AutoDeploy backend."""
 
     args: LlmArgs
+    _factory: ModelFactory
+
+    @property
+    def factory(self) -> ModelFactory:
+        return self._factory
 
     def __init__(self, *args, **kwargs):
         kwargs["backend"] = "_autodeploy"
@@ -23,30 +87,36 @@ def _try_load_tokenizer(self) -> Optional[TokenizerBase]:
         if self.args.skip_tokenizer_init:
             return None
 
-        factory = self.args.create_factory()
-        return tokenizer_factory(factory.init_tokenizer())
+        return tokenizer_factory(self._factory.init_tokenizer())
 
     def _validate_args_for_torch_backend(self, kwargs: dict) -> None:
         """We don't need to validate args for AutoDeploy backend for now."""
         pass
 
-    def _prefetch_model(self):
-        """Prefetch the model for the LLM."""
-        self.args.create_factory().prefetch_checkpoint()
+    def _create_input_processor(self) -> ADInputProcessor:
+        return ADInputProcessor(self.tokenizer, self._factory.init_processor())
 
     def _build_model(self):
         """Build the model for the LLM.
 
         This is a wrapper around the regular build model method that prefetches the model with the
        factory.
         """
+        # create and store a factory
+        self._factory = self.args.create_factory()
+
         # prefetch model with factory
-        self._prefetch_model()
+        self._factory.prefetch_checkpoint()
 
         # NOTE (lucaslie): do regular build model, we bypass the regular LLM CachedModelLoader in
         # _autodeploy backend.
         super()._build_model()
 
+        # now correct input processor
+        assert isinstance(self.input_processor, DefaultInputProcessor)
+        assert isinstance(self.tokenizer, TransformersTokenizer)
+        self.input_processor = self._create_input_processor()
+
 
 class DemoLLM(LLM):
     """A simple LLM class to demo the LLM interface while debugging the e2e workflow.
@@ -61,9 +131,10 @@ def __init__(self, **kwargs):
         self.runtime_context = None
 
         # prefetch model and load tokenizer
-        self._prefetch_model()
+        self._factory = self.args.create_factory()
+        self._factory.prefetch_checkpoint()
         self._tokenizer = self._try_load_tokenizer()
-        self.input_processor = create_input_processor(None, self.tokenizer)
+        self.input_processor = self._create_input_processor()
 
         # construct demo executor + engine
         self._executor = DemoGenerationExecutor(
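As a rough, stand-alone illustration of the two code paths in ADInputProcessor.__call__ above, the snippet below drives a plain HF tokenizer directly: a text prompt goes through tokenizer.encode, while a messages list goes through apply_chat_template with tokenization enabled. The checkpoint name is only an example, and nothing here touches the TRT-LLM classes from this commit.

from transformers import AutoTokenizer

# Example (non-gated) checkpoint; any HF model that ships a chat template works.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Path 1: plain text prompt -> straight tokenizer.encode, as in the else-branch above.
prompt_ids = tok.encode("Where is the capital of Iceland? ", add_special_tokens=True)

# Path 2: HF chat-template messages -> apply_chat_template with tokenize=True,
# mirroring the "messages" branch of ADInputProcessor.__call__.
messages = [{"role": "user", "content": "How to fix slicing in golf?"}]
chat_out = tok.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
chat_ids = chat_out["input_ids"][0].tolist()

print(f"plain prompt -> {len(prompt_ids)} tokens, chat template -> {len(chat_ids)} tokens")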

tensorrt_llm/_torch/auto_deploy/models/factory.py

Lines changed: 9 additions & 0 deletions

@@ -113,6 +113,15 @@ def init_tokenizer(self) -> Optional[Any]:
         """
         return None
 
+    def init_processor(self) -> Optional[Any]:
+        """Initialize the (multi-modal) processor for the model.
+
+        Returns:
+            The initialized processor for the model. If the processor is not available, then this
+            method should return None.
+        """
+        return None
+
     def prefetch_checkpoint(self, force: bool = False):
         """Try or skip prefetching the checkpoint for the model and tokenizer.
 

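A concrete factory could fill in the new init_processor hook roughly as sketched below; the subclass, its constructor, and the use of AutoProcessor are illustrative assumptions, not part of this commit.

from typing import Any, Optional

from transformers import AutoProcessor, AutoTokenizer


class MyMultimodalFactory:  # hypothetical stand-in for a ModelFactory subclass
    """Illustration only: pairs init_tokenizer with the new init_processor hook."""

    def __init__(self, model: str):
        self.model = model  # e.g. an HF hub id or a local checkpoint path

    def init_tokenizer(self) -> Optional[Any]:
        return AutoTokenizer.from_pretrained(self.model)

    def init_processor(self) -> Optional[Any]:
        # For multimodal checkpoints the processor bundles the tokenizer with image
        # handling and exposes apply_chat_template, which ADInputProcessor relies on.
        return AutoProcessor.from_pretrained(self.model)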