
Commit 7bc2a40

llama4 vlm

Signed-off-by: Lucas Liebenwein <[email protected]>

1 parent fb3a82b

File tree

6 files changed: +393, −42 lines changed


examples/auto_deploy/build_and_run_ad.py

Lines changed: 45 additions & 11 deletions
@@ -26,6 +26,9 @@
 # Global torch config, set the torch compile cache to fix up to llama 405B
 torch._dynamo.config.cache_size_limit = 20

+# simple string, TRT-LLM style text-only prompt or full-scale HF message template
+PromptInput = Union[str, Dict, List[Dict]]
+

 class PromptConfig(BaseModel):
     """Prompt configuration.
@@ -35,17 +38,27 @@ class PromptConfig(BaseModel):
     """

     batch_size: int = Field(default=2, description="Number of queries")
-    queries: Union[str, List[str]] = Field(
+    queries: Union[PromptInput, List[PromptInput]] = Field(
         default_factory=lambda: [
+            # OPTION 1: simple text prompt
             "How big is the universe? ",
-            "In simple words and in a single sentence, explain the concept of gravity: ",
-            "How to fix slicing in golf? ",
-            "Where is the capital of Iceland? ",
-            "How big is the universe? ",
-            "In simple words and in a single sentence, explain the concept of gravity: ",
-            "How to fix slicing in golf? ",
-            "Where is the capital of Iceland? ",
-        ]
+            # OPTION 2: wrapped text prompt for TRT-LLM
+            {"prompt": "In simple words and a single sentence, explain the concept of gravity: "},
+            # OPTION 3: a full-scale HF message template (this one works for text-only models!)
+            # Learn more about chat templates: https://huggingface.co/docs/transformers/en/chat_templating
+            # and multi-modal templates: https://huggingface.co/docs/transformers/en/chat_templating_multimodal
+            [
+                {
+                    "role": "user",
+                    "content": "How to fix slicing in golf?",
+                }
+            ],
+            # More prompts...
+            {"prompt": "Where is the capital of Iceland? "},
+        ],
+        description="Example queries to prompt the model with. We support both TRT-LLM text-only "
+        "queries via the 'prompt' key and full-scale HF message templates called via "
+        "apply_chat_template.",
     )
     sp_kwargs: Dict[str, Any] = Field(
         default_factory=lambda: {"max_tokens": 100, "top_k": 200, "temperature": 1.0},
@@ -59,10 +72,28 @@ def model_post_init(self, __context: Any):
         NOTE (lucaslie): has to be done with model_post_init to ensure it's always run. field
         validators are only run if a value is provided.
         """
-        queries = [self.queries] if isinstance(self.queries, str) else self.queries
+        queries = self.queries if isinstance(self.queries, list) else [self.queries]
         batch_size = self.batch_size
         queries = queries * (batch_size // len(queries) + 1)
-        self.queries = queries[:batch_size]
+        queries = queries[:batch_size]
+
+        # now let's standardize the queries for the LLM api to understand them
+        queries_processed = []
+        for query in queries:
+            if isinstance(query, str):
+                queries_processed.append({"prompt": query})
+            elif isinstance(query, dict):
+                queries_processed.append(query)
+            elif isinstance(query, list):
+                queries_processed.append(
+                    {
+                        "prompt": "Fake prompt. Check out messages field for the HF chat template.",
+                        "messages": query,  # contains the actual HF chat template
+                    }
+                )
+            else:
+                raise ValueError(f"Invalid query type: {type(query)}")
+        self.queries = queries_processed

     @field_validator("sp_kwargs", mode="after")
     @classmethod
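
In effect, model_post_init now tiles the queries up to batch_size and normalizes every entry to a dict. A sketch of the before/after shape (values illustrative, batch_size=3 assumed):

    # before normalization (as the user may pass them):
    queries = [
        "How big is the universe? ",
        {"prompt": "In simple words and a single sentence, explain the concept of gravity: "},
        [{"role": "user", "content": "How to fix slicing in golf?"}],
    ]

    # after model_post_init (what llm.generate receives):
    queries_processed = [
        {"prompt": "How big is the universe? "},
        {"prompt": "In simple words and a single sentence, explain the concept of gravity: "},
        {
            "prompt": "Fake prompt. Check out messages field for the HF chat template.",
            "messages": [{"role": "user", "content": "How to fix slicing in golf?"}],
        },
    ]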
@@ -239,6 +270,9 @@ def main(config: Optional[ExperimentConfig] = None):

     # prompt the model and print its output
     ad_logger.info("Running example prompts...")
+
+    # now let's try piping through multimodal data
+
     outs = llm.generate(
         config.prompt.queries,
         sampling_params=SamplingParams(**config.prompt.sp_kwargs),

tensorrt_llm/_torch/auto_deploy/models/hf.py

Lines changed: 102 additions & 7 deletions
@@ -4,19 +4,21 @@
 import os
 import types
 from contextlib import contextmanager, nullcontext
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
 import torch.nn as nn
 from accelerate import init_empty_weights, load_checkpoint_in_model
 from accelerate.utils import modeling
 from huggingface_hub import HfApi, snapshot_download
 from huggingface_hub.utils import HFValidationError, filter_repo_objects, validate_repo_id
+from PIL import Image
 from torch._prims_common import DeviceLikeType
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
     AutoModelForImageTextToText,
+    AutoProcessor,
     AutoTokenizer,
     PretrainedConfig,
 )
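
The new AutoProcessor import backs the VLM factory changes below: the processor bundles the image preprocessor with the tokenizer, so the factory can expose processor.tokenizer where a bare tokenizer used to be loaded. A minimal sketch (the checkpoint name is a placeholder, not taken from this diff):

    from transformers import AutoProcessor

    name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"  # placeholder checkpoint
    processor = AutoProcessor.from_pretrained(name)  # image processor + tokenizer bundle
    tokenizer = processor.tokenizer  # same tokenizer the text-only path would load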
@@ -27,7 +29,7 @@
     WEIGHTS_NAME,
 )

-from ..custom_ops.attention_interface import CacheConfig
+from ..custom_ops.attention_interface import CacheConfig, Dim, DynamicShapeCallback
 from ..utils._config import deep_merge_dicts
 from ..utils.logger import ad_logger
 from .factory import ModelFactory, ModelFactoryRegistry
@@ -108,10 +110,6 @@ def __init__(self, *args, **kwargs):
     def autoconfig_from_pretrained(self):
         return AutoConfig.from_pretrained

-    @property
-    def autotokenizer_from_pretrained(self):
-        return AutoTokenizer.from_pretrained
-
     # TODO (@lucaslie): Do we ever want to switch to from_pretrained?
     @property
     def automodel_from_config(self):
@@ -200,7 +198,7 @@ def init_tokenizer(self) -> Optional[Any]:
         """Initialize the tokenizer—either a custom name or the model's default."""
         if self.tokenizer is None:
             return None
-        return self.autotokenizer_from_pretrained(self.tokenizer, **self.tokenizer_kwargs)
+        return AutoTokenizer.from_pretrained(self.tokenizer, **self.tokenizer_kwargs)

     @staticmethod
     def _get_ignore_patterns(repo_id: str, skip_prefetch_weights: bool) -> List[str]:
@@ -366,3 +364,100 @@ def _get_max_position_embeddings_config(self) -> Dict[str, Any]:
     @property
     def automodel_from_config(self):
         return AutoModelForImageTextToText.from_config
+
+    def init_tokenizer(self) -> Optional[Any]:
+        """Initialize the tokenizer—either a custom name or the model's default."""
+        processor = self.init_processor()
+        if processor is None:
+            return None
+        return processor.tokenizer
+
+    def init_processor(self) -> Optional[Any]:
+        """Initialize the processor for the model."""
+        if self.tokenizer is None:
+            return None
+        return AutoProcessor.from_pretrained(self.tokenizer, **self.tokenizer_kwargs)
+
+    @staticmethod
+    def _simple_forward(
+        model: nn.Module,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        pixel_values: torch.Tensor,
+    ):
+        """A simple forward pass for the model to functionalize the args.
+
+        This follows the standard function signature as expected by factory.py.
+        """
+        return type(model).forward(
+            model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+        )
+
+    def get_example_inputs(self) -> Dict[str, torch.Tensor]:
+        """Return a dictionary of example inputs for the model."""
+
+        def _prep_seq(text, img1, img2):
+            return [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": img1},
+                        {"type": "image", "image": img2},
+                        {"type": "text", "text": text},
+                    ],
+                }
+            ]
+
+        # Create a batch of conversations (batch_size = 2)
+        batch_messages = [
+            _prep_seq(
+                "Describe what you see in the two images and their differences.",
+                Image.new("RGB", (16, 16), color=(128, 128, 128)),
+                Image.new("RGB", (16, 16), color=(64, 64, 64)),
+            ),
+            _prep_seq(
+                "What are the main differences between these two images?",
+                Image.new("RGB", (16, 16), color=(255, 0, 0)),
+                Image.new("RGB", (16, 16), color=(0, 255, 0)),
+            ),
+        ]
+
+        processor = AutoProcessor.from_pretrained(self.tokenizer, **self.tokenizer_kwargs)
+        inputs = processor.apply_chat_template(
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+            padding=True,
+            return_attention_mask=False,
+        )
+
+        return {
+            "input_ids": inputs["input_ids"],
+            "pixel_values": inputs["pixel_values"],
+        }
+
+    def get_extra_inputs(self) -> Dict[str, Tuple[torch.Tensor, DynamicShapeCallback]]:
+        """Return a dictionary of extra inputs for the model.
+
+        Returns:
+            A dictionary of extra inputs for the model where the key corresponds to the argument
+            name and the value corresponds to a tuple of (example_input, dynamic_shape_callback).
+            The dynamic shape callback is a function that returns the dynamic shape of the extra
+            input.
+        """
+
+        def _get_dynamic_shape():
+            return {
+                # TODO (lucaslie): how to set default values for dynamic shapes?
+                0: Dim("img_batch_size", max=10),
+                2: Dim("img_height", min=32, max=2048),
+                3: Dim("img_width", min=32, max=2048),
+            }
+
+        none_pixel_values = torch.zeros(0, 3, 336, 336)
+        return {"pixel_values": (none_pixel_values, _get_dynamic_shape)}
