8 changes: 7 additions & 1 deletion docker/Makefile
@@ -118,7 +118,7 @@ endef
 DOCKER_RUN_OPTS ?= --rm -it --ipc=host --ulimit stack=67108864 $(if $(filter 0,$(IS_ROOTLESS)),--ulimit memlock=-1)
 DOCKER_RUN_ARGS ?=
 # Check if NVIDIA_VISIBLE_DEVICES is set and not empty
-NVIDIA_VISIBLE_DEVICES_VAL = $(shell echo $$NVIDIA_VISIBLE_DEVICES)
+NVIDIA_VISIBLE_DEVICES_VAL = $(shell echo $$NV_GPU)
 ifeq ($(NVIDIA_VISIBLE_DEVICES_VAL),)
 # If empty or not set, use all GPUs
 GPU_OPTS ?= --gpus=all
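
Note: expressed in Python for clarity, the visible branch of the conditional above behaves roughly as in the sketch below. This is illustrative only and not part of the PR; the non-empty branch lies outside the hunk's context, so its rendering here is an assumption.

# Illustrative Python rendering of the Makefile's GPU selection (not in the PR).
import os

def gpu_opts() -> str:
    # The PR changes the variable consulted from NVIDIA_VISIBLE_DEVICES to NV_GPU.
    devices = os.environ.get("NV_GPU", "")
    if not devices:
        return "--gpus=all"  # empty or unset: expose all GPUs
    # Assumption: the branch not shown in the hunk restricts the container
    # to the listed devices.
    return f'--gpus="device={devices}"'
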
@@ -147,17 +147,23 @@ ifeq ($(LOCAL_USER),1)
 	$(call add_local_user,$(IMAGE_WITH_TAG))
 endif
 	docker run $(DOCKER_RUN_OPTS) $(DOCKER_RUN_ARGS) \
+		--network=host \
 		$(GPU_OPTS) \
 		--volume $(SOURCE_DIR):$(CODE_DIR) \
 		$(if $(and $(filter 1,$(LOCAL_USER)),$(shell [ -w "$(USER_CACHE_DIR)" ] && echo 1)),--volume $(USER_CACHE_DIR):/home/$(USER_NAME)/.cache:rw) \
 		--env "CCACHE_DIR=$(CCACHE_DIR)" \
 		--env "CCACHE_BASEDIR=$(CODE_DIR)" \
 		--env "CONAN_HOME=$(CONAN_DIR)" \
+		--env "HF_HOME=/home/scratch.williamz_gpu/code/trtc/builder/hf_cache" \
+		--volume /home/scratch.trt_llm_data:/home/scratch.trt_llm_data \
+		--volume /home/scratch.williamz_gpu:/home/scratch.williamz_gpu \
 		--workdir $(WORK_DIR) \
 		--hostname $(shell hostname)-$* \
 		--name $(CONTAINER_NAME)-$*-$(USER_NAME) \
 		--tmpfs /tmp:exec \
 		$(IMAGE_WITH_TAG)$(IMAGE_TAG_SUFFIX) $(RUN_CMD)
+	# $(if $(filter 1,$(LOCAL_USER)),--volume ${HOME_DIR}/.cache:/home/${USER_NAME}/.cache:rw) \
+	# --env TLLM_LLMAPI_BUILD_CACHE_ROOT=/home/scratch.williamz_gpu/trtllm_llmapi_cache \

 devel_%: STAGE = devel
 tritondevel_%: STAGE = tritondevel
2 changes: 2 additions & 0 deletions examples/auto_deploy/.gitignore
@@ -2,3 +2,5 @@
 !.vscode
 benchmark_results.json
 *.png
+# ignore config files that users might put here for debugging
+*.yaml
56 changes: 45 additions & 11 deletions examples/auto_deploy/build_and_run_ad.py
@@ -26,6 +26,9 @@
 # Global torch config, set the torch compile cache to fix up to llama 405B
 torch._dynamo.config.cache_size_limit = 20
 
+# simple string, TRT-LLM style text-only prompt or full-scale HF message template
+PromptInput = Union[str, Dict, List[Dict]]
+
 
 class PromptConfig(BaseModel):
     """Prompt configuration.
@@ -35,17 +38,27 @@ class PromptConfig(BaseModel):
"""

batch_size: int = Field(default=2, description="Number of queries")
queries: Union[str, List[str]] = Field(
queries: Union[PromptInput, List[PromptInput]] = Field(
default_factory=lambda: [
# OPTION 1: simple text prompt
"How big is the universe? ",
"In simple words and in a single sentence, explain the concept of gravity: ",
"How to fix slicing in golf? ",
"Where is the capital of Iceland? ",
"How big is the universe? ",
"In simple words and in a single sentence, explain the concept of gravity: ",
"How to fix slicing in golf? ",
"Where is the capital of Iceland? ",
]
# OPTION 2: wrapped text prompt for TRT-LLM
{"prompt": "In simple words and a single sentence, explain the concept of gravity: "},
# OPTION 3: a full-scale HF message template (this one works for text-only models!)
# Learn more about chat templates: https://huggingface.co/docs/transformers/en/chat_templating
# and multi-modal templates: https://huggingface.co/docs/transformers/en/chat_templating_multimodal
[
{
"role": "user",
"content": "How to fix slicing in golf?",
}
],
# More prompts...
{"prompt": "Where is the capital of Iceland? "},
],
description="Example queries to prompt the model with. We support both TRT-LLM text-only "
"queries via the 'prompt' key and full-scale HF message template called via "
"apply_chat_template.",
)
sp_kwargs: Dict[str, Any] = Field(
default_factory=lambda: {"max_tokens": 100, "top_k": 200, "temperature": 1.0},
@@ -59,10 +72,28 @@ def model_post_init(self, __context: Any):
         NOTE (lucaslie): has to be done with model_post_init to ensure it's always run. field
         validators are only run if a value is provided.
         """
-        queries = [self.queries] if isinstance(self.queries, str) else self.queries
+        queries = self.queries if isinstance(self.queries, list) else [self.queries]
         batch_size = self.batch_size
         queries = queries * (batch_size // len(queries) + 1)
-        self.queries = queries[:batch_size]
+        queries = queries[:batch_size]
+
+        # now let's standardize the queries for the LLM api to understand them
+        queries_processed = []
+        for query in queries:
+            if isinstance(query, str):
+                queries_processed.append({"prompt": query})
+            elif isinstance(query, dict):
+                queries_processed.append(query)
+            elif isinstance(query, list):
+                queries_processed.append(
+                    {
+                        "prompt": "Fake prompt. Check out messages field for the HF chat template.",
+                        "messages": query,  # contains the actual HF chat template
+                    }
+                )
+            else:
+                raise ValueError(f"Invalid query type: {type(query)}")
+        self.queries = queries_processed
 
     @field_validator("sp_kwargs", mode="after")
     @classmethod
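
Note: a quick usage sketch of the standardization above (not part of the PR; assumes PromptConfig is importable from this module).

# Hypothetical illustration of model_post_init's normalization.
from build_and_run_ad import PromptConfig  # assumed import path

cfg = PromptConfig(
    batch_size=3,
    queries=[
        "How big is the universe? ",                     # plain string
        {"prompt": "Explain gravity: "},                 # TRT-LLM style dict
        [{"role": "user", "content": "Fix my slice?"}],  # HF chat template
    ],
)
# After model_post_init, every entry is a dict with a "prompt" key; the
# chat-template entry also carries its original messages under "messages".
assert all("prompt" in q for q in cfg.queries)
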
@@ -239,6 +270,9 @@ def main(config: Optional[ExperimentConfig] = None):

     # prompt the model and print its output
     ad_logger.info("Running example prompts...")
+
+    # now let's try piping through multimodal data
+
     outs = llm.generate(
         config.prompt.queries,
         sampling_params=SamplingParams(**config.prompt.sp_kwargs),
29 changes: 29 additions & 0 deletions examples/auto_deploy/pixtral.yml
@@ -0,0 +1,29 @@
+args:
+  model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
+  world_size: 0
+  runtime: demollm # or: trtllm
+  compile_backend: torch-simple # not tested: torch-compile, torch-opt
+  attn_page_size: 64
+  max_input_len: 4096
+  max_seq_len: 8192
+  attn_backend: flashinfer
+  model_factory: AutoModelForImageTextToText
+  # uncomment below to quickly initialize/load a smaller, random-weight model
+  # skip_loading_weights: true
+  skip_loading_weights: false
+  model_kwargs:
+    text_config:
+      _attn_implementation: eager
+    vision_config:
+      _attn_implementation: sdpa
+prompt:
+  batch_size: 1
+  queries:
+    - - role: user
+        content:
+          - type: text
+            text: Please describe the natural scenery you see in the following images
+          - type: image
+            url: https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png
+          - type: image
+            url: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png
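
Note: a hedged driver sketch for this config (not part of the PR). It assumes ExperimentConfig accepts the YAML's top-level args/prompt keys, matching the field names visible in the diff, and that PyYAML is available.

# Hypothetical way to run pixtral.yml through main().
import yaml

from build_and_run_ad import ExperimentConfig, main  # assumed import path

with open("pixtral.yml") as f:
    raw = yaml.safe_load(f)

config = ExperimentConfig(**raw)  # validates the nested args/prompt sections
main(config)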