8 changes: 7 additions & 1 deletion docker/Makefile
@@ -118,7 +118,7 @@ endef
 DOCKER_RUN_OPTS ?= --rm -it --ipc=host --ulimit stack=67108864 $(if $(filter 0,$(IS_ROOTLESS)),--ulimit memlock=-1)
 DOCKER_RUN_ARGS ?=
 # Check if NVIDIA_VISIBLE_DEVICES is set and not empty
-NVIDIA_VISIBLE_DEVICES_VAL = $(shell echo $$NVIDIA_VISIBLE_DEVICES)
+NVIDIA_VISIBLE_DEVICES_VAL = $(shell echo $$NV_GPU)
 ifeq ($(NVIDIA_VISIBLE_DEVICES_VAL),)
 # If empty or not set, use all GPUs
 GPU_OPTS ?= --gpus=all
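
Note: expressed in Python for clarity, the visible branch of the conditional above behaves roughly as in the sketch below. This is illustrative only and not part of the PR; the non-empty branch lies outside the hunk's context, so its rendering here is an assumption.

# Illustrative Python rendering of the Makefile's GPU selection (not in the PR).
import os

def gpu_opts() -> str:
    # The PR changes the variable consulted from NVIDIA_VISIBLE_DEVICES to NV_GPU.
    devices = os.environ.get("NV_GPU", "")
    if not devices:
        return "--gpus=all"  # empty or unset: expose all GPUs
    # Assumption: the branch not shown in the hunk restricts the container
    # to the listed devices.
    return f'--gpus="device={devices}"'
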
@@ -147,17 +147,23 @@ ifeq ($(LOCAL_USER),1)
 	$(call add_local_user,$(IMAGE_WITH_TAG))
 endif
 	docker run $(DOCKER_RUN_OPTS) $(DOCKER_RUN_ARGS) \
+		--network=host \
 		$(GPU_OPTS) \
 		--volume $(SOURCE_DIR):$(CODE_DIR) \
 		$(if $(and $(filter 1,$(LOCAL_USER)),$(shell [ -w "$(USER_CACHE_DIR)" ] && echo 1)),--volume $(USER_CACHE_DIR):/home/$(USER_NAME)/.cache:rw) \
 		--env "CCACHE_DIR=$(CCACHE_DIR)" \
 		--env "CCACHE_BASEDIR=$(CODE_DIR)" \
 		--env "CONAN_HOME=$(CONAN_DIR)" \
+		--env "HF_HOME=/home/scratch.williamz_gpu/code/trtc/builder/hf_cache" \
+		--volume /home/scratch.trt_llm_data:/home/scratch.trt_llm_data \
+		--volume /home/scratch.williamz_gpu:/home/scratch.williamz_gpu \
 		--workdir $(WORK_DIR) \
 		--hostname $(shell hostname)-$* \
 		--name $(CONTAINER_NAME)-$*-$(USER_NAME) \
 		--tmpfs /tmp:exec \
 		$(IMAGE_WITH_TAG)$(IMAGE_TAG_SUFFIX) $(RUN_CMD)
+	# $(if $(filter 1,$(LOCAL_USER)),--volume ${HOME_DIR}/.cache:/home/${USER_NAME}/.cache:rw) \
+	# --env TLLM_LLMAPI_BUILD_CACHE_ROOT=/home/scratch.williamz_gpu/trtllm_llmapi_cache \

 devel_%: STAGE = devel
 tritondevel_%: STAGE = tritondevel
2 changes: 2 additions & 0 deletions examples/auto_deploy/.gitignore
@@ -2,3 +2,5 @@
 !.vscode
 benchmark_results.json
 *.png
+# ignore config files that users might put here for debugging
+*.yaml
56 changes: 45 additions & 11 deletions examples/auto_deploy/build_and_run_ad.py
@@ -26,6 +26,9 @@
 # Global torch config, set the torch compile cache to fix up to llama 405B
 torch._dynamo.config.cache_size_limit = 20
 
+# simple string, TRT-LLM style text-only prompt or full-scale HF message template
+PromptInput = Union[str, Dict, List[Dict]]
+
 
 class PromptConfig(BaseModel):
     """Prompt configuration.
@@ -35,17 +38,27 @@ class PromptConfig(BaseModel):
"""

batch_size: int = Field(default=2, description="Number of queries")
queries: Union[str, List[str]] = Field(
queries: Union[PromptInput, List[PromptInput]] = Field(
default_factory=lambda: [
# OPTION 1: simple text prompt
"How big is the universe? ",
"In simple words and in a single sentence, explain the concept of gravity: ",
"How to fix slicing in golf? ",
"Where is the capital of Iceland? ",
"How big is the universe? ",
"In simple words and in a single sentence, explain the concept of gravity: ",
"How to fix slicing in golf? ",
"Where is the capital of Iceland? ",
]
# OPTION 2: wrapped text prompt for TRT-LLM
{"prompt": "In simple words and a single sentence, explain the concept of gravity: "},
# OPTION 3: a full-scale HF message template (this one works for text-only models!)
# Learn more about chat templates: https://huggingface.co/docs/transformers/en/chat_templating
# and multi-modal templates: https://huggingface.co/docs/transformers/en/chat_templating_multimodal
[
{
"role": "user",
"content": "How to fix slicing in golf?",
}
],
# More prompts...
{"prompt": "Where is the capital of Iceland? "},
],
description="Example queries to prompt the model with. We support both TRT-LLM text-only "
"queries via the 'prompt' key and full-scale HF message template called via "
"apply_chat_template.",
)
sp_kwargs: Dict[str, Any] = Field(
default_factory=lambda: {"max_tokens": 100, "top_k": 200, "temperature": 1.0},
@@ -59,10 +72,28 @@ def model_post_init(self, __context: Any):
         NOTE (lucaslie): has to be done with model_post_init to ensure it's always run. field
         validators are only run if a value is provided.
         """
-        queries = [self.queries] if isinstance(self.queries, str) else self.queries
+        queries = self.queries if isinstance(self.queries, list) else [self.queries]
         batch_size = self.batch_size
         queries = queries * (batch_size // len(queries) + 1)
-        self.queries = queries[:batch_size]
+        queries = queries[:batch_size]
+
+        # now let's standardize the queries for the LLM api to understand them
+        queries_processed = []
+        for query in queries:
+            if isinstance(query, str):
+                queries_processed.append({"prompt": query})
+            elif isinstance(query, dict):
+                queries_processed.append(query)
+            elif isinstance(query, list):
+                queries_processed.append(
+                    {
+                        "prompt": "Fake prompt. Check out messages field for the HF chat template.",
+                        "messages": query,  # contains the actual HF chat template
+                    }
+                )
+            else:
+                raise ValueError(f"Invalid query type: {type(query)}")
+        self.queries = queries_processed
 
     @field_validator("sp_kwargs", mode="after")
     @classmethod
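
Note: a quick usage sketch of the standardization above (not part of the PR; assumes PromptConfig is importable from this module).

# Hypothetical illustration of model_post_init's normalization.
from build_and_run_ad import PromptConfig  # assumed import path

cfg = PromptConfig(
    batch_size=3,
    queries=[
        "How big is the universe? ",                     # plain string
        {"prompt": "Explain gravity: "},                 # TRT-LLM style dict
        [{"role": "user", "content": "Fix my slice?"}],  # HF chat template
    ],
)
# After model_post_init, every entry is a dict with a "prompt" key; the
# chat-template entry also carries its original messages under "messages".
assert all("prompt" in q for q in cfg.queries)
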
@@ -239,6 +270,9 @@ def main(config: Optional[ExperimentConfig] = None):

     # prompt the model and print its output
     ad_logger.info("Running example prompts...")
+
+    # now let's try piping through multimodal data
+
     outs = llm.generate(
         config.prompt.queries,
         sampling_params=SamplingParams(**config.prompt.sp_kwargs),
29 changes: 29 additions & 0 deletions examples/auto_deploy/pixtral.yml
@@ -0,0 +1,29 @@
+args:
+  model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
+  world_size: 0
+  runtime: demollm # or: trtllm
+  compile_backend: torch-simple # not tested: torch-compile, torch-opt
+  attn_page_size: 64
+  max_input_len: 4096
+  max_seq_len: 8192
+  attn_backend: flashinfer
+  model_factory: AutoModelForImageTextToText
+  # uncomment below to quickly initialize/load a smaller, random-weight model
+  # skip_loading_weights: true
+  skip_loading_weights: false
+  model_kwargs:
+    text_config:
+      _attn_implementation: eager
+    vision_config:
+      _attn_implementation: sdpa
+prompt:
+  batch_size: 1
+  queries:
+    - - role: user
+        content:
+          - type: text
+            text: Please describe the natural scenery you see in the following images
+          - type: image
+            url: https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png
+          - type: image
+            url: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png
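
Note: a hedged driver sketch for this config (not part of the PR). It assumes ExperimentConfig accepts the YAML's top-level args/prompt keys, matching the field names visible in the diff, and that PyYAML is available.

# Hypothetical way to run pixtral.yml through main().
import yaml

from build_and_run_ad import ExperimentConfig, main  # assumed import path

with open("pixtral.yml") as f:
    raw = yaml.safe_load(f)

config = ExperimentConfig(**raw)  # validates the nested args/prompt sections
main(config)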