From 951f99f1beb0242862d43707a2ee6440507d32f6 Mon Sep 17 00:00:00 2001
From: AaryamSharmaBaseten
Date: Thu, 2 Oct 2025 17:42:54 -0400
Subject: [PATCH 1/5] Add examples for inference stack v2

---
 .../README.md | 170 ++++++++++++++++++
 .../config.yaml | 34 ++++
 .../README.md | 170 ++++++++++++++++++
 .../config.yaml | 33 ++++
 .../README.md | 2 +
 5 files changed, 409 insertions(+)
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml

diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md
new file mode 100644
index 00000000..0a2f2650
--- /dev/null
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md
@@ -0,0 +1,170 @@
+# TensorRT-LLM Briton with meta-llama/Llama-3.2-3B-Instruct
+
+This is a deployment of TensorRT-LLM Briton with meta-llama/Llama-3.2-3B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for causal language models (e.g. Llama, Qwen, Mistral).
+
+With Briton you get the following benefits by default:
+- *Lowest-latency inference*, beating frameworks such as vLLM
+- *Highest-throughput inference*, automatically using XQA kernels, paged KV caching, and in-flight batching
+- *Distributed inference*: run large models (such as Llama-405B) tensor-parallel
+- *JSON-schema-based structured output* for any model
+- *Chunked prefill* for long generation tasks
+
+Optionally, you can also enable:
+- *Speculative decoding* using an external draft model or self-speculative decoding
+- *FP8 quantization* for deployments on H100, H200, and L4 GPUs
+
+With the V2 Config, you can also quantize models straight from Hugging Face in FP8 and FP4, and use KV caching.
+
+
+# Examples:
+This deployment is specifically designed for the Hugging Face model [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct).
+Suitable models can be identified by the `ForCausalLM` suffix in the model name. We currently support Llama, Qwen, and Mistral models, among others.
+
+meta-llama/Llama-3.2-3B-Instruct is a text-generation model, used to generate text given a prompt. It is frequently used in chatbots, text completion, structured output, and more.
+
+This model is quantized to FP8 for deployment, which is supported by NVIDIA's newest GPUs (e.g. H100, H100_40GB, or L4). Quantization is optional, but leads to higher efficiency.
+
+## Deployment with Truss
+
+Before deployment:
+
+1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
+2. Install the latest version of Truss: `pip install --upgrade truss`
+Note: this is a gated/private model. Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`.
Do not set the actual value of key in the config.yaml. `hf_access_token: null` is fine - the true value will be fetched from the secret store. + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. +Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of key in the config.yaml. `hf_access_token: null` is fine - the true value will be fetched from the secret store. +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100_40GB + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: meta-llama/Llama-3.2-3B-Instruct + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml new file mode 100644 index 00000000..4f793a47 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml @@ -0,0 +1,34 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100_40GB + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: meta-llama/Llama-3.2-3B-Instruct + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev1 + diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md new file mode 100644 index 00000000..860ec23e --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md @@ -0,0 +1,170 @@ +# TensorRT-LLM Briton with Qwen/Qwen3-32B + +This is a Deployment for TensorRT-LLM Briton with Qwen/Qwen3-32B. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B). +Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen3-32B is a text-generation model, used to generate text given a prompt. 
\nIt is frequently used in chatbots, text completion, structured output and more. + +This model is quantized to FP8 for deployment, which is supported by Nvidia's newest GPUs e.g. H100, H100_40GB or L4. Quantization is optional, but leads to higher efficiency. + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen3-32b-EngineV2-fp8-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-32b-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen3-32B + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml new file mode 100644 index 00000000..cc7e3b8f --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-32b-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen3-32B + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev1 diff --git a/11-embeddings-reranker-classification-tensorrt/README.md b/11-embeddings-reranker-classification-tensorrt/README.md index 1aa686e4..10960413 100644 --- a/11-embeddings-reranker-classification-tensorrt/README.md +++ b/11-embeddings-reranker-classification-tensorrt/README.md @@ -90,6 +90,7 @@ Examples: - [Qwen/Qwen2.5-Coder-7B-Instruct-min-latency-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-min-latency-fp8) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp8) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp4) + - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8) - [Qwen/Qwen3-8B-min-latency-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-8b-min-latency-fp8) - [deepseek-ai/DeepSeek-R1-Distill-Llama-70B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-fp8) - [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-qwen-32b-fp8) @@ -101,6 +102,7 @@ Examples: - 
[meta-llama/Llama-3.1-8B-Instruct-with-speculative-lookahead-decoding-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.1-8b-instruct-with-speculative-lookahead-decoding-fp8) - [meta-llama/Llama-3.2-1B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-1b-instruct-fp8) - [meta-llama/Llama-3.2-3B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-fp8) + - [meta-llama/Llama-3.2-3B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8) - [meta-llama/Llama-3.3-70B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-fp8) - [meta-llama/Llama-3.3-70B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-fp4) - [meta-llama/Llama-3.3-70B-Instruct-tp4-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp4-fp8) From f1fe307469531c66f3e4c8d5e536e2a90550363d Mon Sep 17 00:00:00 2001 From: AaryamSharmaBaseten Date: Thu, 2 Oct 2025 17:46:50 -0400 Subject: [PATCH 2/5] Updated generate_templates.py --- .../templating/generate_templates.py | 193 ++++++++++++++++++ 1 file changed, 193 insertions(+) diff --git a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py index d4e5a7d6..8d19d5c4 100644 --- a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py +++ b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py @@ -10,12 +10,14 @@ CheckpointRepository, CheckpointSource, TRTLLMConfiguration, + TRTLLMConfigurationV2, TrussSpeculatorConfiguration, TrussTRTLLMBuildConfiguration, TrussTRTLLMModel, TrussTRTLLMPluginConfiguration, TrussTRTLLMQuantizationType, TrussTRTLLMRuntimeConfiguration, + TRTLLMRuntimeConfigurationV2, TrussTRTQuantizationConfiguration, VersionsOverrides, ) @@ -27,6 +29,7 @@ Resources, TrussConfig, ) +import yaml REPO_URL = "https://github.com/basetenlabs/truss-examples" SUBFOLDER = Path("11-embeddings-reranker-classification-tensorrt") @@ -302,6 +305,83 @@ def make_truss_config(self, dp): ) +@dataclasses.dataclass +class BritonV2(Solution): + name: str = "TensorRT-LLM Briton" + nickname: str = "Briton" + benefits: str = """Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. 
+- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. +""" + + def make_truss_config(self, dp): + hf_cfg = AutoConfig.from_pretrained( + dp.hf_model_id, trust_remote_code=True + ) # make sure model is available + max_position_embeddings = hf_cfg.max_position_embeddings + assert self.trt_config is not None + self.trt_config.runtime.max_seq_len = max_position_embeddings + assert max_position_embeddings >= 512, "Model needs to have at least 512 tokens" + if self.trt_config.runtime is not None: + self.trt_config.runtime.max_seq_len = min( + self.trt_config.runtime.max_seq_len, 32768 + ) + self.trt_config.runtime.max_num_tokens = self.trt_config.runtime.max_seq_len + + if ( + hf_cfg.model_type in ["qwen2", "qwen2_moe"] + and self.trt_config.build.quantization_type is not None + ): + if ( + self.trt_config.build.quantization_type + == TrussTRTLLMQuantizationType.FP8_KV + ): + raise ValueError( + f"Qwen2 models do not support FP8_KV quantization / have quality issues with this dtype - please use regular FP8 for now in the model library {dp.hf_model_id}" + ) + # increase the quantization example size for qwen2 models + self.trt_config.build.quantization_config = ( + TrussTRTQuantizationConfiguration( + calib_size=2048, + calib_max_seq_length=min(2048, self.trt_config.runtime.max_seq_len), + ) + ) + + overrides_engine_builder = ENGINE_BUILDER_VERSION + overrides_briton = BRITON_VERSION + + if overrides_engine_builder is not None or overrides_briton is not None: + version_overrides = VersionsOverrides( + engine_builder_version=overrides_engine_builder, + briton_version=overrides_briton, + ) + self.trt_config.root.version_overrides = version_overrides + + return TrussConfig( + model_metadata=dp.task.model_metadata, + resources=Resources( + accelerator=AcceleratorSpec( + accelerator=dp.accelerator, + count=1, + ), + memory="10Gi", + ), + model_name=dp.model_nickname, + trt_llm=self.trt_config, + ) + + @dataclasses.dataclass class Embedder(Task): purpose: str = ( @@ -836,6 +916,7 @@ def folder_name(self): self.solution.nickname + "-" + self.name.replace(" ", "-").replace("/", "-").lower() + + ("-EngineV2" if isinstance(self.solution, BritonV2) else "") + ("-fp8" * self.is_fp8) + ("-fp4" * self.is_fp4) ) @@ -845,6 +926,44 @@ def model_nickname(self): return self.folder_name + "-truss-example" +def add_inference_v2_stack(path: Path, dep: Deployment) -> None: + """ + Edits the YAML at `path` in-place: + - Only if `should_inject` is True + - Adds `inference_stack: v2` INSIDE the `trt_llm` mapping + """ + if not isinstance(dep.solution, BritonV2): + return + + data = yaml.safe_load(path.read_text()) + trt_llm = data.get("trt_llm") + if isinstance(trt_llm, dict): + # Build new dict with inference_stack first + new_trt = {"inference_stack": "v2"} + new_trt.update({k: v for k, v in trt_llm.items() if k != "inference_stack"}) + data["trt_llm"] = new_trt + path.write_text(yaml.safe_dump(data, sort_keys=False)) + + +def add_base_model_override(path: Path, dep: Deployment) -> None: + """ + Edits the YAML at `path` in-place: + - Only if `should_inject` is True + - Adds `base_model: ...` 
INSIDE the `trt_llm` mapping + """ + if not isinstance(dep.solution, BritonV2): + return + + data = yaml.safe_load(path.read_text()) + build_details = data.get("trt_llm").get("build") + if isinstance(build_details, dict): + # Build new dict with base_model first + new_build = {"base_model": "decoder"} + new_build.update({k: v for k, v in build_details.items() if k != "base_model"}) + data["trt_llm"]["build"] = new_build + path.write_text(yaml.safe_dump(data, sort_keys=False)) + + def generate_deployment(dp: Deployment): root = Path(__file__).parent.parent.parent assert root.name == ROOT_NAME.name, "This script has been moved" @@ -887,6 +1006,9 @@ def generate_deployment(dp: Deployment): header = "# this file was autogenerated by `generate_templates.py` - please do change via template only\n" Path(config_yaml_path).write_text(header + config_yaml_as_str) + add_inference_v2_stack(config_yaml_path, dp) + add_base_model_override(config_yaml_path, dp) + README_SUBREPO = f"""# {dp.solution.make_headline(dp)} This is a Deployment for {dp.solution.make_headline(dp)}. {dp.solution.benefits} @@ -1336,6 +1458,41 @@ def llamalike_spec_dec( return config +def llamalike_config_v2( + quant: TrussTRTLLMQuantizationType = TrussTRTLLMQuantizationType.FP8_KV, + repoid="meta-llama/Llama-3.3-70B-Instruct", + max_batch_size: int = 32, +): + # config for meta-llama/Llama-3.3-70B-Instruct (FP8) + build_kwargs = dict() + runtime_kwargs = dict() + + config = TRTLLMConfigurationV2( + build=TrussTRTLLMBuildConfiguration( + checkpoint_repository=CheckpointRepository( + repo=repoid, + revision="main", + source=CheckpointSource.HF, + ), + quantization_type=quant, + **build_kwargs, + ), + runtime=TRTLLMRuntimeConfigurationV2( + max_seq_len=1000001, # dummy for now + max_batch_size=max_batch_size, + **runtime_kwargs, + ), + ) + + if quant in [ + TrussTRTLLMQuantizationType.WEIGHTS_INT4_KV_INT8, + ]: + config.build.plugin_configuration.use_paged_context_fmha = False + config.build.plugin_configuration.use_fp8_context_fmha = False + config.runtime.enable_chunked_context = False + return config + + DEPLOYMENTS_BRITON = [ Deployment( "meta-llama/Llama-3.3-70B-Instruct", @@ -1358,6 +1515,18 @@ def llamalike_spec_dec( ) ), ), + Deployment( + "meta-llama/Llama-3.3-70B-Instruct", + "meta-llama/Llama-3.3-70B-Instruct", + Accelerator.B200, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="meta-llama/Llama-3.3-70B-Instruct", + quant=TrussTRTLLMQuantizationType.FP4, + ) + ), + ), Deployment( "meta-llama/Llama-3.3-70B-Instruct-tp4", "meta-llama/Llama-3.3-70B-Instruct", @@ -1382,6 +1551,18 @@ def llamalike_spec_dec( ) ), ), + Deployment( + "meta-llama/Llama-3.2-3B-Instruct", + "meta-llama/Llama-3.2-3B-Instruct", + Accelerator.H100_40GB, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="meta-llama/Llama-3.2-3B-Instruct", + quant=TrussTRTLLMQuantizationType.FP8_KV, + ) + ), + ), Deployment( "meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct", @@ -1480,6 +1661,18 @@ def llamalike_spec_dec( ) ), ), + Deployment( + "Qwen/Qwen3-32B", + "Qwen/Qwen3-32B", + Accelerator.H100, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen3-32B", + quant=TrussTRTLLMQuantizationType.FP8_KV, + ) + ), + ), Deployment( "meta-llama/Llama-3.1-405B", "meta-llama/Llama-3.1-405B", From 9c638f21093752eb27a7085c23915dabbf059528 Mon Sep 17 00:00:00 2001 From: AaryamSharmaBaseten Date: Thu, 2 Oct 2025 17:50:29 -0400 Subject: [PATCH 3/5] Updated Readme --- 
.../README.md | 4 ++-- .../Briton-qwen-qwen3-32b-EngineV2-fp8/README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md index 0a2f2650..69b76ef6 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md @@ -1,6 +1,6 @@ -# TensorRT-LLM Briton with meta-llama/Llama-3.2-3B-Instruct +# Tensorflow Briton with meta-llama/Llama-3.2-3B-Instruct -This is a Deployment for TensorRT-LLM Briton with meta-llama/Llama-3.2-3B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) +This is a Deployment for Tensorflow Briton with meta-llama/Llama-3.2-3B-Instruct. Briton is Baseten's solution for production-grade deployments via Tensorflow for Causal Language Models models. (e.g. LLama, Qwen, Mistral) With Briton you get the following benefits by default: - *Lowest-latency* latency, beating frameworks such as vllm diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md index 860ec23e..78939979 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md @@ -1,6 +1,6 @@ -# TensorRT-LLM Briton with Qwen/Qwen3-32B +# Tensorflow Briton with Qwen/Qwen3-32B -This is a Deployment for TensorRT-LLM Briton with Qwen/Qwen3-32B. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) +This is a Deployment for Tensorflow Briton with Qwen/Qwen3-32B. Briton is Baseten's solution for production-grade deployments via Tensorflow for Causal Language Models models. (e.g. 
LLama, Qwen, Mistral) With Briton you get the following benefits by default: - *Lowest-latency* latency, beating frameworks such as vllm From 2e5f0940e57905d020d36dd817b01d01c4702b7f Mon Sep 17 00:00:00 2001 From: AaryamSharmaBaseten Date: Fri, 3 Oct 2025 14:29:51 -0400 Subject: [PATCH 4/5] Bump up engine builder version --- .../README.md | 7 +- .../config.yaml | 3 +- .../README.md | 172 +++++++++++++++++ .../config.yaml | 33 ++++ .../README.md | 175 ++++++++++++++++++ .../config.yaml | 36 ++++ .../README.md | 7 +- .../config.yaml | 2 +- .../README.md | 2 + .../templating/generate_templates.py | 48 ++++- 10 files changed, 476 insertions(+), 9 deletions(-) create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/README.md create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/config.yaml create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/README.md create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/config.yaml diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md index 69b76ef6..7c3c746e 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md @@ -1,6 +1,6 @@ -# Tensorflow Briton with meta-llama/Llama-3.2-3B-Instruct +# TensorRT Torch Backend Briton with meta-llama/Llama-3.2-3B-Instruct -This is a Deployment for Tensorflow Briton with meta-llama/Llama-3.2-3B-Instruct. Briton is Baseten's solution for production-grade deployments via Tensorflow for Causal Language Models models. (e.g. LLama, Qwen, Mistral) +This is a Deployment for TensorRT Torch Backend Briton with meta-llama/Llama-3.2-3B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. 
LLama, Qwen, Mistral) With Briton you get the following benefits by default: - *Lowest-latency* latency, beating frameworks such as vllm @@ -163,6 +163,9 @@ trt_llm: max_batch_size: 32 max_num_tokens: 32768 max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 ``` diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml index 4f793a47..a908d803 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml @@ -30,5 +30,4 @@ trt_llm: max_seq_len: 32768 version_overrides: briton_version: null - engine_builder_version: 0.20.0.post13.dev1 - + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/README.md new file mode 100644 index 00000000..3e7f2ed7 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/README.md @@ -0,0 +1,172 @@ +# TensorRT Torch Backend Briton with meta-llama/Llama-3.3-70B-Instruct + +This is a Deployment for TensorRT Torch Backend Briton with meta-llama/Llama-3.3-70B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct). +Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +meta-llama/Llama-3.3-70B-Instruct is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` +Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of key in the config.yaml. 
`hf_access_token: null` is fine - the true value will be fetched from the secret store. + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp4`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. +Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of key in the config.yaml. `hf_access_token: null` is fine - the true value will be fetched from the secret store. +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: meta-llama/Llama-3.3-70B-Instruct + revision: main + source: HF + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/config.yaml new file mode 100644 index 00000000..003fe7b1 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: meta-llama/Llama-3.3-70B-Instruct + revision: main + source: HF + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/README.md new file mode 100644 index 00000000..056a072c --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/README.md @@ -0,0 +1,175 @@ +# TensorRT Torch Backend Briton with Qwen/Qwen2.5-Coder-7B-Instruct + +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen2.5-Coder-7B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct). 
+Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen2.5-Coder-7B-Instruct is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp4`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. 
+ +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen2.5-Coder-7B-Instruct + revision: main + source: HF + quantization_config: + calib_max_seq_length: 2048 + calib_size: 2048 + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/config.yaml new file mode 100644 index 00000000..ee4e5c1b --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/config.yaml @@ -0,0 +1,36 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen2.5-Coder-7B-Instruct + revision: main + source: HF + quantization_config: + calib_max_seq_length: 2048 + calib_size: 2048 + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md index 78939979..eca3c52f 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md @@ -1,6 +1,6 @@ -# Tensorflow Briton with Qwen/Qwen3-32B +# TensorRT Torch Backend Briton with Qwen/Qwen3-32B -This is a Deployment for Tensorflow Briton with Qwen/Qwen3-32B. Briton is Baseten's solution for production-grade deployments via Tensorflow for Causal Language Models models. (e.g. LLama, Qwen, Mistral) +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen3-32B. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. 
LLama, Qwen, Mistral) With Briton you get the following benefits by default: - *Lowest-latency* latency, beating frameworks such as vllm @@ -163,6 +163,9 @@ trt_llm: max_batch_size: 32 max_num_tokens: 32768 max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 ``` diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml index cc7e3b8f..e79361a3 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml @@ -30,4 +30,4 @@ trt_llm: max_seq_len: 32768 version_overrides: briton_version: null - engine_builder_version: 0.20.0.post13.dev1 + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/README.md b/11-embeddings-reranker-classification-tensorrt/README.md index 10960413..8211713b 100644 --- a/11-embeddings-reranker-classification-tensorrt/README.md +++ b/11-embeddings-reranker-classification-tensorrt/README.md @@ -87,6 +87,7 @@ Examples: - [Qwen/QwQ-32B-reasoning-with-speculative-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwq-32b-reasoning-with-speculative-fp8) - [Qwen/Qwen2.5-72B-Instruct-tp2-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-72b-instruct-tp2-fp8) - [Qwen/Qwen2.5-7B-Instruct-with-speculative-lookahead-decoding-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-7b-instruct-with-speculative-lookahead-decoding-fp8) + - [Qwen/Qwen2.5-Coder-7B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4) - [Qwen/Qwen2.5-Coder-7B-Instruct-min-latency-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-min-latency-fp8) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp8) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp4) @@ -105,6 +106,7 @@ Examples: - [meta-llama/Llama-3.2-3B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8) - [meta-llama/Llama-3.3-70B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-fp8) - [meta-llama/Llama-3.3-70B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-fp4) + - [meta-llama/Llama-3.3-70B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4) - 
[meta-llama/Llama-3.3-70B-Instruct-tp4-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp4-fp8) - [microsoft/phi-4-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4-fp8) - [mistralai/Mistral-7B-Instruct-v0.3-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-7b-instruct-v0.3) diff --git a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py index 8d19d5c4..f5e03640 100644 --- a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py +++ b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py @@ -36,6 +36,7 @@ ROOT_NAME = Path(REPO_URL.split("/")[-1]) BEI_VERSION = os.environ.get("BEI") ENGINE_BUILDER_VERSION = os.environ.get("ENGINE_BUILDER") +ENGINE_V2_BUILDER_VERSION = os.environ.get("ENGINE_BUILDER_V2") BRITON_VERSION = os.environ.get("BRITON") @@ -307,7 +308,7 @@ def make_truss_config(self, dp): @dataclasses.dataclass class BritonV2(Solution): - name: str = "TensorRT-LLM Briton" + name: str = "TensorRT Torch Backend Briton" nickname: str = "Briton" benefits: str = """Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) @@ -358,7 +359,7 @@ def make_truss_config(self, dp): ) ) - overrides_engine_builder = ENGINE_BUILDER_VERSION + overrides_engine_builder = ENGINE_V2_BUILDER_VERSION overrides_briton = BRITON_VERSION if overrides_engine_builder is not None or overrides_briton is not None: @@ -1429,6 +1430,24 @@ def llamalike_lookahead( return config +def llamalike_lookahead_v2( + quant: TrussTRTLLMQuantizationType = TrussTRTLLMQuantizationType.FP8_KV, + repoid="meta-llama/Llama-3.3-70B-Instruct", + use_dynamic_lengths: bool = False, + **kwargs, +): + config = llamalike_config_v2(quant, repoid, **kwargs) + config.build.speculator = TrussSpeculatorConfiguration( + # settings from https://arxiv.org/pdf/2402.02057 + speculative_decoding_mode="LOOKAHEAD_DECODING", + lookahead_windows_size=3 if not use_dynamic_lengths else 1, + lookahead_ngram_size=8 if not use_dynamic_lengths else 32, + lookahead_verification_set_size=3 if not use_dynamic_lengths else 1, + enable_b10_lookahead=True, # + ) + return config + + def llamalike_spec_dec( quant: TrussTRTLLMQuantizationType = TrussTRTLLMQuantizationType.FP8_KV, tp=1, @@ -1822,6 +1841,31 @@ def llamalike_config_v2( ) ), ), + # Deployment( + # "Qwen/Qwen2.5-Coder-7B-Instruct-min-latency", + # "Qwen/Qwen2.5-Coder-7B-Instruct", + # Accelerator.B200, + # TextGen(), + # solution=BritonV2( + # trt_config=llamalike_lookahead_v2( + # repoid="Qwen/Qwen2.5-Coder-7B-Instruct", + # use_dynamic_lengths=True, + # quant=TrussTRTLLMQuantizationType.FP4, + # ) + # ), + # ), + Deployment( + "Qwen/Qwen2.5-Coder-7B-Instruct", + "Qwen/Qwen2.5-Coder-7B-Instruct", + Accelerator.B200, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen2.5-Coder-7B-Instruct", + quant=TrussTRTLLMQuantizationType.FP4, + ) + ), + ), Deployment( "Qwen/Qwen3-8B-min-latency", "Qwen/Qwen3-8B", From b7278282ff231d0bab608267f48fb35e324d0a4d Mon Sep 17 00:00:00 2001 From: AaryamSharmaBaseten Date: Mon, 6 Oct 2025 23:25:34 +0000 Subject: [PATCH 5/5] Added 
more examples
---
 .../README.md | 172 +++++++++++++++++
 .../config.yaml | 33 ++++
 .../README.md | 175 ++++++++++++++++++
 .../config.yaml | 36 ++++
 .../README.md | 172 +++++++++++++++++
 .../config.yaml | 33 ++++
 .../README.md | 173 +++++++++++++++++
 .../config.yaml | 33 ++++
 .../README.md | 173 +++++++++++++++++
 .../config.yaml | 33 ++++
 .../README.md | 5 +
 .../templating/generate_templates.py | 60 ++++++
 12 files changed, 1098 insertions(+)
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/config.yaml

diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/README.md
new file mode 100644
index 00000000..280b856f
--- /dev/null
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/README.md
@@ -0,0 +1,172 @@
+# TensorRT Torch Backend Briton with deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+
+This is a deployment of TensorRT Torch Backend Briton with deepseek-ai/DeepSeek-R1-Distill-Llama-70B. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for causal language models (e.g. Llama, Qwen, Mistral).
+
+With Briton you get the following benefits by default:
+- *Lowest-latency inference*, beating frameworks such as vLLM
+- *Highest-throughput inference*, automatically using XQA kernels, paged KV caching, and in-flight batching
+- *Distributed inference*: run large models (such as Llama-405B) tensor-parallel
+- *JSON-schema-based structured output* for any model
+- *Chunked prefill* for long generation tasks
+
+Optionally, you can also enable:
+- *Speculative decoding* using an external draft model or self-speculative decoding
+- *FP8 quantization* for deployments on H100, H200, and L4 GPUs
+
+With the V2 Config, you can also quantize models straight from Hugging Face in FP8 and FP4, and use KV caching.
+
+
+# Examples:
+This deployment is specifically designed for the Hugging Face model [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B).
+Suitable models can be identified by the `ForCausalLM` suffix in the model name. We currently support Llama, Qwen, and Mistral models, among others.
+ +deepseek-ai/DeepSeek-R1-Distill-Llama-70B is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp4_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + revision: main + source: HF + quantization_type: fp4_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/config.yaml new file mode 100644 index 00000000..761e61ea --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + revision: main + source: HF + quantization_type: fp4_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/README.md new file mode 100644 index 00000000..fd28a1f1 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/README.md @@ -0,0 +1,175 @@ +# TensorRT Torch Backend Briton with Qwen/Qwen2.5-Coder-7B-Instruct + +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen2.5-Coder-7B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct). 
+Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen2.5-Coder-7B-Instruct is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen2.5-Coder-7B-Instruct + revision: main + source: HF + quantization_config: + calib_max_seq_length: 2048 + calib_size: 2048 + quantization_type: no_quant + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/config.yaml new file mode 100644 index 00000000..469a6f16 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/config.yaml @@ -0,0 +1,36 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen2.5-Coder-7B-Instruct + revision: main + source: HF + quantization_config: + calib_max_seq_length: 2048 + calib_size: 2048 + quantization_type: no_quant + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/README.md new file mode 100644 index 00000000..c0438029 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/README.md @@ -0,0 +1,172 @@ +# TensorRT Torch Backend Briton with Qwen/Qwen3-30B-A3B-Instruct-2507 + +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen3-30B-A3B-Instruct-2507. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. 
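+
+If you want to try a different precision for this example, one option is to edit `config.yaml` programmatically before running `truss push`. The following is a minimal sketch, assuming PyYAML is installed and that it is run from this example's directory; the quantization values shown mirror the ones used across these configs (`fp4`, `fp8`, `fp8_kv`, `no_quant`).
+
+```python
+# Sketch: toggle the quantization mode of this example's config.yaml before pushing.
+import yaml
+
+with open("config.yaml") as f:
+    config = yaml.safe_load(f)
+
+# no_quant keeps float16/bfloat16 weights; the fp8/fp4 variants enable quantized deployments.
+config["trt_llm"]["build"]["quantization_type"] = "no_quant"
+
+with open("config.yaml", "w") as f:
+    yaml.safe_dump(config, f, sort_keys=False)
+```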
+ + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen3-30B-A3B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507). +Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen3-30B-A3B-Instruct-2507 is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp4`. 
This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen3-30B-A3B-Instruct-2507 + revision: main + source: HF + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/config.yaml new file mode 100644 index 00000000..3e5694aa --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen3-30B-A3B-Instruct-2507 + revision: main + source: HF + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/README.md new file mode 100644 index 00000000..a31f2758 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/README.md @@ -0,0 +1,173 @@ +# TensorRT Torch Backend Briton with Qwen/Qwen3-30B-A3B-Instruct-2507 + +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen3-30B-A3B-Instruct-2507. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. 
+- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen3-30B-A3B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507). +Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen3-30B-A3B-Instruct-2507 is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + +This model is quantized to FP8 for deployment, which is supported by Nvidia's newest GPUs e.g. H100, H100_40GB or L4. Quantization is optional, but leads to higher efficiency. + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! 
A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen3-30B-A3B-Instruct-2507 + revision: main + source: HF + quantization_type: fp8 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/config.yaml new file mode 100644 index 00000000..8240c488 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen3-30B-A3B-Instruct-2507 + revision: main + source: HF + quantization_type: fp8 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/README.md new file mode 100644 index 00000000..4e792f6d --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/README.md @@ -0,0 +1,173 @@ +# TensorRT Torch Backend Briton with Qwen/Qwen3-4B + +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen3-4B. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B). +Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen3-4B is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + +This model is quantized to FP8 for deployment, which is supported by Nvidia's newest GPUs e.g. H100, H100_40GB or L4. Quantization is optional, but leads to higher efficiency. + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. 
+ +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen3-4b-EngineV2-fp8-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-4b-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen3-4B + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. 
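+
+The example request in `model_metadata` sets `stream: true`. If you want to consume tokens as they are generated, the OpenAI client used above also supports streaming; the snippet below is a minimal sketch that reuses the same placeholder base URL.
+
+```python
+# Sketch: stream tokens from the deployed model via the OpenAI-compatible endpoint.
+from openai import OpenAI
+import os
+
+client = OpenAI(
+    api_key=os.environ["BASETEN_API_KEY"],
+    base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1",
+)
+
+stream = client.chat.completions.create(
+    model="not_required",
+    messages=[{"role": "user", "content": "Tell me everything you know about optimized inference."}],
+    max_tokens=512,
+    temperature=0.5,
+    stream=True,
+)
+
+for chunk in stream:
+    delta = chunk.choices[0].delta.content
+    if delta:
+        print(delta, end="", flush=True)
+print()
+```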
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/config.yaml new file mode 100644 index 00000000..34f11655 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-4b-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen3-4B + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/README.md b/11-embeddings-reranker-classification-tensorrt/README.md index 8211713b..69c5b0ef 100644 --- a/11-embeddings-reranker-classification-tensorrt/README.md +++ b/11-embeddings-reranker-classification-tensorrt/README.md @@ -88,12 +88,17 @@ Examples: - [Qwen/Qwen2.5-72B-Instruct-tp2-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-72b-instruct-tp2-fp8) - [Qwen/Qwen2.5-7B-Instruct-with-speculative-lookahead-decoding-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-7b-instruct-with-speculative-lookahead-decoding-fp8) - [Qwen/Qwen2.5-Coder-7B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4) + - [Qwen/Qwen2.5-Coder-7B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2) - [Qwen/Qwen2.5-Coder-7B-Instruct-min-latency-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-min-latency-fp8) + - [Qwen/Qwen3-30B-A3B-Instruct-2507-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8) + - [Qwen/Qwen3-30B-A3B-Instruct-2507-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp8) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp4) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8) + - [Qwen/Qwen3-4B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8) - 
[Qwen/Qwen3-8B-min-latency-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-8b-min-latency-fp8) - [deepseek-ai/DeepSeek-R1-Distill-Llama-70B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-fp8) + - [deepseek-ai/DeepSeek-R1-Distill-Llama-70B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4) - [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-qwen-32b-fp8) - [google/gemma-3-1b-it-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-google-gemma-3-1b-it) - [google/gemma-3-270m-it-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-google-gemma-3-270m-it) diff --git a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py index f5e03640..f14e7101 100644 --- a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py +++ b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py @@ -1692,6 +1692,42 @@ def llamalike_config_v2( ) ), ), + Deployment( + "Qwen/Qwen3-4B", + "Qwen/Qwen3-4B", + Accelerator.H100, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen3-4B", + quant=TrussTRTLLMQuantizationType.FP8_KV, + ) + ), + ), + Deployment( + "Qwen/Qwen3-30B-A3B-Instruct-2507", + "Qwen/Qwen3-30B-A3B-Instruct-2507", + Accelerator.H100, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen3-30B-A3B-Instruct-2507", + quant=TrussTRTLLMQuantizationType.FP8, + ) + ), + ), + Deployment( + "Qwen/Qwen3-30B-A3B-Instruct-2507", + "Qwen/Qwen3-30B-A3B-Instruct-2507", + Accelerator.B200, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen3-30B-A3B-Instruct-2507", + quant=TrussTRTLLMQuantizationType.FP4, + ) + ), + ), Deployment( "meta-llama/Llama-3.1-405B", "meta-llama/Llama-3.1-405B", @@ -1726,6 +1762,18 @@ def llamalike_config_v2( ) ), ), + Deployment( + "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + Accelerator.B200, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + quant=TrussTRTLLMQuantizationType.FP4_KV, + ) + ), + ), # Qwen/Qwen2.5-72B-Instruct Deployment( "Qwen/Qwen2.5-72B-Instruct-tp2", @@ -1866,6 +1914,18 @@ def llamalike_config_v2( ) ), ), + Deployment( + "Qwen/Qwen2.5-Coder-7B-Instruct", + "Qwen/Qwen2.5-Coder-7B-Instruct", + Accelerator.B200, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen2.5-Coder-7B-Instruct", + quant=TrussTRTLLMQuantizationType.NO_QUANT, + ) + ), + ), Deployment( "Qwen/Qwen3-8B-min-latency", "Qwen/Qwen3-8B",