From 951f99f1beb0242862d43707a2ee6440507d32f6 Mon Sep 17 00:00:00 2001
From: AaryamSharmaBaseten
Date: Thu, 2 Oct 2025 17:42:54 -0400
Subject: [PATCH 1/5] Add examples for inference stack v2

---
 .../README.md | 170 ++++++++++++++++++
 .../config.yaml | 34 ++++
 .../README.md | 170 ++++++++++++++++++
 .../config.yaml | 33 ++++
 .../README.md | 2 +
 5 files changed, 409 insertions(+)
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml

diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md
new file mode 100644
index 00000000..0a2f2650
--- /dev/null
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md
@@ -0,0 +1,170 @@
+# TensorRT-LLM Briton with meta-llama/Llama-3.2-3B-Instruct
+
+This is a deployment of TensorRT-LLM Briton with meta-llama/Llama-3.2-3B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for causal language models (e.g. Llama, Qwen, Mistral).
+
+With Briton you get the following benefits by default:
+- *Lowest-latency inference*, beating frameworks such as vLLM
+- *Highest-throughput inference*, automatically using XQA kernels, paged KV caching, and in-flight batching
+- *Distributed inference*: run large models (such as Llama-405B) tensor-parallel
+- *JSON-schema-based structured output* for any model
+- *Chunked prefill* for long generation tasks
+
+Optionally, you can also enable:
+- *Speculative decoding* using an external draft model or self-speculative decoding
+- *FP8 quantization* for deployments on H100, H200, and L4 GPUs
+
+With the V2 Config, you can also quantize models straight from Hugging Face in FP8 and FP4, and use KV caching.
+
+
+# Examples:
+This deployment is specifically designed for the Hugging Face model [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct).
+Suitable models can be identified by the `ForCausalLM` suffix in the model name. We currently support Llama, Qwen, and Mistral models, among others.
+
+meta-llama/Llama-3.2-3B-Instruct is a text-generation model, used to generate text given a prompt. It is frequently used in chatbots, text completion, structured output, and more.
+
+This model is quantized to FP8 for deployment, which is supported by NVIDIA's newest GPUs (e.g. H100, H100_40GB, or L4). Quantization is optional, but leads to higher efficiency.
+
+## Deployment with Truss
+
+Before deployment:
+
+1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
+2. Install the latest version of Truss: `pip install --upgrade truss`
+Note: this is a gated/private model. Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`.
Do not set the actual value of key in the config.yaml. `hf_access_token: null` is fine - the true value will be fetched from the secret store. + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. +Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of key in the config.yaml. `hf_access_token: null` is fine - the true value will be fetched from the secret store. +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100_40GB + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: meta-llama/Llama-3.2-3B-Instruct + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml new file mode 100644 index 00000000..4f793a47 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml @@ -0,0 +1,34 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100_40GB + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: meta-llama/Llama-3.2-3B-Instruct + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev1 + diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md new file mode 100644 index 00000000..860ec23e --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md @@ -0,0 +1,170 @@ +# TensorRT-LLM Briton with Qwen/Qwen3-32B + +This is a Deployment for TensorRT-LLM Briton with Qwen/Qwen3-32B. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B). +Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen3-32B is a text-generation model, used to generate text given a prompt. 
\nIt is frequently used in chatbots, text completion, structured output and more. + +This model is quantized to FP8 for deployment, which is supported by Nvidia's newest GPUs e.g. H100, H100_40GB or L4. Quantization is optional, but leads to higher efficiency. + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen3-32b-EngineV2-fp8-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-32b-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen3-32B + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml new file mode 100644 index 00000000..cc7e3b8f --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-32b-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen3-32B + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev1 diff --git a/11-embeddings-reranker-classification-tensorrt/README.md b/11-embeddings-reranker-classification-tensorrt/README.md index 1aa686e4..10960413 100644 --- a/11-embeddings-reranker-classification-tensorrt/README.md +++ b/11-embeddings-reranker-classification-tensorrt/README.md @@ -90,6 +90,7 @@ Examples: - [Qwen/Qwen2.5-Coder-7B-Instruct-min-latency-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-min-latency-fp8) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp8) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp4) + - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8) - [Qwen/Qwen3-8B-min-latency-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-8b-min-latency-fp8) - [deepseek-ai/DeepSeek-R1-Distill-Llama-70B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-fp8) - [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-qwen-32b-fp8) @@ -101,6 +102,7 @@ Examples: - 
[meta-llama/Llama-3.1-8B-Instruct-with-speculative-lookahead-decoding-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.1-8b-instruct-with-speculative-lookahead-decoding-fp8) - [meta-llama/Llama-3.2-1B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-1b-instruct-fp8) - [meta-llama/Llama-3.2-3B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-fp8) + - [meta-llama/Llama-3.2-3B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8) - [meta-llama/Llama-3.3-70B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-fp8) - [meta-llama/Llama-3.3-70B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-fp4) - [meta-llama/Llama-3.3-70B-Instruct-tp4-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp4-fp8) From f1fe307469531c66f3e4c8d5e536e2a90550363d Mon Sep 17 00:00:00 2001 From: AaryamSharmaBaseten Date: Thu, 2 Oct 2025 17:46:50 -0400 Subject: [PATCH 2/5] Updated generate_templates.py --- .../templating/generate_templates.py | 193 ++++++++++++++++++ 1 file changed, 193 insertions(+) diff --git a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py index d4e5a7d6..8d19d5c4 100644 --- a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py +++ b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py @@ -10,12 +10,14 @@ CheckpointRepository, CheckpointSource, TRTLLMConfiguration, + TRTLLMConfigurationV2, TrussSpeculatorConfiguration, TrussTRTLLMBuildConfiguration, TrussTRTLLMModel, TrussTRTLLMPluginConfiguration, TrussTRTLLMQuantizationType, TrussTRTLLMRuntimeConfiguration, + TRTLLMRuntimeConfigurationV2, TrussTRTQuantizationConfiguration, VersionsOverrides, ) @@ -27,6 +29,7 @@ Resources, TrussConfig, ) +import yaml REPO_URL = "https://github.com/basetenlabs/truss-examples" SUBFOLDER = Path("11-embeddings-reranker-classification-tensorrt") @@ -302,6 +305,83 @@ def make_truss_config(self, dp): ) +@dataclasses.dataclass +class BritonV2(Solution): + name: str = "TensorRT-LLM Briton" + nickname: str = "Briton" + benefits: str = """Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. 
+- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. +""" + + def make_truss_config(self, dp): + hf_cfg = AutoConfig.from_pretrained( + dp.hf_model_id, trust_remote_code=True + ) # make sure model is available + max_position_embeddings = hf_cfg.max_position_embeddings + assert self.trt_config is not None + self.trt_config.runtime.max_seq_len = max_position_embeddings + assert max_position_embeddings >= 512, "Model needs to have at least 512 tokens" + if self.trt_config.runtime is not None: + self.trt_config.runtime.max_seq_len = min( + self.trt_config.runtime.max_seq_len, 32768 + ) + self.trt_config.runtime.max_num_tokens = self.trt_config.runtime.max_seq_len + + if ( + hf_cfg.model_type in ["qwen2", "qwen2_moe"] + and self.trt_config.build.quantization_type is not None + ): + if ( + self.trt_config.build.quantization_type + == TrussTRTLLMQuantizationType.FP8_KV + ): + raise ValueError( + f"Qwen2 models do not support FP8_KV quantization / have quality issues with this dtype - please use regular FP8 for now in the model library {dp.hf_model_id}" + ) + # increase the quantization example size for qwen2 models + self.trt_config.build.quantization_config = ( + TrussTRTQuantizationConfiguration( + calib_size=2048, + calib_max_seq_length=min(2048, self.trt_config.runtime.max_seq_len), + ) + ) + + overrides_engine_builder = ENGINE_BUILDER_VERSION + overrides_briton = BRITON_VERSION + + if overrides_engine_builder is not None or overrides_briton is not None: + version_overrides = VersionsOverrides( + engine_builder_version=overrides_engine_builder, + briton_version=overrides_briton, + ) + self.trt_config.root.version_overrides = version_overrides + + return TrussConfig( + model_metadata=dp.task.model_metadata, + resources=Resources( + accelerator=AcceleratorSpec( + accelerator=dp.accelerator, + count=1, + ), + memory="10Gi", + ), + model_name=dp.model_nickname, + trt_llm=self.trt_config, + ) + + @dataclasses.dataclass class Embedder(Task): purpose: str = ( @@ -836,6 +916,7 @@ def folder_name(self): self.solution.nickname + "-" + self.name.replace(" ", "-").replace("/", "-").lower() + + ("-EngineV2" if isinstance(self.solution, BritonV2) else "") + ("-fp8" * self.is_fp8) + ("-fp4" * self.is_fp4) ) @@ -845,6 +926,44 @@ def model_nickname(self): return self.folder_name + "-truss-example" +def add_inference_v2_stack(path: Path, dep: Deployment) -> None: + """ + Edits the YAML at `path` in-place: + - Only if `should_inject` is True + - Adds `inference_stack: v2` INSIDE the `trt_llm` mapping + """ + if not isinstance(dep.solution, BritonV2): + return + + data = yaml.safe_load(path.read_text()) + trt_llm = data.get("trt_llm") + if isinstance(trt_llm, dict): + # Build new dict with inference_stack first + new_trt = {"inference_stack": "v2"} + new_trt.update({k: v for k, v in trt_llm.items() if k != "inference_stack"}) + data["trt_llm"] = new_trt + path.write_text(yaml.safe_dump(data, sort_keys=False)) + + +def add_base_model_override(path: Path, dep: Deployment) -> None: + """ + Edits the YAML at `path` in-place: + - Only if `should_inject` is True + - Adds `base_model: ...` 
INSIDE the `trt_llm` mapping + """ + if not isinstance(dep.solution, BritonV2): + return + + data = yaml.safe_load(path.read_text()) + build_details = data.get("trt_llm").get("build") + if isinstance(build_details, dict): + # Build new dict with base_model first + new_build = {"base_model": "decoder"} + new_build.update({k: v for k, v in build_details.items() if k != "base_model"}) + data["trt_llm"]["build"] = new_build + path.write_text(yaml.safe_dump(data, sort_keys=False)) + + def generate_deployment(dp: Deployment): root = Path(__file__).parent.parent.parent assert root.name == ROOT_NAME.name, "This script has been moved" @@ -887,6 +1006,9 @@ def generate_deployment(dp: Deployment): header = "# this file was autogenerated by `generate_templates.py` - please do change via template only\n" Path(config_yaml_path).write_text(header + config_yaml_as_str) + add_inference_v2_stack(config_yaml_path, dp) + add_base_model_override(config_yaml_path, dp) + README_SUBREPO = f"""# {dp.solution.make_headline(dp)} This is a Deployment for {dp.solution.make_headline(dp)}. {dp.solution.benefits} @@ -1336,6 +1458,41 @@ def llamalike_spec_dec( return config +def llamalike_config_v2( + quant: TrussTRTLLMQuantizationType = TrussTRTLLMQuantizationType.FP8_KV, + repoid="meta-llama/Llama-3.3-70B-Instruct", + max_batch_size: int = 32, +): + # config for meta-llama/Llama-3.3-70B-Instruct (FP8) + build_kwargs = dict() + runtime_kwargs = dict() + + config = TRTLLMConfigurationV2( + build=TrussTRTLLMBuildConfiguration( + checkpoint_repository=CheckpointRepository( + repo=repoid, + revision="main", + source=CheckpointSource.HF, + ), + quantization_type=quant, + **build_kwargs, + ), + runtime=TRTLLMRuntimeConfigurationV2( + max_seq_len=1000001, # dummy for now + max_batch_size=max_batch_size, + **runtime_kwargs, + ), + ) + + if quant in [ + TrussTRTLLMQuantizationType.WEIGHTS_INT4_KV_INT8, + ]: + config.build.plugin_configuration.use_paged_context_fmha = False + config.build.plugin_configuration.use_fp8_context_fmha = False + config.runtime.enable_chunked_context = False + return config + + DEPLOYMENTS_BRITON = [ Deployment( "meta-llama/Llama-3.3-70B-Instruct", @@ -1358,6 +1515,18 @@ def llamalike_spec_dec( ) ), ), + Deployment( + "meta-llama/Llama-3.3-70B-Instruct", + "meta-llama/Llama-3.3-70B-Instruct", + Accelerator.B200, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="meta-llama/Llama-3.3-70B-Instruct", + quant=TrussTRTLLMQuantizationType.FP4, + ) + ), + ), Deployment( "meta-llama/Llama-3.3-70B-Instruct-tp4", "meta-llama/Llama-3.3-70B-Instruct", @@ -1382,6 +1551,18 @@ def llamalike_spec_dec( ) ), ), + Deployment( + "meta-llama/Llama-3.2-3B-Instruct", + "meta-llama/Llama-3.2-3B-Instruct", + Accelerator.H100_40GB, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="meta-llama/Llama-3.2-3B-Instruct", + quant=TrussTRTLLMQuantizationType.FP8_KV, + ) + ), + ), Deployment( "meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct", @@ -1480,6 +1661,18 @@ def llamalike_spec_dec( ) ), ), + Deployment( + "Qwen/Qwen3-32B", + "Qwen/Qwen3-32B", + Accelerator.H100, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen3-32B", + quant=TrussTRTLLMQuantizationType.FP8_KV, + ) + ), + ), Deployment( "meta-llama/Llama-3.1-405B", "meta-llama/Llama-3.1-405B", From 9c638f21093752eb27a7085c23915dabbf059528 Mon Sep 17 00:00:00 2001 From: AaryamSharmaBaseten Date: Thu, 2 Oct 2025 17:50:29 -0400 Subject: [PATCH 3/5] Updated Readme --- 
.../README.md | 4 ++-- .../Briton-qwen-qwen3-32b-EngineV2-fp8/README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md index 0a2f2650..69b76ef6 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md @@ -1,6 +1,6 @@ -# TensorRT-LLM Briton with meta-llama/Llama-3.2-3B-Instruct +# Tensorflow Briton with meta-llama/Llama-3.2-3B-Instruct -This is a Deployment for TensorRT-LLM Briton with meta-llama/Llama-3.2-3B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) +This is a Deployment for Tensorflow Briton with meta-llama/Llama-3.2-3B-Instruct. Briton is Baseten's solution for production-grade deployments via Tensorflow for Causal Language Models models. (e.g. LLama, Qwen, Mistral) With Briton you get the following benefits by default: - *Lowest-latency* latency, beating frameworks such as vllm diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md index 860ec23e..78939979 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md @@ -1,6 +1,6 @@ -# TensorRT-LLM Briton with Qwen/Qwen3-32B +# Tensorflow Briton with Qwen/Qwen3-32B -This is a Deployment for TensorRT-LLM Briton with Qwen/Qwen3-32B. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) +This is a Deployment for Tensorflow Briton with Qwen/Qwen3-32B. Briton is Baseten's solution for production-grade deployments via Tensorflow for Causal Language Models models. (e.g. 
LLama, Qwen, Mistral) With Briton you get the following benefits by default: - *Lowest-latency* latency, beating frameworks such as vllm From 2e5f0940e57905d020d36dd817b01d01c4702b7f Mon Sep 17 00:00:00 2001 From: AaryamSharmaBaseten Date: Fri, 3 Oct 2025 14:29:51 -0400 Subject: [PATCH 4/5] Bump up engine builder version --- .../README.md | 7 +- .../config.yaml | 3 +- .../README.md | 172 +++++++++++++++++ .../config.yaml | 33 ++++ .../README.md | 175 ++++++++++++++++++ .../config.yaml | 36 ++++ .../README.md | 7 +- .../config.yaml | 2 +- .../README.md | 2 + .../templating/generate_templates.py | 48 ++++- 10 files changed, 476 insertions(+), 9 deletions(-) create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/README.md create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/config.yaml create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/README.md create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/config.yaml diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md index 69b76ef6..7c3c746e 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/README.md @@ -1,6 +1,6 @@ -# Tensorflow Briton with meta-llama/Llama-3.2-3B-Instruct +# TensorRT Torch Backend Briton with meta-llama/Llama-3.2-3B-Instruct -This is a Deployment for Tensorflow Briton with meta-llama/Llama-3.2-3B-Instruct. Briton is Baseten's solution for production-grade deployments via Tensorflow for Causal Language Models models. (e.g. LLama, Qwen, Mistral) +This is a Deployment for TensorRT Torch Backend Briton with meta-llama/Llama-3.2-3B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. 
LLama, Qwen, Mistral) With Briton you get the following benefits by default: - *Lowest-latency* latency, beating frameworks such as vllm @@ -163,6 +163,9 @@ trt_llm: max_batch_size: 32 max_num_tokens: 32768 max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 ``` diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml index 4f793a47..a908d803 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8/config.yaml @@ -30,5 +30,4 @@ trt_llm: max_seq_len: 32768 version_overrides: briton_version: null - engine_builder_version: 0.20.0.post13.dev1 - + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/README.md new file mode 100644 index 00000000..3e7f2ed7 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/README.md @@ -0,0 +1,172 @@ +# TensorRT Torch Backend Briton with meta-llama/Llama-3.3-70B-Instruct + +This is a Deployment for TensorRT Torch Backend Briton with meta-llama/Llama-3.3-70B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct). +Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +meta-llama/Llama-3.3-70B-Instruct is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` +Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of key in the config.yaml. 
`hf_access_token: null` is fine - the true value will be fetched from the secret store. + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp4`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. +Note: [This is a gated/private model] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Do not set the actual value of key in the config.yaml. `hf_access_token: null` is fine - the true value will be fetched from the secret store. +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: meta-llama/Llama-3.3-70B-Instruct + revision: main + source: HF + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/config.yaml new file mode 100644 index 00000000..003fe7b1 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: meta-llama/Llama-3.3-70B-Instruct + revision: main + source: HF + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/README.md new file mode 100644 index 00000000..056a072c --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/README.md @@ -0,0 +1,175 @@ +# TensorRT Torch Backend Briton with Qwen/Qwen2.5-Coder-7B-Instruct + +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen2.5-Coder-7B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct). 
+Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen2.5-Coder-7B-Instruct is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp4`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. 
+ +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen2.5-Coder-7B-Instruct + revision: main + source: HF + quantization_config: + calib_max_seq_length: 2048 + calib_size: 2048 + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/config.yaml new file mode 100644 index 00000000..ee4e5c1b --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4/config.yaml @@ -0,0 +1,36 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen2.5-Coder-7B-Instruct + revision: main + source: HF + quantization_config: + calib_max_seq_length: 2048 + calib_size: 2048 + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md index 78939979..eca3c52f 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/README.md @@ -1,6 +1,6 @@ -# Tensorflow Briton with Qwen/Qwen3-32B +# TensorRT Torch Backend Briton with Qwen/Qwen3-32B -This is a Deployment for Tensorflow Briton with Qwen/Qwen3-32B. Briton is Baseten's solution for production-grade deployments via Tensorflow for Causal Language Models models. (e.g. LLama, Qwen, Mistral) +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen3-32B. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. 
LLama, Qwen, Mistral) With Briton you get the following benefits by default: - *Lowest-latency* latency, beating frameworks such as vllm @@ -163,6 +163,9 @@ trt_llm: max_batch_size: 32 max_num_tokens: 32768 max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 ``` diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml index cc7e3b8f..e79361a3 100644 --- a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8/config.yaml @@ -30,4 +30,4 @@ trt_llm: max_seq_len: 32768 version_overrides: briton_version: null - engine_builder_version: 0.20.0.post13.dev1 + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/README.md b/11-embeddings-reranker-classification-tensorrt/README.md index 10960413..8211713b 100644 --- a/11-embeddings-reranker-classification-tensorrt/README.md +++ b/11-embeddings-reranker-classification-tensorrt/README.md @@ -87,6 +87,7 @@ Examples: - [Qwen/QwQ-32B-reasoning-with-speculative-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwq-32b-reasoning-with-speculative-fp8) - [Qwen/Qwen2.5-72B-Instruct-tp2-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-72b-instruct-tp2-fp8) - [Qwen/Qwen2.5-7B-Instruct-with-speculative-lookahead-decoding-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-7b-instruct-with-speculative-lookahead-decoding-fp8) + - [Qwen/Qwen2.5-Coder-7B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4) - [Qwen/Qwen2.5-Coder-7B-Instruct-min-latency-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-min-latency-fp8) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp8) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp4) @@ -105,6 +106,7 @@ Examples: - [meta-llama/Llama-3.2-3B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.2-3b-instruct-EngineV2-fp8) - [meta-llama/Llama-3.3-70B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-fp8) - [meta-llama/Llama-3.3-70B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-fp4) + - [meta-llama/Llama-3.3-70B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-EngineV2-fp4) - 
[meta-llama/Llama-3.3-70B-Instruct-tp4-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-meta-llama-llama-3.3-70b-instruct-tp4-fp8) - [microsoft/phi-4-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-microsoft-phi-4-fp8) - [mistralai/Mistral-7B-Instruct-v0.3-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-mistralai-mistral-7b-instruct-v0.3) diff --git a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py index 8d19d5c4..f5e03640 100644 --- a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py +++ b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py @@ -36,6 +36,7 @@ ROOT_NAME = Path(REPO_URL.split("/")[-1]) BEI_VERSION = os.environ.get("BEI") ENGINE_BUILDER_VERSION = os.environ.get("ENGINE_BUILDER") +ENGINE_V2_BUILDER_VERSION = os.environ.get("ENGINE_BUILDER_V2") BRITON_VERSION = os.environ.get("BRITON") @@ -307,7 +308,7 @@ def make_truss_config(self, dp): @dataclasses.dataclass class BritonV2(Solution): - name: str = "TensorRT-LLM Briton" + name: str = "TensorRT Torch Backend Briton" nickname: str = "Briton" benefits: str = """Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) @@ -358,7 +359,7 @@ def make_truss_config(self, dp): ) ) - overrides_engine_builder = ENGINE_BUILDER_VERSION + overrides_engine_builder = ENGINE_V2_BUILDER_VERSION overrides_briton = BRITON_VERSION if overrides_engine_builder is not None or overrides_briton is not None: @@ -1429,6 +1430,24 @@ def llamalike_lookahead( return config +def llamalike_lookahead_v2( + quant: TrussTRTLLMQuantizationType = TrussTRTLLMQuantizationType.FP8_KV, + repoid="meta-llama/Llama-3.3-70B-Instruct", + use_dynamic_lengths: bool = False, + **kwargs, +): + config = llamalike_config_v2(quant, repoid, **kwargs) + config.build.speculator = TrussSpeculatorConfiguration( + # settings from https://arxiv.org/pdf/2402.02057 + speculative_decoding_mode="LOOKAHEAD_DECODING", + lookahead_windows_size=3 if not use_dynamic_lengths else 1, + lookahead_ngram_size=8 if not use_dynamic_lengths else 32, + lookahead_verification_set_size=3 if not use_dynamic_lengths else 1, + enable_b10_lookahead=True, # + ) + return config + + def llamalike_spec_dec( quant: TrussTRTLLMQuantizationType = TrussTRTLLMQuantizationType.FP8_KV, tp=1, @@ -1822,6 +1841,31 @@ def llamalike_config_v2( ) ), ), + # Deployment( + # "Qwen/Qwen2.5-Coder-7B-Instruct-min-latency", + # "Qwen/Qwen2.5-Coder-7B-Instruct", + # Accelerator.B200, + # TextGen(), + # solution=BritonV2( + # trt_config=llamalike_lookahead_v2( + # repoid="Qwen/Qwen2.5-Coder-7B-Instruct", + # use_dynamic_lengths=True, + # quant=TrussTRTLLMQuantizationType.FP4, + # ) + # ), + # ), + Deployment( + "Qwen/Qwen2.5-Coder-7B-Instruct", + "Qwen/Qwen2.5-Coder-7B-Instruct", + Accelerator.B200, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen2.5-Coder-7B-Instruct", + quant=TrussTRTLLMQuantizationType.FP4, + ) + ), + ), Deployment( "Qwen/Qwen3-8B-min-latency", "Qwen/Qwen3-8B", From b7278282ff231d0bab608267f48fb35e324d0a4d Mon Sep 17 00:00:00 2001 From: AaryamSharmaBaseten Date: Mon, 6 Oct 2025 23:25:34 +0000 Subject: [PATCH 5/5] Added 
more examples
---
 .../README.md | 172 +++++++++++++++++
 .../config.yaml | 33 ++++
 .../README.md | 175 ++++++++++++++++++
 .../config.yaml | 36 ++++
 .../README.md | 172 +++++++++++++++++
 .../config.yaml | 33 ++++
 .../README.md | 173 +++++++++++++++++
 .../config.yaml | 33 ++++
 .../README.md | 173 +++++++++++++++++
 .../config.yaml | 33 ++++
 .../README.md | 5 +
 .../templating/generate_templates.py | 60 ++++++
 12 files changed, 1098 insertions(+)
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/config.yaml
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/README.md
 create mode 100644 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/config.yaml

diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/README.md
new file mode 100644
index 00000000..280b856f
--- /dev/null
+++ b/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/README.md
@@ -0,0 +1,172 @@
+# TensorRT Torch Backend Briton with deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+
+This is a deployment of TensorRT Torch Backend Briton with deepseek-ai/DeepSeek-R1-Distill-Llama-70B. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for causal language models (e.g. Llama, Qwen, Mistral).
+
+With Briton you get the following benefits by default:
+- *Lowest-latency inference*, beating frameworks such as vLLM
+- *Highest-throughput inference*, automatically using XQA kernels, paged KV caching, and in-flight batching
+- *Distributed inference*: run large models (such as Llama-405B) tensor-parallel
+- *JSON-schema-based structured output* for any model
+- *Chunked prefill* for long generation tasks
+
+Optionally, you can also enable:
+- *Speculative decoding* using an external draft model or self-speculative decoding
+- *FP8 quantization* for deployments on H100, H200, and L4 GPUs
+
+With the V2 Config, you can also quantize models straight from Hugging Face in FP8 and FP4, and use KV caching.
+
+
+# Examples:
+This deployment is specifically designed for the Hugging Face model [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B).
+Suitable models can be identified by the `ForCausalLM` suffix in the model name. We currently support Llama, Qwen, and Mistral models, among others.
+ +deepseek-ai/DeepSeek-R1-Distill-Llama-70B is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp4_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + revision: main + source: HF + quantization_type: fp4_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/config.yaml new file mode 100644 index 00000000..761e61ea --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + revision: main + source: HF + quantization_type: fp4_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/README.md new file mode 100644 index 00000000..fd28a1f1 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/README.md @@ -0,0 +1,175 @@ +# TensorRT Torch Backend Briton with Qwen/Qwen2.5-Coder-7B-Instruct + +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen2.5-Coder-7B-Instruct. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct). 
+Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen2.5-Coder-7B-Instruct is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen2.5-Coder-7B-Instruct + revision: main + source: HF + quantization_config: + calib_max_seq_length: 2048 + calib_size: 2048 + quantization_type: no_quant + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/config.yaml new file mode 100644 index 00000000..469a6f16 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2/config.yaml @@ -0,0 +1,36 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen2.5-Coder-7B-Instruct + revision: main + source: HF + quantization_config: + calib_max_seq_length: 2048 + calib_size: 2048 + quantization_type: no_quant + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/README.md new file mode 100644 index 00000000..c0438029 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/README.md @@ -0,0 +1,172 @@ +# TensorRT Torch Backend Briton with Qwen/Qwen3-30B-A3B-Instruct-2507 + +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen3-30B-A3B-Instruct-2507. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. 
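+
+If you want to try a different precision for this example, one option is to edit `config.yaml` programmatically before running `truss push`. The following is a minimal sketch, assuming PyYAML is installed and that it is run from this example's directory; the quantization values shown mirror the ones used across these configs (`fp4`, `fp8`, `fp8_kv`, `no_quant`).
+
+```python
+# Sketch: toggle the quantization mode of this example's config.yaml before pushing.
+import yaml
+
+with open("config.yaml") as f:
+    config = yaml.safe_load(f)
+
+# no_quant keeps float16/bfloat16 weights; the fp8/fp4 variants enable quantized deployments.
+config["trt_llm"]["build"]["quantization_type"] = "no_quant"
+
+with open("config.yaml", "w") as f:
+    yaml.safe_dump(config, f, sort_keys=False)
+```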
+ + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen3-30B-A3B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507). +Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen3-30B-A3B-Instruct-2507 is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp4`. 
This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen3-30B-A3B-Instruct-2507 + revision: main + source: HF + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/config.yaml new file mode 100644 index 00000000..3e5694aa --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4-truss-example +python_version: py39 +resources: + accelerator: B200 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen3-30B-A3B-Instruct-2507 + revision: main + source: HF + quantization_type: fp4 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/README.md new file mode 100644 index 00000000..a31f2758 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/README.md @@ -0,0 +1,173 @@ +# TensorRT Torch Backend Briton with Qwen/Qwen3-30B-A3B-Instruct-2507 + +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen3-30B-A3B-Instruct-2507. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. 
+- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen3-30B-A3B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507). +Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen3-30B-A3B-Instruct-2507 is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + +This model is quantized to FP8 for deployment, which is supported by Nvidia's newest GPUs e.g. H100, H100_40GB or L4. Quantization is optional, but leads to higher efficiency. + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. + +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! 
A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen3-30B-A3B-Instruct-2507 + revision: main + source: HF + quantization_type: fp8 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/config.yaml new file mode 100644 index 00000000..8240c488 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. 
+ role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen3-30B-A3B-Instruct-2507 + revision: main + source: HF + quantization_type: fp8 + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/README.md b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/README.md new file mode 100644 index 00000000..4e792f6d --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/README.md @@ -0,0 +1,173 @@ +# TensorRT Torch Backend Briton with Qwen/Qwen3-4B + +This is a Deployment for TensorRT Torch Backend Briton with Qwen/Qwen3-4B. Briton is Baseten's solution for production-grade deployments via TensorRT-LLM for Causal Language Models models. (e.g. LLama, Qwen, Mistral) + +With Briton you get the following benefits by default: +- *Lowest-latency* latency, beating frameworks such as vllm +- *Highest-throughput* inference, automatically using XQA kernels, paged kv caching and inflight batching. +- *distributed inference* run large models (such as LLama-405B) tensor-parallel +- *json-schema based structured output for any model* +- *chunked prefilling* for long generation tasks + +Optionally, you can also enable: +- *speculative decoding* using an external draft model or self-speculative decoding +- *fp8 quantization* deployments on H100, H200 and L4 GPUs + +With the V2 Config, you can now also quantize models straight from huggingface in FP8 and FP4, and also use KV Caching. + + +# Examples: +This deployment is specifically designed for the Hugging Face model [Qwen/Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B). +Suitable models can be identified by the `ForCausalLM` suffix in the model name. Currently we support e.g. LLama, Qwen, Mistral models. + +Qwen/Qwen3-4B is a text-generation model, used to generate text given a prompt. \nIt is frequently used in chatbots, text completion, structured output and more. + +This model is quantized to FP8 for deployment, which is supported by Nvidia's newest GPUs e.g. H100, H100_40GB or L4. Quantization is optional, but leads to higher efficiency. + +## Deployment with Truss + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. Install the latest version of Truss: `pip install --upgrade truss` + + +First, clone this repository: +```sh +git clone https://github.com/basetenlabs/truss-examples.git +cd 11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8 +``` + +With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted. 
+ +```sh +truss push --publish +# prints: +# ✨ Model Briton-qwen-qwen3-4b-EngineV2-fp8-truss-example was successfully pushed ✨ +# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx +``` + +## Call your model + +### OpenAI compatible inference +This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model. + +```python +from openai import OpenAI +import os + +client = OpenAI( + api_key=os.environ['BASETEN_API_KEY'], + base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1" +) + +# Default completion +response_completion = client.completions.create( + model="not_required", + prompt="Q: Tell me everything about Baseten.co! A:", + temperature=0.3, + max_tokens=100, +) + +# Chat completion +response_chat = client.chat.completions.create( + model="", + messages=[ + {"role": "user", "content": "Tell me everything about Baseten.co!"} + ], + temperature=0.3, + max_tokens=100, +) + +# Structured output +from pydantic import BaseModel + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.beta.chat.completions.parse( + model="not_required", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."}, + ], + response_format=CalendarEvent, +) + +event = completion.choices[0].message.parsed + +# If you model supports tool-calling, you can use the following example: +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. Bogotá, Colombia" + } + }, + "required": [ + "location" + ], + "additionalProperties": False + }, + "strict": True + } +}] + +completion = client.chat.completions.create( + model="not_required", + messages=[{"role": "user", "content": "What is the weather like in Paris today?"}], + tools=tools +) + +print(completion.choices[0].message.tool_calls) +``` + + +## Config.yaml +By default, the following configuration is used for this deployment. This config uses `quantization_type=fp8_kv`. This is optional, remove the `quantization_type` field or set it to `no_quant` for float16/bfloat16. + +```yaml +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-4b-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + build: + checkpoint_repository: + repo: Qwen/Qwen3-4B + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 + +``` + +## Support +If you have any questions or need assistance, please open an issue in this repository or contact our support team. 
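+
+The example request in `model_metadata` sets `stream: true`. If you want to consume tokens as they are generated, the OpenAI client used above also supports streaming; the snippet below is a minimal sketch that reuses the same placeholder base URL.
+
+```python
+# Sketch: stream tokens from the deployed model via the OpenAI-compatible endpoint.
+from openai import OpenAI
+import os
+
+client = OpenAI(
+    api_key=os.environ["BASETEN_API_KEY"],
+    base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1",
+)
+
+stream = client.chat.completions.create(
+    model="not_required",
+    messages=[{"role": "user", "content": "Tell me everything you know about optimized inference."}],
+    max_tokens=512,
+    temperature=0.5,
+    stream=True,
+)
+
+for chunk in stream:
+    delta = chunk.choices[0].delta.content
+    if delta:
+        print(delta, end="", flush=True)
+print()
+```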
diff --git a/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/config.yaml b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/config.yaml new file mode 100644 index 00000000..34f11655 --- /dev/null +++ b/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8/config.yaml @@ -0,0 +1,33 @@ +model_metadata: + example_model_input: + max_tokens: 512 + messages: + - content: Tell me everything you know about optimized inference. + role: user + stream: true + temperature: 0.5 + tags: + - openai-compatible +model_name: Briton-qwen-qwen3-4b-EngineV2-fp8-truss-example +python_version: py39 +resources: + accelerator: H100 + cpu: '1' + memory: 10Gi + use_gpu: true +trt_llm: + inference_stack: v2 + build: + base_model: decoder + checkpoint_repository: + repo: Qwen/Qwen3-4B + revision: main + source: HF + quantization_type: fp8_kv + runtime: + max_batch_size: 32 + max_num_tokens: 32768 + max_seq_len: 32768 + version_overrides: + briton_version: null + engine_builder_version: 0.20.0.post13.dev3 diff --git a/11-embeddings-reranker-classification-tensorrt/README.md b/11-embeddings-reranker-classification-tensorrt/README.md index 8211713b..69c5b0ef 100644 --- a/11-embeddings-reranker-classification-tensorrt/README.md +++ b/11-embeddings-reranker-classification-tensorrt/README.md @@ -88,12 +88,17 @@ Examples: - [Qwen/Qwen2.5-72B-Instruct-tp2-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-72b-instruct-tp2-fp8) - [Qwen/Qwen2.5-7B-Instruct-with-speculative-lookahead-decoding-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-7b-instruct-with-speculative-lookahead-decoding-fp8) - [Qwen/Qwen2.5-Coder-7B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2-fp4) + - [Qwen/Qwen2.5-Coder-7B-Instruct-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-EngineV2) - [Qwen/Qwen2.5-Coder-7B-Instruct-min-latency-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2.5-coder-7b-instruct-min-latency-fp8) + - [Qwen/Qwen3-30B-A3B-Instruct-2507-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp8) + - [Qwen/Qwen3-30B-A3B-Instruct-2507-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-30b-a3b-instruct-2507-EngineV2-fp4) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp8) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-fp4) - [Qwen/Qwen3-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-32b-EngineV2-fp8) + - [Qwen/Qwen3-4B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-4b-EngineV2-fp8) - 
[Qwen/Qwen3-8B-min-latency-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen3-8b-min-latency-fp8) - [deepseek-ai/DeepSeek-R1-Distill-Llama-70B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-fp8) + - [deepseek-ai/DeepSeek-R1-Distill-Llama-70B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-llama-70b-EngineV2-fp4) - [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-deepseek-ai-deepseek-r1-distill-qwen-32b-fp8) - [google/gemma-3-1b-it-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-google-gemma-3-1b-it) - [google/gemma-3-270m-it-Briton](https://github.com/basetenlabs/truss-examples/tree/main/11-embeddings-reranker-classification-tensorrt/Briton-google-gemma-3-270m-it) diff --git a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py index f5e03640..f14e7101 100644 --- a/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py +++ b/11-embeddings-reranker-classification-tensorrt/templating/generate_templates.py @@ -1692,6 +1692,42 @@ def llamalike_config_v2( ) ), ), + Deployment( + "Qwen/Qwen3-4B", + "Qwen/Qwen3-4B", + Accelerator.H100, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen3-4B", + quant=TrussTRTLLMQuantizationType.FP8_KV, + ) + ), + ), + Deployment( + "Qwen/Qwen3-30B-A3B-Instruct-2507", + "Qwen/Qwen3-30B-A3B-Instruct-2507", + Accelerator.H100, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen3-30B-A3B-Instruct-2507", + quant=TrussTRTLLMQuantizationType.FP8, + ) + ), + ), + Deployment( + "Qwen/Qwen3-30B-A3B-Instruct-2507", + "Qwen/Qwen3-30B-A3B-Instruct-2507", + Accelerator.B200, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen3-30B-A3B-Instruct-2507", + quant=TrussTRTLLMQuantizationType.FP4, + ) + ), + ), Deployment( "meta-llama/Llama-3.1-405B", "meta-llama/Llama-3.1-405B", @@ -1726,6 +1762,18 @@ def llamalike_config_v2( ) ), ), + Deployment( + "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + Accelerator.B200, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + quant=TrussTRTLLMQuantizationType.FP4_KV, + ) + ), + ), # Qwen/Qwen2.5-72B-Instruct Deployment( "Qwen/Qwen2.5-72B-Instruct-tp2", @@ -1866,6 +1914,18 @@ def llamalike_config_v2( ) ), ), + Deployment( + "Qwen/Qwen2.5-Coder-7B-Instruct", + "Qwen/Qwen2.5-Coder-7B-Instruct", + Accelerator.B200, + TextGen(), + solution=BritonV2( + trt_config=llamalike_config_v2( + repoid="Qwen/Qwen2.5-Coder-7B-Instruct", + quant=TrussTRTLLMQuantizationType.NO_QUANT, + ) + ), + ), Deployment( "Qwen/Qwen3-8B-min-latency", "Qwen/Qwen3-8B",