# TensorRT-LLM Briton with Qwen/Qwen2-57B-A14B-MoE-int4

This is a deployment for TensorRT-LLM Briton with Qwen/Qwen2-57B-A14B-MoE-int4. Briton is Baseten's solution for production-grade deployments of causal language models (e.g. Llama, Qwen, Mistral) via TensorRT-LLM.

With Briton you get the following benefits by default:
- *Lowest-latency* inference, beating frameworks such as vLLM
- *Highest-throughput* inference, automatically using XQA kernels, paged KV caching, and in-flight batching
- *Distributed inference*: run large models (such as Llama-405B) with tensor parallelism
- *JSON-schema-based structured output* for any model
- *Chunked prefill* for long-prompt workloads

Optionally, you can also enable:
- *Speculative decoding* using an external draft model or self-speculative decoding
- *fp8 quantization* for deployments on H100, H200, and L4 GPUs (see the sketch below)

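As a rough sketch of the fp8 option, the `quantization_type` field in the `trt_llm.build` section of `config.yaml` (shown in full further below) would change along these lines. The accelerator choice and field values here are assumptions based on the int4 config below, not a verified recipe:

```yaml
# Hypothetical sketch: switching from int4 weight-only quantization to fp8.
# fp8 requires H100, H200, or L4 GPUs; it is not supported on A100.
resources:
  accelerator: H100   # assumption: a single H100 suffices for this sketch
trt_llm:
  build:
    quantization_type: fp8   # instead of weights_int4; remove for float16/bfloat16
```
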
## Examples

This deployment is specifically designed for the Hugging Face model [Qwen/Qwen2-57B-A14B-Instruct](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct).
Suitable models can be identified by the `ForCausalLM` suffix in their architecture name. We currently support Llama, Qwen, and Mistral models, among others.

Qwen/Qwen2-57B-A14B-Instruct is a text-generation model, used to generate text given a prompt.
It is frequently used for chatbots, text completion, structured output, and more.

## Deployment with Truss

Before deployment:

1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
2. Install the latest version of Truss: `pip install --upgrade truss`

First, clone this repository:
```sh
git clone https://github.com/basetenlabs/truss-examples.git
cd truss-examples/11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2-57b-a14b-moe-int4
```

With `11-embeddings-reranker-classification-tensorrt/Briton-qwen-qwen2-57b-a14b-moe-int4` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted.

```sh
truss push --publish
# prints:
# ✨ Model Briton-qwen-qwen2-57b-a14b-moe-int4-truss-example was successfully pushed ✨
# 🪵 View logs for your deployment at https://app.baseten.co/models/yyyyyy/logs/xxxxxx
```
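
Once the push completes, you can smoke-test the deployment against Baseten's standard predict endpoint. This is a minimal sketch: the `model-xxxxxx` placeholder comes from your `truss push` output, and the payload is borrowed from the `example_model_input` in the `config.yaml` below; adapt both as needed.

```python
import os
import requests

# Hypothetical smoke test against Baseten's standard predict route.
# Replace model-xxxxxx with your model ID from the `truss push` output.
resp = requests.post(
    "https://model-xxxxxx.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"},
    json={
        "messages": [
            {"role": "user", "content": "Tell me everything you know about optimized inference."}
        ],
        "max_tokens": 512,
        "temperature": 0.5,
    },
)
print(resp.json())
```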

## Call your model

### OpenAI compatible inference
This solution is OpenAI compatible, which means you can use the OpenAI client library to interact with the model.

```python
from openai import OpenAI
import os

client = OpenAI(
    api_key=os.environ['BASETEN_API_KEY'],
    base_url="https://model-xxxxxx.api.baseten.co/environments/production/sync/v1"
)

# Default completion
response_completion = client.completions.create(
    model="not_required",
    prompt="Q: Tell me everything about Baseten.co! A:",
    temperature=0.3,
    max_tokens=100,
)

# Chat completion
response_chat = client.chat.completions.create(
    model="not_required",  # the model name is ignored; routing is done by URL
    messages=[
        {"role": "user", "content": "Tell me everything about Baseten.co!"}
    ],
    temperature=0.3,
    max_tokens=100,
)

# Structured output
from pydantic import BaseModel

class CalendarEvent(BaseModel):
    name: str
    date: str
    participants: list[str]

completion = client.beta.chat.completions.parse(
    model="not_required",
    messages=[
        {"role": "system", "content": "Extract the event information."},
        {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."},
    ],
    response_format=CalendarEvent,
)

event = completion.choices[0].message.parsed

# If your model supports tool calling, you can use the following example:
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get current temperature for a given location.",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "City and country e.g. Bogotá, Colombia"
                }
            },
            "required": [
                "location"
            ],
            "additionalProperties": False
        },
        "strict": True
    }
}]

completion = client.chat.completions.create(
    model="not_required",
    messages=[{"role": "user", "content": "What is the weather like in Paris today?"}],
    tools=tools
)

print(completion.choices[0].message.tool_calls)
```
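
Since the default `example_model_input` in the config below sets `stream: true`, here is a sketch of streaming with the same OpenAI client, reusing the `client` defined above:

```python
# Streaming chat completion: tokens arrive incrementally as they are generated.
stream = client.chat.completions.create(
    model="not_required",
    messages=[{"role": "user", "content": "Tell me everything you know about optimized inference."}],
    temperature=0.5,
    max_tokens=512,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="", flush=True)
```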


## Config.yaml
By default, the following configuration is used for this deployment. It uses `quantization_type: weights_int4`. Quantization is optional; remove the `quantization_type` field or set it to `no_quant` to run in float16/bfloat16 (see the sketch after the config).

```yaml
build_commands: []
environment_variables: {}
external_package_dirs: []
model_metadata:
  example_model_input:
    max_tokens: 512
    messages:
    - content: Tell me everything you know about optimized inference.
      role: user
    stream: true
    temperature: 0.5
  tags:
  - openai-compatible
model_name: Briton-qwen-qwen2-57b-a14b-moe-int4-truss-example
python_version: py39
requirements: []
resources:
  accelerator: A100
  cpu: '1'
  memory: 10Gi
  use_gpu: true
secrets: {}
system_packages: []
trt_llm:
  build:
    base_model: llama
    checkpoint_repository:
      repo: Qwen/Qwen2-57B-A14B-Instruct
      revision: main
      source: HF
    max_seq_len: 32768
    num_builder_gpus: 4
    quantization_config:
      calib_max_seq_length: 4096
      calib_size: 3072
    quantization_type: weights_int4
    tensor_parallel_count: 1
  runtime:
    enable_chunked_context: true
```
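
As noted above, quantization can be turned off entirely. A minimal sketch of the changed fields, assuming the rest of the config stays the same:

```yaml
# Sketch: running in float16/bfloat16 instead of int4 weight-only quantization.
trt_llm:
  build:
    # ...same fields as above, minus quantization_config...
    quantization_type: no_quant  # or simply remove this field
```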

## Support
If you have any questions or need assistance, please open an issue in this repository or contact our support team.