Turn off the use_fp8_context_fmha plugin for all fp8 Qwen models

philipkiely-baseten · philipkiely-baseten · commit 87551df0695c · 2025-03-06T15:45:32.000-08:00
diff --git a/qwen/engine-qwen-2-5-14b-coder-instruct/config.yaml b/qwen/engine-qwen-2-5-14b-coder-instruct/config.yaml
@@ -42,7 +42,7 @@ trt_llm:
     tensor_parallel_count: 1
     plugin_configuration:
       use_paged_context_fmha: true
-      use_fp8_context_fmha: true
+      use_fp8_context_fmha: false
       paged_kv_cache: true
   runtime:
     batch_scheduler_policy: max_utilization
diff --git a/qwen/engine-qwen-2-5-14b-instruct/config.yaml b/qwen/engine-qwen-2-5-14b-instruct/config.yaml
@@ -36,7 +36,7 @@ trt_llm:
     tensor_parallel_count: 1
     plugin_configuration:
       use_paged_context_fmha: true
-      use_fp8_context_fmha: true
+      use_fp8_context_fmha: false
       paged_kv_cache: true
   runtime:
     batch_scheduler_policy: max_utilization
diff --git a/qwen/engine-qwen-2-5-32b-coder-instruct/config.yaml b/qwen/engine-qwen-2-5-32b-coder-instruct/config.yaml
@@ -42,7 +42,7 @@ trt_llm:
     tensor_parallel_count: 1
     plugin_configuration:
       use_paged_context_fmha: true
-      use_fp8_context_fmha: true
+      use_fp8_context_fmha: false
       paged_kv_cache: true
   runtime:
     batch_scheduler_policy: max_utilization
diff --git a/qwen/engine-qwen-2-5-32b-instruct/config.yaml b/qwen/engine-qwen-2-5-32b-instruct/config.yaml
@@ -42,7 +42,7 @@ trt_llm:
     tensor_parallel_count: 1
     plugin_configuration:
       use_paged_context_fmha: true
-      use_fp8_context_fmha: true
+      use_fp8_context_fmha: false
       paged_kv_cache: true
   runtime:
     batch_scheduler_policy: max_utilization
diff --git a/qwen/engine-qwen-2-5-72b-instruct/config.yaml b/qwen/engine-qwen-2-5-72b-instruct/config.yaml
@@ -42,7 +42,7 @@ trt_llm:
     tensor_parallel_count: 2
     plugin_configuration:
       use_paged_context_fmha: true
-      use_fp8_context_fmha: true
+      use_fp8_context_fmha: false
       paged_kv_cache: true
   runtime:
     batch_scheduler_policy: max_utilization
diff --git a/qwen/engine-qwen-2-5-72b-math-instruct/config.yaml b/qwen/engine-qwen-2-5-72b-math-instruct/config.yaml
@@ -42,7 +42,7 @@ trt_llm:
     tensor_parallel_count: 2
     plugin_configuration:
       use_paged_context_fmha: true
-      use_fp8_context_fmha: true
+      use_fp8_context_fmha: false
       paged_kv_cache: true
   runtime:
     batch_scheduler_policy: max_utilization