LM Eval tests -- ignore vision tower for VL fp8 test (#1562)

brian-dellabetta · web-flow · commit ca00edd09e25 · 2025-06-17T15:10:13.000-04:00
SUMMARY: The current lm-eval test for vision-language models with the fp8_dynamic scheme includes the vision tower component of the model in the compression. As discussed with @anmarques and @eldarkurtic, this is generally not a good idea, and we want to err on the side of accuracy over improved runtime when the tradeoff exists. Excluding the vision tower from compression slightly decreases the measured accuracy in this case (from 0.8667 to 0.8333), but generally speaking compressing the vision tower will degrade model performance (accuracy), and we don't want to encourage users to do so. This PR updates the test to explicitly ignore the vision tower components. TEST PLAN: Ran the test a few times locally and reproduced the 0.8333 value each time. Confirmed the model size is now slightly larger with the vision tower excluded from compression. --------- Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
diff --git a/tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml b/tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml
@@ -0,0 +1,19 @@
+quant_stage:  # FP8 recipe: 8-bit float weights and input activations for "Linear" targets
+  quant_modifiers:
+    QuantizationModifier:  # `ignore` below lists lm_head plus vision-side module patterns
+      ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*", "re:model.visual.*"]
+      config_groups:
+        group_0:
+          weights:  # 8-bit float, symmetric, per-channel, not dynamic
+            num_bits: 8
+            type: "float"
+            symmetric: true
+            strategy: "channel"
+            dynamic: false
+          input_activations:  # 8-bit float, symmetric, per-token, dynamic
+            num_bits: 8
+            type: "float"
+            symmetric: true
+            strategy: "token"
+            dynamic: true
+          targets: ["Linear"]  # presumably matched against module class names -- confirm with the consumer
diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -2,6 +2,7 @@ cadence: weekly
 model: Qwen/Qwen2.5-VL-7B-Instruct
 model_class: Qwen2_5_VLForConditionalGeneration
 scheme: FP8_DYNAMIC
+recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml  # recipe whose ignore list covers the vision tower
 lmeval:
   model: "hf-multimodal"
   model_args:
@@ -13,5 +14,5 @@ lmeval:
   batch_size: 8
   # dense model achieves accuracy of 0.9 +/ 0.0557
   metrics:
-    acc,none: 0.8667
+    acc,none: 0.8333  # expected accuracy with the vision tower excluded from compression
     acc_stderr,none: 0.0557