Compress VLM model components to int8_sym instead of int8_asym (#1002)

nikita-savelyevv · web-flow · commit c7d6227f87ed · 2024-11-18T14:19:04.000+04:00
* Compress VLM model components to int8_sym instead of int8_asym

* Tweak references

* Update reference values for llava, llava_next, minicpmv and nanollava in tests/openvino/utils_tests.py
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
@@ -437,7 +437,7 @@ def _quantize_ovbasemodel(
                     sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts
                     sub_models = [getattr(self.model, f"{name}_model") for name in sub_model_names]
                     for sub_model in sub_models:
-                        _weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=False))
+                        _weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=True))
                     self.model.clear_requests()
                 else:
                     _weight_only_quantization(self.model.model, quantization_config, calibration_dataset)
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
@@ -182,10 +182,10 @@
     "open-clip": (20, 28),
     "stable-diffusion-3": (66, 42, 58, 30),
     "flux": (56, 24, 28, 64),
-    "llava": (30, 18, 2),
-    "llava_next": (30, 18, 2),
-    "minicpmv": (30, 52, 2, 12),
-    "nanollava": (30, 30, 2),
+    "llava": (30, 9, 1),
+    "llava_next": (30, 9, 1),
+    "minicpmv": (30, 26, 1, 6),
+    "nanollava": (30, 15, 1),
 }
 
 TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"