
Commit 3130e90

[OpenVINO] Add model inference check to weight-only and pipeline quantization testing (#1470)
* Add model inference check to weight-only and pipeline quantization testing
* Fix tests
* Fix tests 2
* Fix nanollava; add generation config
* Add generation config for seq2seqlm
* Merge seq2seq conditions back
* Apply suggestion
1 parent 828fb1f commit 3130e90
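
In short, the commit replaces the inline per-architecture inference checks in test_ov_model_static_quantization_with_auto_dataset with a shared check_model_inference helper (added at the bottom of the diff) and calls it after each quantization round-trip. A minimal sketch of the resulting test pattern, assuming a causal LM; the tiny model id and the 4-bit config here are illustrative, not taken from this commit:

    import tempfile

    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

    model_id = "hf-internal-testing/tiny-random-gpt2"  # illustrative test checkpoint
    quantization_config = OVWeightQuantizationConfig(bits=4)  # illustrative weight-only config

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Quantize on load, round-trip through disk, then smoke-test inference,
        # as the updated tests now do with the shared helper.
        ov_model = OVModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
        ov_model.save_pretrained(tmp_dir)
        ov_model = OVModelForCausalLM.from_pretrained(tmp_dir)
        check_model_inference(ov_model, model_id, trust_remote_code=False)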

tests/openvino/test_quantization.py

Lines changed: 73 additions & 43 deletions
@@ -29,12 +29,15 @@
 import pytest
 import numpy as np
 import torch
+from PIL import Image
 from parameterized import parameterized
 import nncf
 from transformers import (
     AutoModelForQuestionAnswering,
     AutoTokenizer,
     AutoProcessor,
+    AutoConfig,
+    GenerationConfig,
 )
 from transformers.testing_utils import slow
 from transformers.utils.quantization_config import QuantizationMethod
@@ -76,6 +79,7 @@
     _DEFAULT_4BIT_WQ_CONFIG,
     _quantization_config_from_dict,
 )
+from optimum.intel.openvino.modeling_visual_language import _OVNanoLlavaForCausalLM
 from optimum.intel.openvino.utils import TemporaryDirectory
 from copy import deepcopy
 
@@ -574,50 +578,11 @@ def test_ov_model_static_quantization_with_auto_dataset(
             ov_model = model_cls.from_pretrained(model_id, quantization_config=quantization_config)
             ov_model.save_pretrained(tmp_dir)
 
-            if model_cls in [OVModelForSpeechSeq2Seq, OVModelForSeq2SeqLM]:
-                if ov_model.decoder_with_past is None:
-                    expected_fake_nodes_per_model.pop("decoder_with_past", None)
-                    expected_num_weight_nodes_per_model.pop("decoder_with_past", None)
-
-                if model_cls == OVModelForSpeechSeq2Seq:
-                    input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32)
-                    ov_model.generate(input_features)
-                else:
-                    tokenizer = AutoTokenizer.from_pretrained(model_id)
-                    inputs = tokenizer("This is a sample <mask>", return_tensors="pt")
-                    ov_model.generate(**inputs)
-            elif model_cls in (OVModelForCausalLM, OVModelForFeatureExtraction, OVModelForMaskedLM):
-                tokenizer = AutoTokenizer.from_pretrained(model_id)
-                if tokenizer.pad_token is None:
-                    tokenizer.pad_token = tokenizer.eos_token
-                tokens = tokenizer("This is a sample <mask>", return_tensors="pt")
-                ov_model(**tokens)
-            elif model_cls in (
-                OVStableDiffusionPipeline,
-                OVStableDiffusionXLPipeline,
-                OVLatentConsistencyModelPipeline,
-            ):
-                ov_model(prompt="A text-to-image prompt")
-            elif model_cls == OVSentenceTransformer:
-                ov_model.encode(["This is a sample input"])
-            elif model_cls == OVModelForZeroShotImageClassification:
-                processor = AutoProcessor.from_pretrained(model_id)
-                image = np.random.rand(224, 224, 3).astype(np.uint8)
-                inputs = processor(text=["This is a sample text"], images=image, return_tensors="pt")
-                ov_model(**inputs)
-            elif model_cls == OVModelForVisualCausalLM:
-                processor = AutoProcessor.from_pretrained(model_id)
-                image = np.random.rand(224, 224, 3).astype(np.uint8)
-                inputs = ov_model.preprocess_inputs(image=image, text="This is a sample text", processor=processor)
-                ov_model(**inputs)
-            elif model_cls == OVSamModel:
-                processor = AutoProcessor.from_pretrained(model_id)
-                image = np.random.rand(224, 224, 3).astype(np.uint8)
-                inputs = processor(image, input_points=[[[0, 0]]], return_tensors="pt")
-                ov_model(**inputs)
-            else:
-                raise Exception("Unexpected model class.")
+            check_model_inference(ov_model, model_id, trust_remote_code=False)
 
+            if model_cls in [OVModelForSpeechSeq2Seq, OVModelForSeq2SeqLM] and ov_model.decoder_with_past is None:
+                expected_fake_nodes_per_model.pop("decoder_with_past", None)
+                expected_num_weight_nodes_per_model.pop("decoder_with_past", None)
             check_compression_state_per_model(
                 self,
                 ov_model.ov_submodels,
@@ -1311,6 +1276,9 @@ def test_ovmodel_4bit_auto_compression_with_config(
             check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model)
 
             model.save_pretrained(tmp_dir)
+            model = model_cls.from_pretrained(tmp_dir, trust_remote_code=trust_remote_code)
+            check_model_inference(model, model_id, trust_remote_code)
+
             # At the moment the first model in the list is the only one we apply data-aware compression to
             wc_rt_info = next(iter(submodels.values())).get_rt_info()["nncf"]["weight_compression"]
             self.assertEqual(quantization_config.quant_method.lower() == "awq", wc_rt_info["awq"].value == "True")
@@ -1728,6 +1696,7 @@ def eval_expression_if_possible(expression):
             model.save_pretrained(tmp_dir)
 
             model = model_cls.from_pretrained(tmp_dir, trust_remote_code=trust_remote_code)
+            check_model_inference(model, model_id, trust_remote_code)
             check_compression_state_per_model(
                 self, model.ov_submodels, expected_num_weight_nodes_per_model, expected_fake_nodes_per_model
             )
@@ -2260,3 +2229,64 @@ def check_optimization_not_applicable_to_optimized_model(model, quantization_config):
         match="Cannot apply optimization to the model because it was already optimized with the following config",
     ):
         quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
+
+
+def check_model_inference(ov_model, model_id, trust_remote_code):
+    if isinstance(ov_model, (OVModelForSpeechSeq2Seq, OVModelForSeq2SeqLM)):
+        gen_config = GenerationConfig(
+            max_new_tokens=10,
+            min_new_tokens=10,
+            num_beams=2,
+            do_sample=False,
+            eos_token_id=None,
+        )
+        if isinstance(ov_model, OVModelForSpeechSeq2Seq):
+            input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32)
+            generate_kwrgs = {}
+            if is_transformers_version(">=", "4.50"):
+                generate_kwrgs = {"use_model_defaults": False}
+            ov_model.generate(input_features, generation_config=gen_config, **generate_kwrgs)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+            inputs = tokenizer("This is a sample <mask>", return_tensors="pt")
+            ov_model.generate(**inputs, generation_config=gen_config)
+    elif isinstance(ov_model, (OVModelForCausalLM, OVModelForFeatureExtraction, OVModelForMaskedLM)):
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        tokens = tokenizer("This is a sample <mask>", return_tensors="pt")
+        ov_model(**tokens)
+    elif isinstance(
+        ov_model,
+        (
+            OVStableDiffusionPipeline,
+            OVStableDiffusion3Pipeline,
+            OVStableDiffusionXLPipeline,
+            OVLatentConsistencyModelPipeline,
+        ),
+    ):
+        ov_model(prompt="A text-to-image prompt")
+    elif isinstance(ov_model, OVSentenceTransformer):
+        ov_model.encode(["This is a sample input"])
+    elif isinstance(ov_model, OVModelForZeroShotImageClassification):
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+        image = np.random.rand(224, 224, 3).astype(np.uint8)
+        inputs = processor(text=["This is a sample text"], images=image, return_tensors="pt")
+        ov_model(**inputs)
+    elif isinstance(ov_model, OVModelForVisualCausalLM):
+        config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+        processor_id = config.mm_vision_tower if isinstance(ov_model, _OVNanoLlavaForCausalLM) else model_id
+        processor = AutoProcessor.from_pretrained(processor_id, trust_remote_code=trust_remote_code)
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+        image = Image.fromarray(np.random.rand(224, 224, 3).astype(np.uint8))
+        inputs = ov_model.preprocess_inputs(
+            image=image, text="This is a sample text", processor=processor, tokenizer=tokenizer, config=config
+        )
+        ov_model(**inputs)
+    elif isinstance(ov_model, OVSamModel):
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+        image = np.random.rand(224, 224, 3).astype(np.uint8)
+        inputs = processor(image, input_points=[[[0, 0]]], return_tensors="pt")
+        ov_model(**inputs)
+    else:
+        raise Exception("Unexpected model class.")
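
A note on the generation settings: for the seq2seq branches, check_model_inference pins a fixed-length deterministic decode, so the smoke test can neither hang on a model that never emits EOS nor flake across runs. The sketch below restates those values with editorial comments; use_model_defaults is only accepted by transformers >= 4.50, hence the version guard in the helper:

    from transformers import GenerationConfig

    gen_config = GenerationConfig(
        max_new_tokens=10,   # bound the decode...
        min_new_tokens=10,   # ...and fix its length exactly
        num_beams=2,         # exercise the beam-search path
        do_sample=False,     # deterministic across CI runs
        eos_token_id=None,   # never stop early on EOS
    )
    # On transformers >= 4.50, passing use_model_defaults=False prevents generation
    # defaults stored with the checkpoint from overriding this explicit config:
    # ov_model.generate(input_features, generation_config=gen_config, use_model_defaults=False)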
