Skip to content

Commit 209e12e

Browse files
feat(metrics): paper docstring fixes, VQA use_probability default, vlm docstrings
- VieScore: docstring arXiv:2312.14867, TIGER-AI-Lab/VIEScore. - Image Edit Score: docstring EditScore, ADIEE. - VQA: docstring arXiv:2404.01291, use_probability=True default. - vlm_base: full Parameters/Returns for score(), _score_with_logprobs. Made-with: Cursor
1 parent deab4b5 commit 209e12e

File tree

4 files changed

+116
-15
lines changed

4 files changed

+116
-15
lines changed

src/pruna/evaluation/metrics/metric_img_edit_score.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
"""
1616
Image Edit Score metric.
1717
18-
Reference: VieScore https://github.com/ByteDance/IEA-eval
18+
VLM-based instruction-following score for image editing. Evaluates how well an edited image
19+
follows the given editing instruction on a 0-10 scale. Related work: EditScore (arXiv:2509.23909),
20+
ADIEE (ICCV 2025).
1921
"""
2022

2123
from __future__ import annotations
@@ -40,8 +42,10 @@ class ImageEditScoreMetric(StatefulMetric):
4042
"""
4143
Image Edit Score metric.
4244
43-
Evaluates how well an image was edited based on editing instructions.
44-
Higher scores indicate better editing quality.
45+
VLM-based instruction-following score for image editing. Evaluates how well an edited image
46+
follows the given editing instruction. Higher scores indicate better editing quality.
47+
48+
Related work: EditScore (arXiv:2509.23909), ADIEE (ICCV 2025).
4549
4650
Parameters
4751
----------

src/pruna/evaluation/metrics/metric_viescore.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@
1313
# limitations under the License.
1414

1515
"""
16-
VieScore metric for evaluating image quality (semantic + quality).
16+
VIEScore metric for evaluating conditional image synthesis (semantic + quality).
1717
18-
Reference: VieScore https://github.com/ByteDance/IEA-eval
18+
Reference: VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation
19+
(ACL 2024) - https://arxiv.org/abs/2312.14867, https://github.com/TIGER-AI-Lab/VIEScore
1920
"""
2021

2122
from __future__ import annotations
@@ -39,7 +40,7 @@
3940
@MetricRegistry.register("viescore")
4041
class VieScoreMetric(StatefulMetric):
4142
"""
42-
VieScore metric for evaluating image quality (semantic + quality).
43+
VIEScore metric for evaluating conditional image synthesis (semantic + quality).
4344
4445
Uses VLM to assess both semantic alignment and visual quality.
4546
Higher scores indicate better overall quality.
@@ -49,6 +50,12 @@ class VieScoreMetric(StatefulMetric):
4950
- Quality score: Naturalness and artifacts
5051
- Overall: Geometric mean of semantic and quality
5152
53+
References
54+
----------
55+
VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation (ACL 2024)
56+
https://arxiv.org/abs/2312.14867
57+
https://github.com/TIGER-AI-Lab/VIEScore
58+
5259
Parameters
5360
----------
5461
*args : Any

src/pruna/evaluation/metrics/metric_vqa.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,12 @@
1515
"""
1616
VQA (Visual Question Answering) metric.
1717
18-
Reference: VQAScore https://arxiv.org/abs/2310.08868
18+
Reference: VQAScore - Evaluating Text-to-Visual Generation with Image-to-Text Generation
19+
https://arxiv.org/abs/2404.01291
20+
21+
Note: VQAScore uses P(Yes) (probability of "Yes" answer) for ranking. With litellm,
22+
use_probability=True (default) requests logprobs for soft scores when the provider supports it.
23+
Set use_probability=False for binary 0/1. TransformersVLM always uses binary.
1924
"""
2025

2126
from __future__ import annotations
@@ -39,9 +44,12 @@ class VQAMetric(StatefulMetric):
3944
"""
4045
VQA (Visual Question Answering) metric.
4146
42-
Uses VLM to answer questions about images and compare with expected answers.
47+
Uses VLM to answer "Does this image show '{prompt}'?" and scores alignment.
4348
Higher scores indicate better image-text alignment.
4449
50+
VQAScore (arXiv:2404.01291) uses P(Yes) for ranking. Default use_probability=True
51+
with litellm requests logprobs for soft scores when supported.
52+
4553
Parameters
4654
----------
4755
*args : Any
@@ -64,6 +72,9 @@ class VQAMetric(StatefulMetric):
6472
API key for litellm.
6573
call_type : str, optional
6674
Call type for the metric.
75+
use_probability : bool, optional
76+
If True, use P(Yes) when backend supports logprobs (litellm). Otherwise binary 0/1.
77+
Default is True for paper alignment.
6778
**kwargs : Any
6879
Additional arguments.
6980
"""
@@ -86,11 +97,13 @@ def __init__(
8697
device=None,
8798
api_key: Optional[str] = None,
8899
call_type: str = SINGLE,
100+
use_probability: bool = True,
89101
**kwargs,
90102
):
91103
super().__init__(device=device)
92104
self.device = set_to_best_available_device(device)
93105
self.structured_output = structured_output
106+
self.use_probability = use_probability
94107

95108
self.vlm = get_vlm(
96109
vlm=vlm,
@@ -117,7 +130,13 @@ def update(self, x: List[Any] | torch.Tensor, gt: torch.Tensor, outputs: torch.T
117130
for i, image in enumerate(images):
118131
prompt = prompts[i] if i < len(prompts) else ""
119132
question = f'Does this image show "{prompt}"?'
120-
score = self.vlm.score([image], [question], ["Yes"], response_format=self.response_format)[0]
133+
score = self.vlm.score(
134+
[image],
135+
[question],
136+
["Yes"],
137+
response_format=self.response_format,
138+
use_probability=self.use_probability,
139+
)[0]
121140
self.scores.append(score)
122141

123142
def compute(self) -> MetricResult:

src/pruna/evaluation/metrics/vlm_base.py

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828

2929
import base64
3030
import io
31+
import math
3132
import os
3233
from abc import ABC, abstractmethod
3334
from typing import Any, List, Literal, Optional, Type, TypeVar
@@ -129,6 +130,7 @@ def score(
129130
images: List[Image.Image],
130131
questions: List[str],
131132
answers: List[str],
133+
use_probability: bool = False,
132134
**kwargs: Any,
133135
) -> List[float]:
134136
"""
@@ -142,13 +144,15 @@ def score(
142144
List of questions.
143145
answers : List[str]
144146
List of expected answers.
147+
use_probability : bool, optional
148+
If True and supported, return P(expected answer) instead of binary 0/1.
145149
**kwargs : Any
146150
Additional arguments passed to the implementation.
147151
148152
Returns
149153
-------
150154
List[float]
151-
Scores for each image-question pair.
155+
Scores for each image-question pair (0-1, or probability when use_probability).
152156
"""
153157
pass
154158

@@ -253,11 +257,15 @@ def score(
253257
images: List[Image.Image],
254258
questions: List[str],
255259
answers: List[str],
260+
use_probability: bool = False,
256261
**kwargs: Any,
257262
) -> List[float]:
258263
"""
259264
Score how well answers match images for given questions.
260265
266+
When use_probability=True, requests logprobs from the API and returns P(expected).
267+
Falls back to binary 0/1 if logprobs not available.
268+
261269
Parameters
262270
----------
263271
images : List[Image.Image]
@@ -266,22 +274,80 @@ def score(
266274
List of questions.
267275
answers : List[str]
268276
List of expected answers.
277+
use_probability : bool, optional
278+
If True, return P(expected) from logprobs when available. Default is False.
269279
**kwargs : Any
270-
Additional arguments passed to generate.
280+
Additional arguments passed to litellm completion.
271281
272282
Returns
273283
-------
274284
List[float]
275-
Scores for each image-question pair.
285+
Scores for each image-question pair (0-1, or probability when use_probability).
276286
"""
277287
scores = []
278288
for image, question, answer in zip(images, questions, answers):
279289
prompt = f"{question} Please answer yes or no."
280-
response = self.generate([image], [prompt], **kwargs)[0].lower()
281-
score = 1.0 if answer.lower() in response else 0.0
290+
if use_probability:
291+
score = self._score_with_logprobs(image, prompt, answer, **kwargs)
292+
else:
293+
response = self.generate([image], [prompt], **kwargs)[0].lower()
294+
score = 1.0 if answer.lower() in response else 0.0
282295
scores.append(score)
283296
return scores
284297

298+
def _score_with_logprobs(self, image: Image.Image, prompt: str, expected: str, **kwargs: Any) -> float:
299+
"""
300+
Get P(expected) from logprobs when available.
301+
302+
Parameters
303+
----------
304+
image : Image.Image
305+
PIL Image to score.
306+
prompt : str
307+
Question prompt.
308+
expected : str
309+
Expected answer (e.g., "Yes").
310+
**kwargs : Any
311+
Additional arguments passed to litellm completion.
312+
313+
Returns
314+
-------
315+
float
316+
Probability of expected answer (0-1), or binary 0/1 on fallback.
317+
"""
318+
content = [
319+
{"type": "text", "text": prompt},
320+
{"type": "image_url", "image_url": {"url": self._image_to_data_url(image)}},
321+
]
322+
completion_kwargs = {
323+
"model": self.model_name,
324+
"messages": [{"role": "user", "content": content}],
325+
"api_key": self.api_key,
326+
"logprobs": True,
327+
"top_logprobs": 5,
328+
**self.extra_kwargs,
329+
**kwargs,
330+
}
331+
try:
332+
response = self._litellm.completion(**completion_kwargs)
333+
choice = response.choices[0]
334+
logprobs = getattr(choice, "logprobs", None) or getattr(choice.message, "logprobs", None)
335+
if logprobs and hasattr(logprobs, "content"):
336+
for tok in (logprobs.content or []):
337+
top = getattr(tok, "top_logprobs", None) or []
338+
for t in top:
339+
token_str = getattr(t, "token", "") or str(t).lower()
340+
if token_str and expected.lower() in token_str.lower():
341+
logprob = float(getattr(t, "logprob", -1e9) or -1e9)
342+
return min(1.0, max(0.0, math.exp(logprob)))
343+
content_str = (choice.message.content or "").lower()
344+
if expected.lower() in content_str:
345+
return 1.0
346+
return 0.0
347+
except Exception:
348+
response = self.generate([image], [prompt], **kwargs)[0].lower()
349+
return 1.0 if expected.lower() in response else 0.0
350+
285351
def _image_to_data_url(self, image: Image.Image) -> str:
286352
buffer = io.BytesIO()
287353
image.save(buffer, format="PNG")
@@ -458,11 +524,14 @@ def score(
458524
images: List[Image.Image],
459525
questions: List[str],
460526
answers: List[str],
527+
use_probability: bool = False,
461528
**kwargs: Any,
462529
) -> List[float]:
463530
"""
464531
Score how well answers match images for given questions.
465532
533+
use_probability is not supported for TransformersVLM; uses binary 0/1.
534+
466535
Parameters
467536
----------
468537
images : List[Image.Image]
@@ -471,13 +540,15 @@ def score(
471540
List of questions.
472541
answers : List[str]
473542
List of expected answers.
543+
use_probability : bool, optional
544+
Ignored; TransformersVLM always uses binary 0/1.
474545
**kwargs : Any
475546
Additional arguments passed to generate.
476547
477548
Returns
478549
-------
479550
List[float]
480-
Scores for each image-question pair.
551+
Scores for each image-question pair (0 or 1).
481552
"""
482553
scores = []
483554
for image, question, answer in zip(images, questions, answers):

0 commit comments

Comments (0)