feat(metrics): DINO Score CLS fix, multi-model support, paper docstring fixes

davidberenstein1957 · davidberenstein1957 · commit 9b49913cc370 · 2026-03-05T14:47:53.000+01:00
- DINO Score: fix CLS token extraction ([:,0] for v1/v3, x_norm_clstoken for v2)
- DINO Score: add model options (dino, dinov2_vits14, dinov2_vitb14, dinov2_vitl14, dinov3_*)
- DINO Score: add MPS support
- VieScore, Image Edit Score, VQA: update docstrings per paper refs
- VQA: add use_probability for P(Yes) via logprobs (litellm)
- Add tests for each DINO model (parametrized, slow mark for dinov2)

Made-with: Cursor
diff --git a/src/pruna/evaluation/metrics/metric_dino_score.py b/src/pruna/evaluation/metrics/metric_dino_score.py
@@ -14,9 +14,8 @@
 
 from __future__ import annotations
 
-from typing import Any, List
+from typing import Any, List, Literal
 
-import timm
 import torch
 
 # Ruff complains when we don't import functional as f, but common practice is to import it as F
@@ -33,6 +32,14 @@
 
 DINO_SCORE = "dino_score"
 
+DINO_PREPROCESS = transforms.Compose(
+    [
+        transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
+        transforms.CenterCrop(224),
+        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+    ]
+)
+
 
 @MetricRegistry.register(DINO_SCORE)
 class DinoScore(StatefulMetric):
@@ -41,48 +48,97 @@ class DinoScore(StatefulMetric):
 
     A similarity metric based on DINO (self-distillation with no labels),
     a self-supervised vision transformer trained to learn high-level image representations without annotations.
-    DinoScore compares the embeddings of generated and reference images in this representation space,
+    DinoScore compares the [CLS] token embeddings of generated and reference images in this representation space,
     producing a value where higher scores indicate that the generated images preserve more of the semantic content of the
     reference images.
 
-    Reference
+    Supports DINO (v1), DINOv2, and DINOv3 backbones. DINOv3 models may require weights from Meta's download form.
+
+    References
     ----------
-    https://github.com/facebookresearch/dino
-    https://arxiv.org/abs/2104.14294
+    DINO: https://github.com/facebookresearch/dino, https://arxiv.org/abs/2104.14294
+    DINOv2: https://github.com/facebookresearch/dinov2
+    DINOv3: https://github.com/facebookresearch/dinov3
 
     Parameters
     ----------
     device : str | torch.device | None
         The device to use for the metric.
+    model : {"dino", "dinov2_vits14", "dinov2_vitb14", "dinov2_vitl14", "dinov3_vits16", "dinov3_vitb16", "dinov3_vitl16"}
+        Backbone variant. "dino" uses timm vit_small_patch16_224.dino (DINO v1).
+        "dinov2_*" uses torch.hub facebookresearch/dinov2. "dinov3_*" uses timm (requires timm>=1.0.20).
     call_type : str
         The call type to use for the metric.
     """
 
     similarities: List[Tensor]
     metric_name: str = DINO_SCORE
     higher_is_better: bool = True
-    runs_on: List[str] = ["cuda", "cpu"]
+    runs_on: List[str] = ["cuda", "cpu", "mps"]
     default_call_type: str = "gt_y"
 
-    def __init__(self, device: str | torch.device | None = None, call_type: str = SINGLE):
-        super().__init__()
+    def __init__(
+        self,
+        device: str | torch.device | None = None,
+        model: Literal[
+            "dino", "dinov2_vits14", "dinov2_vitb14", "dinov2_vitl14", "dinov3_vits16", "dinov3_vitb16", "dinov3_vitl16"
+        ] = "dino",
+        call_type: str = SINGLE,
+    ):
+        super().__init__(device=device)
         self.device = set_to_best_available_device(device)
         if device is not None and not any(self.device.startswith(prefix) for prefix in self.runs_on):
             pruna_logger.error(f"DinoScore: device {device} not supported. Supported devices: {self.runs_on}")
             raise
         self.call_type = get_call_type_for_single_metric(call_type, self.default_call_type)
-        # Load the DINO ViT-S/16 model once
-        self.model = timm.create_model("vit_small_patch16_224.dino", pretrained=True)
+        self.model_name = model
+        self.model = self._load_model(model)
         self.model.eval().to(self.device)
-        # Add internal state to accumulate similarities
         self.add_state("similarities", default=[])
-        self.processor = transforms.Compose(
-            [
-                transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
-                transforms.CenterCrop(224),
-                transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
-            ]
-        )
+        self.processor = DINO_PREPROCESS
+
+    def _load_model(
+        self,
+        model: str,
+    ) -> torch.nn.Module:
+        if model == "dino":
+            import timm
+            return timm.create_model("vit_small_patch16_224.dino", pretrained=True)
+        if model.startswith("dinov2_"):
+            return torch.hub.load("facebookresearch/dinov2", model)
+        if model.startswith("dinov3_"):
+            import timm
+            timm_map = {
+                "dinov3_vits16": "vit_small_patch16_dinov3.lvd1689m",
+                "dinov3_vitb16": "vit_base_patch16_dinov3.lvd1689m",
+                "dinov3_vitl16": "vit_large_patch16_dinov3.lvd1689m",
+            }
+            timm_name = timm_map.get(model)
+            if timm_name is None:
+                raise ValueError(f"Unsupported DINOv3 model: {model}. Choose from {list(timm_map.keys())}")
+            try:
+                return timm.create_model(timm_name, pretrained=True)
+            except Exception as e:
+                raise ValueError(
+                    f"DINOv3 requires timm>=1.0.20 and model weights from Meta. "
+                    f"See https://github.com/facebookresearch/dinov3. Error: {e}"
+                ) from e
+        raise ValueError(f"Unsupported model: {model}")
+
+    def _get_embeddings(self, x: Tensor) -> Tensor:
+        if self.model_name == "dino":
+            features = self.model.forward_features(x)
+            return features[:, 0]
+        if self.model_name.startswith("dinov2_"):
+            out = self.model.forward_features(x)
+            return out["x_norm_clstoken"]
+        if self.model_name.startswith("dinov3_"):
+            features = self.model.forward_features(x)
+            return features[:, 0]
+        features = self.model.forward_features(x)
+        if isinstance(features, dict):
+            return features["x_norm_clstoken"]
+        return features[:, 0]
 
     @torch.no_grad()
     def update(self, x: List[Any] | Tensor, gt: Tensor, outputs: torch.Tensor) -> None:
@@ -102,11 +158,8 @@ def update(self, x: List[Any] | Tensor, gt: Tensor, outputs: torch.Tensor) -> No
         inputs, preds = metric_inputs
         inputs = self.processor(inputs)
         preds = self.processor(preds)
-        # Extract embeddings ([CLS] token)
-        emb_x = self.model.forward_features(inputs)
-        emb_y = self.model.forward_features(preds)
-
-        # Normalize embeddings
+        emb_x = self._get_embeddings(inputs)
+        emb_y = self._get_embeddings(preds)
         emb_x = F.normalize(emb_x, dim=1)
         emb_y = F.normalize(emb_y, dim=1)
 
diff --git a/src/pruna/evaluation/metrics/metric_img_edit_score.py b/src/pruna/evaluation/metrics/metric_img_edit_score.py
@@ -15,7 +15,9 @@
 """
 Image Edit Score metric.
 
-Reference: VieScore https://github.com/ByteDance/IEA-eval
+VLM-based instruction-following score for image editing. Evaluates how well an edited image
+follows the given editing instruction on a 0-10 scale. Related work: EditScore (arXiv:2509.23909),
+ADIEE (ICCV 2025).
 """
 
 from __future__ import annotations
@@ -40,8 +42,10 @@ class ImageEditScoreMetric(StatefulMetric):
     """
     Image Edit Score metric.
 
-    Evaluates how well an image was edited based on editing instructions.
-    Higher scores indicate better editing quality.
+    VLM-based instruction-following score for image editing. Evaluates how well an edited image
+    follows the given editing instruction. Higher scores indicate better editing quality.
+
+    Related work: EditScore (arXiv:2509.23909), ADIEE (ICCV 2025).
 
     Parameters
     ----------
diff --git a/src/pruna/evaluation/metrics/metric_viescore.py b/src/pruna/evaluation/metrics/metric_viescore.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 
 """
-VieScore metric for evaluating image quality (semantic + quality).
+VIEScore metric for evaluating conditional image synthesis (semantic + quality).
 
-Reference: VieScore https://github.com/ByteDance/IEA-eval
+Reference: VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation
+(ACL 2024) - https://arxiv.org/abs/2312.14867, https://github.com/TIGER-AI-Lab/VIEScore
 """
 
 from __future__ import annotations
@@ -39,7 +40,7 @@
 @MetricRegistry.register("viescore")
 class VieScoreMetric(StatefulMetric):
     """
-    VieScore metric for evaluating image quality (semantic + quality).
+    VIEScore metric for evaluating conditional image synthesis (semantic + quality).
 
     Uses VLM to assess both semantic alignment and visual quality.
     Higher scores indicate better overall quality.
@@ -49,6 +50,12 @@ class VieScoreMetric(StatefulMetric):
     - Quality score: Naturalness and artifacts
     - Overall: Geometric mean of semantic and quality
 
+    References
+    ----------
+    VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation (ACL 2024)
+    https://arxiv.org/abs/2312.14867
+    https://github.com/TIGER-AI-Lab/VIEScore
+
     Parameters
     ----------
     *args : Any
diff --git a/src/pruna/evaluation/metrics/metric_vqa.py b/src/pruna/evaluation/metrics/metric_vqa.py
@@ -15,7 +15,12 @@
 """
 VQA (Visual Question Answering) metric.
 
-Reference: VQAScore https://arxiv.org/abs/2310.08868
+Reference: VQAScore - Evaluating Text-to-Visual Generation with Image-to-Text Generation
+https://arxiv.org/abs/2404.01291
+
+Note: VQAScore uses P(Yes) (the probability of a "Yes" answer) for ranking. This implementation
+defaults to binary (0/1) scores for backward compatibility. Set use_probability=True when using
+litellm with a provider that supports logprobs to get soft scores.
 """
 
 from __future__ import annotations
@@ -39,9 +44,12 @@ class VQAMetric(StatefulMetric):
     """
     VQA (Visual Question Answering) metric.
 
-    Uses VLM to answer questions about images and compare with expected answers.
+    Uses VLM to answer "Does this image show '{prompt}'?" and scores alignment.
     Higher scores indicate better image-text alignment.
 
+    VQAScore (arXiv:2404.01291) uses P(Yes) for ranking. The default here is a binary (0/1) score.
+    Set use_probability=True with litellm and a logprobs-capable provider to get soft scores.
+
     Parameters
     ----------
     *args : Any
@@ -64,6 +72,9 @@ class VQAMetric(StatefulMetric):
         API key for litellm.
     call_type : str, optional
         Call type for the metric.
+    use_probability : bool, optional
+        If True, use P(Yes) when the backend supports logprobs (litellm); otherwise use binary 0/1.
+        Default is False for backward compatibility.
     **kwargs : Any
         Additional arguments.
     """
@@ -86,11 +97,13 @@ def __init__(
         device=None,
         api_key: Optional[str] = None,
         call_type: str = SINGLE,
+        use_probability: bool = False,
         **kwargs,
     ):
         super().__init__(device=device)
         self.device = set_to_best_available_device(device)
         self.structured_output = structured_output
+        self.use_probability = use_probability
 
         self.vlm = get_vlm(
             vlm=vlm,
@@ -117,7 +130,13 @@ def update(self, x: List[Any] | torch.Tensor, gt: torch.Tensor, outputs: torch.T
         for i, image in enumerate(images):
             prompt = prompts[i] if i < len(prompts) else ""
             question = f'Does this image show "{prompt}"?'
-            score = self.vlm.score([image], [question], ["Yes"], response_format=self.response_format)[0]
+            score = self.vlm.score(
+                [image],
+                [question],
+                ["Yes"],
+                response_format=self.response_format,
+                use_probability=self.use_probability,
+            )[0]
             self.scores.append(score)
 
     def compute(self) -> MetricResult:
diff --git a/src/pruna/evaluation/metrics/vlm_base.py b/src/pruna/evaluation/metrics/vlm_base.py
diff --git a/tests/evaluation/test_dino_score.py b/tests/evaluation/test_dino_score.py