1414
1515from __future__ import annotations
1616
17- from typing import Any , List
17+ from typing import Any , List , Literal
1818
19- import timm
2019import torch
2120
2221# Ruff complains when we don't import functional as f, but common practice is to import it as F
2322import torch .nn .functional as F # noqa: N812
2423from torch import Tensor
2524from torchvision import transforms
25+ from torchvision .transforms .functional import to_pil_image
2626
2727from pruna .engine .utils import set_to_best_available_device
2828from pruna .evaluation .metrics .metric_stateful import StatefulMetric
3333
3434DINO_SCORE = "dino_score"
3535
# Classic DINO evaluation preprocessing: bicubic resize to 256, center crop to
# 224, then ImageNet mean/std normalization. There is deliberately no ToTensor
# step, so inputs are assumed to already be float image tensors — TODO confirm
# against the tensors passed into update().
DINO_PREPROCESS = transforms.Compose(
    [
        transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
)
43+
3644
3745@MetricRegistry .register (DINO_SCORE )
3846class DinoScore (StatefulMetric ):
@@ -41,49 +49,117 @@ class DinoScore(StatefulMetric):
4149
4250 A similarity metric based on DINO (self-distillation with no labels),
4351 a self-supervised vision transformer trained to learn high-level image representations without annotations.
44- DinoScore compares the embeddings of generated and reference images in this representation space,
52+ DinoScore compares the [CLS] token embeddings of generated and reference images in this representation space,
4553 producing a value where higher scores indicate that the generated images preserve more of the semantic content of the
4654 reference images.
4755
48- Reference
56+ Supports DINO (v1), DINOv2, and DINOv3 backbones. DINOv3 uses Hugging Face Transformers
57+ (facebook/dinov3-*) with weights on Hugging Face Hub. Requires transformers>=4.56.0.
58+ DINOv3 models are gated; accept the model at huggingface.co before first use.
59+
60+ References
4961 ----------
50- https://github.com/facebookresearch/dino
51- https://arxiv.org/abs/2104.14294
62+ DINO: https://github.com/facebookresearch/dino, https://arxiv.org/abs/2104.14294
63+ DINOv2: https://github.com/facebookresearch/dinov2
64+ DINOv3: https://github.com/facebookresearch/dinov3
5265
5366 Parameters
5467 ----------
5568 device : str | torch.device | None
5669 The device to use for the metric.
70+ model : str
71+ Backbone variant. "dino" uses timm vit_small_patch16_224.dino (DINO v1).
72+ "dinov2_*" uses torch.hub facebookresearch/dinov2. "dinov3_*" uses
73+ Hugging Face facebook/dinov3-* (ViT and ConvNeXt).
5774 call_type : str
5875 The call type to use for the metric.
5976 """
6077
78+ DINOV3_HF_MODELS : dict [str , str ] = {
79+ "dinov3_vits16" : "facebook/dinov3-vits16-pretrain-lvd1689m" ,
80+ "dinov3_vits16plus" : "facebook/dinov3-vits16plus-pretrain-lvd1689m" ,
81+ "dinov3_vitb16" : "facebook/dinov3-vitb16-pretrain-lvd1689m" ,
82+ "dinov3_vitl16" : "facebook/dinov3-vitl16-pretrain-lvd1689m" ,
83+ "dinov3_vith16plus" : "facebook/dinov3-vith16plus-pretrain-lvd1689m" ,
84+ "dinov3_vit7b16" : "facebook/dinov3-vit7b16-pretrain-lvd1689m" ,
85+ "dinov3_convnext_tiny" : "facebook/dinov3-convnext-tiny-pretrain-lvd1689m" ,
86+ "dinov3_convnext_small" : "facebook/dinov3-convnext-small-pretrain-lvd1689m" ,
87+ "dinov3_convnext_base" : "facebook/dinov3-convnext-base-pretrain-lvd1689m" ,
88+ "dinov3_convnext_large" : "facebook/dinov3-convnext-large-pretrain-lvd1689m" ,
89+ "dinov3_vitl16_sat" : "facebook/dinov3-vitl16-pretrain-sat493m" ,
90+ "dinov3_vit7b16_sat" : "facebook/dinov3-vit7b16-pretrain-sat493m" ,
91+ }
92+
6193 similarities : List [Tensor ]
6294 metric_name : str = DINO_SCORE
6395 higher_is_better : bool = True
64- runs_on : List [str ] = ["cuda" , "cpu" ]
96+ runs_on : List [str ] = ["cuda" , "cpu" , "mps" ]
6597 default_call_type : str = "gt_y"
6698
67- def __init__ (self , device : str | torch .device | None = None , call_type : str = SINGLE ):
68- super ().__init__ ()
99+ def __init__ (
100+ self ,
101+ device : str | torch .device | None = None ,
102+ model : str = "dino" ,
103+ call_type : str = SINGLE ,
104+ ):
105+ super ().__init__ (device = device )
69106 self .device = set_to_best_available_device (device )
70107 if device is not None and not any (self .device .startswith (prefix ) for prefix in self .runs_on ):
71108 pruna_logger .error (f"DinoScore: device { device } not supported. Supported devices: { self .runs_on } " )
72109 raise
73110 self .call_type = get_call_type_for_single_metric (call_type , self .default_call_type )
74- # Load the DINO ViT-S/16 model once
75- self .model = timm .create_model ("vit_small_patch16_224.dino" , pretrained = True )
111+ self .model_name = model
112+ loaded = self ._load_model (model )
113+ if isinstance (loaded , tuple ):
114+ self .model , self ._hf_processor = loaded
115+ self .processor = None
116+ else :
117+ self .model = loaded
118+ self ._hf_processor = None
119+ self .processor = DINO_PREPROCESS
76120 self .model .eval ().to (self .device )
77- # Add internal state to accumulate similarities
78121 self .add_state ("similarities" , default = [])
79- self .processor = transforms .Compose (
80- [
81- transforms .Resize (256 , interpolation = transforms .InterpolationMode .BICUBIC ),
82- transforms .CenterCrop (224 ),
83- transforms .Normalize ((0.485 , 0.456 , 0.406 ), (0.229 , 0.224 , 0.225 )),
84- ]
122+
123+ def _load_model (
124+ self ,
125+ model : str ,
126+ ) -> torch .nn .Module | tuple [torch .nn .Module , object ]:
127+ if model == "dino" :
128+ import timm
129+ return timm .create_model ("vit_small_patch16_224.dino" , pretrained = True )
130+ if model .startswith ("dinov2_" ):
131+ return torch .hub .load ("facebookresearch/dinov2" , model )
132+ if model in self .DINOV3_HF_MODELS :
133+ from transformers import AutoImageProcessor , AutoModel
134+ hf_id = self .DINOV3_HF_MODELS [model ]
135+ processor = AutoImageProcessor .from_pretrained (hf_id )
136+ backbone = AutoModel .from_pretrained (hf_id )
137+ return backbone , processor
138+ raise ValueError (
139+ f"Unsupported model: { model } . "
140+ f"DINOv3 options: { list (self .DINOV3_HF_MODELS .keys ())} "
85141 )
86142
143+ def _get_embeddings (self , x : Tensor ) -> Tensor :
144+ if self .model_name == "dino" :
145+ features = self .model .forward_features (x )
146+ return features [:, 0 ]
147+ if self .model_name .startswith ("dinov2_" ):
148+ out = self .model .forward_features (x )
149+ return out ["x_norm_clstoken" ]
150+ features = self .model .forward_features (x )
151+ if isinstance (features , dict ):
152+ return features ["x_norm_clstoken" ]
153+ return features [:, 0 ]
154+
155+ def _get_embeddings_hf (self , x : Tensor ) -> Tensor :
156+ images = [to_pil_image (x [i ]) for i in range (x .shape [0 ])]
157+ inputs = self ._hf_processor (images = images , return_tensors = "pt" )
158+ pixel_values = inputs ["pixel_values" ].to (self .device )
159+ with torch .no_grad ():
160+ outputs = self .model (pixel_values )
161+ return outputs .pooler_output
162+
87163 @torch .no_grad ()
88164 def update (self , x : List [Any ] | Tensor , gt : Tensor , outputs : torch .Tensor ) -> None :
89165 """
@@ -100,13 +176,14 @@ def update(self, x: List[Any] | Tensor, gt: Tensor, outputs: torch.Tensor) -> No
100176 """
101177 metric_inputs = metric_data_processor (x , gt , outputs , self .call_type )
102178 inputs , preds = metric_inputs
103- inputs = self .processor (inputs )
104- preds = self .processor (preds )
105- # Extract embeddings ([CLS] token)
106- emb_x = self .model .forward_features (inputs )
107- emb_y = self .model .forward_features (preds )
108-
109- # Normalize embeddings
179+ if self ._hf_processor is not None :
180+ emb_x = self ._get_embeddings_hf (inputs )
181+ emb_y = self ._get_embeddings_hf (preds )
182+ else :
183+ inputs = self .processor (inputs )
184+ preds = self .processor (preds )
185+ emb_x = self ._get_embeddings (inputs )
186+ emb_y = self ._get_embeddings (preds )
110187 emb_x = F .normalize (emb_x , dim = 1 )
111188 emb_y = F .normalize (emb_y , dim = 1 )
112189
0 commit comments