132 changes: 132 additions & 0 deletions docs/source/en/model_doc/deimv2.md
@@ -0,0 +1,132 @@
<!--Copyright 2025 The HuggingFace Team. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. -->

This model was released in 2025 and added to Hugging Face Transformers in October 2025.

# DEIMv2
<div style="float: right;"> <div class="flex flex-wrap space-x-1"> <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white"> <img alt="Object Detection" src="https://img.shields.io/badge/Object%20Detection-0ea5e9?style=flat"> <img alt="AutoBackbone" src="https://img.shields.io/badge/AutoBackbone-16a34a?style=flat"> </div> </div>
## Overview

DEIMv2 is a real-time object detection architecture built on DINOv3 features. It introduces a Spatial Tuning Adapter (STA) that converts single-scale ViT features into a lightweight multi-scale pyramid, a simplified decoder, and an upgraded Dense one-to-one (O2O) matching strategy.

This integration uses the AutoBackbone API so DINO‑family backbones can be reused without re‑implementation in the detection head; the initial release targets DINOv3/ViT backbones, with tiny HGNetv2 variants planned as follow‑ups.
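As a minimal sketch of that reuse (hedged: the DINOv2 checkpoint below is only an example, and DEIMv2 wires its backbone internally), a DINO-family backbone can be loaded through `AutoBackbone` and queried for its feature maps:

```py
import torch
from transformers import AutoBackbone

# Load a DINO-family backbone through the AutoBackbone API.
# By default the last stage's feature map is returned.
backbone = AutoBackbone.from_pretrained("facebook/dinov2-base")

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    features = backbone(pixel_values)

# A tuple of (batch, channels, height, width) feature maps; DEIMv2's STA would
# expand this single scale into a multi-scale pyramid.
print([f.shape for f in features.feature_maps])
```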

> [!TIP]
> The smallest working example below shows how to run inference and obtain boxes, scores, and labels from post-processing.

<hfoptions id="usage">
<hfoption id="Pipeline">

```py
from PIL import Image
from transformers import pipeline

detector = pipeline(
    task="object-detection",
    model="your-org/deimv2-dinov3-base",
)
image = Image.open("path/to/your/image.jpg")
outputs = detector(image)
print(outputs[:3])
```

</hfoption>
<hfoption id="AutoModel">

```py
import requests
import torch
from PIL import Image

from transformers import Deimv2ForObjectDetection, Deimv2ImageProcessor

ckpt = "your-org/deimv2-dinov3-base"  # replace when a checkpoint is available
model = Deimv2ForObjectDetection.from_pretrained(ckpt)
processor = Deimv2ImageProcessor.from_pretrained(ckpt)

url = "https://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor.preprocess([image], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Pass the original (height, width) so boxes come back in absolute pixel coordinates.
results = processor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=[image.size[::-1]]
)
print(results)
```

</hfoption>
<hfoption id="transformers CLI">

```bash
echo -e "https://images.cocodataset.org/val2017/000000039769.jpg" | transformers run \
    --task object-detection \
    --model your-org/deimv2-dinov3-base
```

</hfoption>
</hfoptions>

## Model notes

- Backbone via AutoBackbone: loads DINOv3/ViT variants and exposes feature maps to the DEIMv2 head.
- Spatial Tuning Adapter (STA): transforms single-scale features into a multi-scale pyramid for accurate localization with minimal overhead (a conceptual sketch follows this list).
- Decoder and Dense O2O: streamlined decoder with one-to-one assignment for stable training and real-time throughput.

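The STA implementation itself is not part of this diff. Purely as a conceptual sketch of the idea (not the actual DEIMv2 module), a single-scale feature map can be expanded into a small pyramid with interpolation plus lightweight 1x1 projections:

```py
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyPyramid(nn.Module):
    """Illustrative only: expand one (B, C, H, W) feature map into several scales."""

    def __init__(self, channels, num_scales=4):
        super().__init__()
        self.proj = nn.ModuleList(nn.Conv2d(channels, channels, kernel_size=1) for _ in range(num_scales))

    def forward(self, feat):
        pyramid = []
        for i, proj in enumerate(self.proj):
            # Scale factors 2, 1, 0.5, 0.25, ... relative to the input feature map.
            scaled = F.interpolate(feat, scale_factor=2.0 / (2 ** i), mode="bilinear", align_corners=False)
            pyramid.append(proj(scaled))
        return pyramid

feats = ToyPyramid(channels=768)(torch.randn(1, 768, 16, 16))
print([f.shape for f in feats])
```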
## Expected inputs and outputs

Inputs: `pixel_values` shaped `(B, 3, H, W)`, produced by `Deimv2ImageProcessor.preprocess`.

Outputs: class `logits` shaped `(B, Q, C)` and normalized `pred_boxes` shaped `(B, Q, 4)`, where `B` is the batch size, `Q` the number of queries, and `C` the number of labels; use `post_process_object_detection` to filter detections and convert boxes to absolute pixel coordinates.
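A rough check of this contract, as a hedged sketch that reuses the `model` from the AutoModel example above (the 640x640 input size and the dict-style output access are assumptions consistent with the processor code in this PR):

```py
import torch

pixel_values = torch.randn(2, 3, 640, 640)  # arbitrary batch of 2
with torch.no_grad():
    outputs = model(pixel_values=pixel_values)

# (B, Q, C) class logits
print(outputs["logits"].shape)
# (B, Q, 4) normalized boxes, assumed (cx, cy, w, h) as in the post-processing code
print(outputs["pred_boxes"].shape)
```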

## Configuration

[[autodoc]] Deimv2Config
    - __init__

This configuration defines backbone settings, query count, decoder depth, and STA parameters, and sets `model_type="deimv2"`. Any changes to the configuration (e.g. `hidden_dim`, `num_queries`, or the STA scale factors) are reflected in model initialization.
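For instance, a minimal sketch that assumes only the classes added in this diff (and that a default backbone configuration can be resolved without extra arguments):

```py
from transformers import Deimv2Config, Deimv2ForObjectDetection

# Shrink the query budget and the decoder depth; everything else keeps its default.
config = Deimv2Config(num_queries=100, num_decoder_layers=4)

# Randomly initialized model reflecting the custom configuration;
# use from_pretrained instead once trained weights are available.
model = Deimv2ForObjectDetection(config)
print(config.num_queries, config.num_decoder_layers)
```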

## Base model

[[autodoc]] Deimv2Model
    - forward

Connects the backbone to the STA and the decoder, and returns decoder hidden states for the detection head.

## Task head

[[autodoc]] Deimv2ForObjectDetection
    - forward

Predicts class logits and normalized bounding boxes for a fixed set of queries. Compatible with the post-processing API to get final detection outputs.

## Image Processor

[[autodoc]] Deimv2ImageProcessor
    - preprocess
    - post_process_object_detection

Handles resizing, normalization, batching, and conversion of model outputs to boxes, scores, and labels. Supports different input image sizes and batch processing.
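For example, a small hedged sketch with two same-size dummy images (sizes and contents are arbitrary):

```py
import numpy as np
from PIL import Image
from transformers import Deimv2ImageProcessor

processor = Deimv2ImageProcessor(size=1024)

# Two dummy 480x640 RGB images; preprocess resizes, normalizes, and stacks them.
images = [Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8)) for _ in range(2)]
inputs = processor.preprocess(images, return_tensors="pt")
print(inputs["pixel_values"].shape)  # (2, 3, H, W) after shortest-edge resizing
```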

## Resources

- Paper: “Real-Time Object Detection Meets DINOv3.”
- Official repository and model zoo for reference implementations and weights.
- AutoBackbone documentation for reusing vision backbones.

## Citations

Please cite the original DEIMv2 paper, “Real-Time Object Detection Meets DINOv3,” when using this model.
3 changes: 3 additions & 0 deletions src/transformers/__init__.py
@@ -776,6 +776,9 @@
from .utils.quantization_config import TorchAoConfig as TorchAoConfig
from .utils.quantization_config import VptqConfig as VptqConfig
from .video_processing_utils import BaseVideoProcessor as BaseVideoProcessor
from .models.deimv2.configuration_deimv2 import Deimv2Config
from .models.deimv2.image_processing_deimv2 import Deimv2ImageProcessor
from .models.deimv2.modeling_deimv2 import Deimv2ForObjectDetection

else:
import sys
3 changes: 3 additions & 0 deletions src/transformers/models/deimv2/README.md
@@ -0,0 +1,3 @@
# DEIMv2

Implementation of the DEIMv2 model for object detection and multi-scale feature modeling.
15 changes: 15 additions & 0 deletions src/transformers/models/deimv2/__init__.py
@@ -0,0 +1,15 @@
import sys

from ...utils import _LazyModule

_import_structure = {
    "configuration_deimv2": ["Deimv2Config"],
    "image_processing_deimv2": ["Deimv2ImageProcessor"],
    "modeling_deimv2": ["Deimv2Model", "Deimv2ForObjectDetection"],
}

# Lazy module so imports stay fast and consistent with the rest of the library.
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
74 changes: 74 additions & 0 deletions src/transformers/models/deimv2/configuration_deimv2.py
@@ -0,0 +1,74 @@
from dataclasses import dataclass
from typing import Optional, Dict, Any
from ...configuration_utils import PretrainedConfig

# Try to import AutoConfig with a guard so this module still imports without the auto mapping
try:
    from ..auto.configuration_auto import AutoConfig
except Exception:
    AutoConfig = None

@dataclass
class Deimv2Preset:
hidden_dim: int
num_queries: int
num_decoder_layers: int
backbone: str

DEIMV2_PRESETS: Dict[str, Deimv2Preset] = {
"base-dinov3-s": Deimv2Preset(hidden_dim=256, num_queries=300, num_decoder_layers=6, backbone="facebook/dinov2-small"),
"base-dinov3-b": Deimv2Preset(hidden_dim=256, num_queries=300, num_decoder_layers=6, backbone="facebook/dinov2-base"),
}

class Deimv2Config(PretrainedConfig):
model_type = "deimv2"

def __init__(
self,
backbone_config: Optional[Dict[str, Any]] = None,
hidden_dim: int = 256,
num_queries: int = 300,
num_decoder_layers: int = 6,
num_labels: int = 91,
# STA and decoder knobs
sta_num_scales: int = 4,
use_dense_o2o: bool = True,
layer_norm_type: str = "rms",
activation: str = "swish",
**kwargs,
):
super().__init__(**kwargs)

        # If AutoConfig is available, use it to create a default backbone config
        if backbone_config is None and AutoConfig is not None:
            backbone_config = AutoConfig.from_pretrained(DEIMV2_PRESETS["base-dinov3-b"].backbone).to_dict()
elif backbone_config is None:
# Last resort: empty dict — user must pass explicit backbone_config
backbone_config = {}

self.backbone_config = backbone_config
self.hidden_dim = hidden_dim
self.num_queries = num_queries
self.num_decoder_layers = num_decoder_layers
self.num_labels = num_labels
self.sta_num_scales = sta_num_scales
self.use_dense_o2o = use_dense_o2o
self.layer_norm_type = layer_norm_type
self.activation = activation

@classmethod
def from_preset(cls, preset_name: str, **kwargs) -> "Deimv2Config":
if preset_name not in DEIMV2_PRESETS:
raise ValueError(f"Preset '{preset_name}' not found. Available presets: {list(DEIMV2_PRESETS.keys())}")
preset = DEIMV2_PRESETS[preset_name]
        if AutoConfig is not None:
            backbone_config = AutoConfig.from_pretrained(preset.backbone).to_dict()
else:
backbone_config = {}
return cls(
backbone_config=backbone_config,
hidden_dim=preset.hidden_dim,
num_queries=preset.num_queries,
num_decoder_layers=preset.num_decoder_layers,
**kwargs,
)
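
# Example (illustrative sketch, not executed at import time):
#   config = Deimv2Config.from_preset("base-dinov3-b", num_labels=91)
#   assert config.num_queries == 300 and config.num_decoder_layers == 6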
85 changes: 85 additions & 0 deletions src/transformers/models/deimv2/image_processing_deimv2.py
@@ -0,0 +1,85 @@
from typing import List, Dict, Any, Union

import numpy as np
import torch
from PIL import Image

from ...image_processing_utils import BaseImageProcessor, BatchFeature


def is_torch_tensor(x):
    return isinstance(x, torch.Tensor)

class Deimv2ImageProcessor(BaseImageProcessor):
model_input_names = ["pixel_values"]

def __init__(self, size: int = 1024, image_mean=None, image_std=None, **kwargs):
super().__init__(**kwargs)
self.size = size
self.image_mean = image_mean or [0.485, 0.456, 0.406]
self.image_std = image_std or [0.229, 0.224, 0.225]

    def preprocess(self, images: List[Union[Image.Image, "np.ndarray", torch.Tensor]], return_tensors="pt", **kwargs) -> BatchFeature:
        mean = torch.tensor(self.image_mean).view(-1, 1, 1)
        std = torch.tensor(self.image_std).view(-1, 1, 1)

        pixel_values = []
        for img in images:
            if is_torch_tensor(img):
                # Tensor inputs are assumed to already be scaled to [0, 1].
                t = img.to(torch.float32)
                if not (t.ndim == 3 and t.shape[0] in (1, 3)):  # HxWxC -> CxHxW
                    t = t.permute(2, 0, 1)
            else:
                # Convert numpy arrays to PIL images so resizing is uniform.
                if not isinstance(img, Image.Image):
                    img = Image.fromarray(np.asarray(img).astype(np.uint8))
                # Resize so the shortest edge equals `self.size`, keeping the aspect ratio.
                scale = self.size / min(img.size)
                img = img.resize((round(img.width * scale), round(img.height * scale)), Image.BILINEAR)
                # HxWxC uint8 -> CxHxW float32 in [0, 1]
                arr = np.asarray(img, dtype=np.float32).transpose(2, 0, 1) / 255.0
                t = torch.from_numpy(arr)

            # Normalize with the configured mean/std (channels-first).
            t = (t - mean) / std
            pixel_values.append(t)

        pixel_values = torch.stack(pixel_values, dim=0)
        return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)

def post_process_object_detection(self, outputs, threshold: float = 0.5, target_sizes=None) -> List[Dict[str, Any]]:
# Minimal passthrough; replace with real box/logit decoding for final PR
logits = outputs["logits"]
boxes = outputs["pred_boxes"]
probs = logits.sigmoid()
results = []
for prob, box in zip(probs, boxes):
keep_mask = prob.max(dim=-1).values > threshold
kept_scores = prob[keep_mask]
if kept_scores.numel() == 0:
results.append({"scores": torch.tensor([]), "labels": torch.tensor([]), "boxes": torch.tensor([])})
continue
# for each kept index, take max score and label
scores, _ = kept_scores.max(dim=-1)
labels = kept_scores.argmax(dim=-1)
kept_boxes = box[keep_mask]
results.append({"scores": scores, "labels": labels, "boxes": kept_boxes})

if target_sizes is not None:
for result, size in zip(results, target_sizes):
img_h, img_w = size
boxes = result["boxes"]
if isinstance(boxes, torch.Tensor) and boxes.numel() != 0:
# Expect boxes normalized as cxcywh or similar—user must keep consistent format
# Here we assume boxes are normalized [cx, cy, w, h] and convert to pixel coords [x1,y1,x2,y2]
# If boxes are already in xyxy, remove the conversion step.
cxcywh = boxes
cx = cxcywh[:, 0] * img_w
cy = cxcywh[:, 1] * img_h
w = cxcywh[:, 2] * img_w
h = cxcywh[:, 3] * img_h
x1 = cx - 0.5 * w
y1 = cy - 0.5 * h
x2 = cx + 0.5 * w
y2 = cy + 0.5 * h
boxes_xyxy = torch.stack([x1, y1, x2, y2], dim=1)
result["boxes"] = boxes_xyxy
return results