Merge pull request #1786 from roboflow/feature/grounding-dino-registsred-in-inference-exp

PawelPeczek-Roboflow · web-flow · commit c65cb82c6943 · 2025-12-08T19:57:42.000+01:00
`GroundingDino` registered in `inference-exp`
diff --git a/inference_experimental/inference_exp/models/auto_loaders/models_registry.py b/inference_experimental/inference_exp/models/auto_loaders/models_registry.py
@@ -18,6 +18,7 @@
 STRUCTURED_OCR_TASK = "structured-ocr"
 TEXT_ONLY_OCR_TASK = "text-only-ocr"
 GAZE_DETECTION_TASK = "gaze-detection"
+OPEN_VOCABULARY_OBJECT_DETECTION_TASK = "open-vocabulary-object-detection"
 
 
 @dataclass(frozen=True)
@@ -384,6 +385,14 @@ class RegistryEntry:
         module_name="inference_exp.models.l2cs.l2cs_onnx",
         class_name="L2CSNetOnnx",
     ),
+    (
+        "grounding-dino",
+        OPEN_VOCABULARY_OBJECT_DETECTION_TASK,
+        BackendType.TORCH,
+    ): LazyClass(
+        module_name="inference_exp.models.grounding_dino.grounding_dino_torch",
+        class_name="GroundingDinoForObjectDetectionTorch",
+    ),
 }
 
 
diff --git a/inference_experimental/inference_exp/models/grounding_dino/grounding_dino_torch.py b/inference_experimental/inference_exp/models/grounding_dino/grounding_dino_torch.py
@@ -1,38 +1,22 @@
-import os
+import os.path
 from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import torch
 import torchvision
+from groundingdino.util.inference import load_model, predict
 from inference_exp import Detections
 from inference_exp.configuration import DEFAULT_DEVICE
 from inference_exp.entities import ColorFormat, ImageDimensions
-from inference_exp.errors import MissingDependencyError, ModelRuntimeError
+from inference_exp.errors import ModelRuntimeError
 from inference_exp.models.base.object_detection import (
     OpenVocabularyObjectDetectionModel,
 )
 from inference_exp.models.common.model_packages import get_model_package_contents
-from inference_exp.utils.download import download_files_to_directory
 from torch import nn
 from torchvision import transforms
 from torchvision.ops import box_convert
 
-try:
-    from groundingdino.util.inference import load_model, predict
-except ImportError as import_error:
-    raise MissingDependencyError(
-        message=f"Could not import GroundingDino model - this error means that some additional dependencies "
-        f"are not installed in the environment. If you run the `inference-exp` library directly in your Python "
-        f"program, make sure the following extras of the package are installed: `grounding-dino`."
-        f"If you see this error using Roboflow infrastructure, make sure the service you use does support the model. "
-        f"You can also contact Roboflow to get support.",
-        help_url="https://todo",
-    ) from import_error
-
-
-DEFAULT_CONFIG_URL = "https://raw.githubusercontent.com/roboflow/GroundingDINO/main/groundingdino/config/GroundingDINO_SwinT_OGC.py"
-DEFAULT_CONFIG_MD5 = "bdb07fc17b611d622633d133d2cf873a"
-
 
 class GroundingDinoForObjectDetectionTorch(
     OpenVocabularyObjectDetectionModel[
@@ -50,23 +34,16 @@ def from_pretrained(
     ) -> "GroundingDinoForObjectDetectionTorch":
         model_package_content = get_model_package_contents(
             model_package_dir=model_name_or_path,
-            elements=["groundingdino_swint_ogc.pth"],
+            elements=["weights.pth", "config.py"],
         )
-        config_path = os.path.join(model_name_or_path, "GroundingDINO_SwinT_OGC.py")
-        if not os.path.exists(config_path):
-            download_files_to_directory(
-                target_dir=model_name_or_path,
-                files_specs=[
-                    (
-                        "GroundingDINO_SwinT_OGC.py",
-                        DEFAULT_CONFIG_URL,
-                        DEFAULT_CONFIG_MD5,
-                    )
-                ],
-            )
+        text_encoder_dir = os.path.join(model_name_or_path, "text_encoder")
+        loader_kwargs = {}
+        if os.path.isdir(text_encoder_dir):
+            loader_kwargs["text_encoder_type"] = text_encoder_dir
         model = load_model(
-            model_config_path=config_path,
-            model_checkpoint_path=model_package_content["groundingdino_swint_ogc.pth"],
+            model_config_path=model_package_content["config.py"],
+            model_checkpoint_path=model_package_content["weights.pth"],
+            **loader_kwargs,
         ).to(device)
         return cls(model=model, device=device)
 
@@ -176,19 +153,20 @@ def forward(
             text_threshold = conf_thresh
         caption = ". ".join(classes)
         all_boxes, all_logits, all_phrases = [], [], []
-        for image in pre_processed_images:
-            boxes, logits, phrases = predict(
-                model=self._model,
-                image=image,
-                caption=caption,
-                box_threshold=conf_thresh,
-                text_threshold=text_threshold,
-                device=self._device,
-                remove_combined=True,
-            )
-            all_boxes.append(boxes)
-            all_logits.append(logits)
-            all_phrases.append(phrases)
+        with torch.inference_mode():
+            for image in pre_processed_images:
+                boxes, logits, phrases = predict(
+                    model=self._model,
+                    image=image,
+                    caption=caption,
+                    box_threshold=conf_thresh,
+                    text_threshold=text_threshold,
+                    device=self._device,
+                    remove_combined=True,
+                )
+                all_boxes.append(boxes)
+                all_logits.append(logits)
+                all_phrases.append(phrases)
         return all_boxes, all_logits, all_phrases, classes
 
     def post_process(
diff --git a/inference_experimental/pyproject.toml b/inference_experimental/pyproject.toml
@@ -31,6 +31,7 @@ dependencies = [
   "scikit-image>=0.24.0,<0.26.0",
   "easyocr~=1.7.2",
   "sentencepiece>=0.2.0,<0.3.0",
+  "rf_groundingdino==0.3.0"
 ]
 
 [project.optional-dependencies]
@@ -83,9 +84,6 @@ onnx-jp6-cu126 = [
 mediapipe = [
   "rf-mediapipe>=0.9,<0.11.0"
 ]
-grounding-dino = [
-  "rf_groundingdino==0.2.0"
-]
 trt10 = [
   "tensorrt-cu12>=10.0.0,<11.0.0; platform_system == 'Linux' or platform_system == 'Windows'",
   "tensorrt-lean-cu12>=10.0.0,<11.0.0; platform_system == 'Linux' or platform_system == 'Windows'",
diff --git a/inference_experimental/tests/e2e_platform_tests/test_grounding_dino_e2e.py b/inference_experimental/tests/e2e_platform_tests/test_grounding_dino_e2e.py
@@ -0,0 +1,16 @@
+import numpy as np
+import pytest
+from inference_exp import AutoModel
+
+
+@pytest.mark.e2e_model_inference
+def test_grounding_dino(dog_image_numpy: np.ndarray, roboflow_api_key: str) -> None:
+    # given
+    model = AutoModel.from_pretrained("grounding-dino", api_key=roboflow_api_key)
+
+    # when
+    predictions = model(dog_image_numpy, ["dog", "person", "bagpack"], conf_thresh=0.33)
+
+    # then
+    assert len(predictions[0].xyxy) == 3
+    assert set(predictions[0].class_id.tolist()) == {0, 1, 2}
diff --git a/inference_experimental/uv.lock b/inference_experimental/uv.lock
diff --git a/requirements/requirements.groundingdino.txt b/requirements/requirements.groundingdino.txt