
Commit 839d117

Erol444, pre-commit-ci[bot], Borda, and Copilot authored
Added support for creating Detections instances from SAM3 (#2103)
* Added support for creating Detections instances from SAM3 output - both from `inference` and from RF hosted server (dict)
* added tests, addressed pr comments
* fix(pre_commit): 🎨 auto format pre-commit hooks
* Apply suggestions from code review

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 9955f01 commit 839d117
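
The new entry point is `Detections.from_sam3`, which accepts either a `Sam3PromptResult` object from the `inference` package or the plain dict returned by the Roboflow hosted server. A minimal sketch of the dict path is shown below; the response shape mirrors the fixtures added in `tests/detection/test_from_sam.py`, and the polygon, confidence, and resolution values are illustrative placeholders rather than real model output.

```python
import supervision as sv

# Dict shaped like a Roboflow hosted-server SAM 3 response (see the test
# fixtures added in this commit): one text prompt, one polygon prediction.
hosted_sam3_response = {
    "prompt_results": [
        {
            "prompt_index": 0,
            "predictions": [
                {
                    "masks": [[[0, 0], [10, 0], [10, 10], [0, 10]]],
                    "confidence": 0.9,
                    "format": "polygon",
                }
            ],
        }
    ],
}

detections = sv.Detections.from_sam3(
    sam3_result=hosted_sam3_response,
    resolution_wh=(100, 100),  # (width, height) of the source image
)
print(detections.xyxy)        # roughly [[ 0.  0. 10. 10.]]
print(detections.confidence)  # [0.9]
print(detections.class_id)    # [0] -> index of the prompt that produced the mask
```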

File tree

2 files changed: +317 -14 lines changed


src/supervision/detection/core.py

Lines changed: 139 additions & 14 deletions
@@ -17,7 +17,11 @@
     process_transformers_v4_segmentation_result,
     process_transformers_v5_segmentation_result,
 )
-from supervision.detection.utils.converters import mask_to_xyxy, xywh_to_xyxy
+from supervision.detection.utils.converters import (
+    mask_to_xyxy,
+    polygon_to_mask,
+    xywh_to_xyxy,
+)
 from supervision.detection.utils.internal import (
     extract_ultralytics_masks,
     get_data_item,
@@ -52,7 +56,7 @@
 )
 from supervision.geometry.core import Position
 from supervision.utils.internal import deprecated, get_instance_variables
-from supervision.validators import validate_detections_fields
+from supervision.validators import validate_detections_fields, validate_resolution
 
 
 @dataclass
@@ -280,9 +284,11 @@ def from_ultralytics(cls, ultralytics_results) -> Detections:
                 xyxy=ultralytics_results.obb.xyxy.cpu().numpy(),
                 confidence=ultralytics_results.obb.conf.cpu().numpy(),
                 class_id=class_id,
-                tracker_id=ultralytics_results.obb.id.int().cpu().numpy()
-                if ultralytics_results.obb.id is not None
-                else None,
+                tracker_id=(
+                    ultralytics_results.obb.id.int().cpu().numpy()
+                    if ultralytics_results.obb.id is not None
+                    else None
+                ),
                 data={
                     ORIENTED_BOX_COORDINATES: oriented_box_coordinates,
                     CLASS_NAME_DATA_FIELD: class_names,
@@ -308,9 +314,11 @@ def from_ultralytics(cls, ultralytics_results) -> Detections:
             confidence=ultralytics_results.boxes.conf.cpu().numpy(),
             class_id=class_id,
            mask=extract_ultralytics_masks(ultralytics_results),
-            tracker_id=ultralytics_results.boxes.id.int().cpu().numpy()
-            if ultralytics_results.boxes.id is not None
-            else None,
+            tracker_id=(
+                ultralytics_results.boxes.id.int().cpu().numpy()
+                if ultralytics_results.boxes.id is not None
+                else None
+            ),
             data={CLASS_NAME_DATA_FIELD: class_names},
         )
 
@@ -464,9 +472,11 @@ def from_mmdetection(cls, mmdet_results) -> Detections:
             xyxy=mmdet_results.pred_instances.bboxes.cpu().numpy(),
             confidence=mmdet_results.pred_instances.scores.cpu().numpy(),
             class_id=mmdet_results.pred_instances.labels.cpu().numpy().astype(int),
-            mask=mmdet_results.pred_instances.masks.cpu().numpy()
-            if "masks" in mmdet_results.pred_instances
-            else None,
+            mask=(
+                mmdet_results.pred_instances.masks.cpu().numpy()
+                if "masks" in mmdet_results.pred_instances
+                else None
+            ),
         )
 
     @classmethod
@@ -584,9 +594,11 @@ class IDs, and confidences of the predictions.
         return cls(
             xyxy=detectron2_results["instances"].pred_boxes.tensor.cpu().numpy(),
             confidence=detectron2_results["instances"].scores.cpu().numpy(),
-            mask=detectron2_results["instances"].pred_masks.cpu().numpy()
-            if hasattr(detectron2_results["instances"], "pred_masks")
-            else None,
+            mask=(
+                detectron2_results["instances"].pred_masks.cpu().numpy()
+                if hasattr(detectron2_results["instances"], "pred_masks")
+                else None
+            ),
             class_id=detectron2_results["instances"]
             .pred_classes.cpu()
             .numpy()
@@ -687,6 +699,119 @@ def from_sam(cls, sam_result: list[dict]) -> Detections:
         xyxy = xywh_to_xyxy(xywh=xywh)
         return cls(xyxy=xyxy, mask=mask)
 
+    @classmethod
+    def from_sam3(
+        cls, sam3_result: dict | Any, resolution_wh: tuple[int, int]
+    ) -> Detections:
+        """
+        Creates a Detections instance from
+        [SAM 3](https://github.com/facebookresearch/sam3) inference result.
+
+        Args:
+            sam3_result (dict | Any): The output result from SAM 3 inference,
+                either Sam3PromptResult from inference package or dict containing
+                prompt_results with polygon predictions.
+            resolution_wh (Tuple[int, int]): The width and height of the image
+                used for mask generation.
+
+        Returns:
+            Detections: A new Detections object.
+                The `class_id` field contains the prompt index for each polygon.
+
+        Example:
+            ```python
+            import cv2
+            import supervision as sv
+            from inference.models.sam3 import SegmentAnything3
+            from inference.core.entities.requests.sam3 import Sam3Prompt
+
+            image = cv2.imread("<SOURCE_IMAGE_PATH>")
+            model = SegmentAnything3(
+                model_id="sam3/sam3_final",
+                api_key="<ROBOFLOW_API_KEY>"
+            )
+
+            prompts = [
+                Sam3Prompt(type="text", text="car"),
+                Sam3Prompt(type="text", text="tire"),
+            ]
+
+            result = model.segment_image(
+                image=image,
+                prompts=prompts,
+                output_prob_thresh=0.5,
+                format="polygon"
+            )
+
+            height, width = image.shape[:2]
+            detections = sv.Detections.from_sam3(
+                sam3_result=result,
+                resolution_wh=(width, height)
+            )
+            ```
+        """
+        width, height = validate_resolution(resolution_wh)
+
+        masks = []
+        confidences = []
+        class_ids = []
+
+        if isinstance(sam3_result, dict):
+            prompt_results = sam3_result.get("prompt_results", [])
+        else:
+            prompt_results = getattr(sam3_result, "prompt_results", [])
+
+        for i, prompt_result in enumerate(prompt_results):
+            if isinstance(prompt_result, dict):
+                predictions = prompt_result.get("predictions", [])
+                prompt_index = prompt_result.get("prompt_index", i)
+            else:
+                predictions = getattr(prompt_result, "predictions", [])
+                prompt_index = getattr(prompt_result, "prompt_index", i)
+
+            for prediction in predictions:
+                if isinstance(prediction, dict):
+                    prediction_format = prediction.get("format")
+                    if prediction_format and prediction_format != "polygon":
+                        continue
+                    pred_masks = prediction.get("masks", [])
+                    confidence = prediction.get("confidence", 1.0)
+                else:
+                    prediction_format = getattr(prediction, "format", None)
+                    if prediction_format and prediction_format != "polygon":
+                        continue
+                    pred_masks = getattr(prediction, "masks", [])
+                    confidence = getattr(prediction, "confidence", 1.0)
+
+                if not pred_masks:
+                    continue
+
+                full_mask = np.zeros((height, width), dtype=bool)
+                for poly in pred_masks:
+                    polygon = np.array(poly, dtype=np.int32)
+                    mask = polygon_to_mask(
+                        polygon=polygon, resolution_wh=(width, height)
+                    )
+                    mask = mask.astype(bool, copy=False)
+                    np.logical_or(full_mask, mask, out=full_mask)
+
+                masks.append(full_mask)
+                confidences.append(confidence)
+                class_ids.append(prompt_index)
+
+        if not masks:
+            return cls.empty()
+
+        masks_np = np.stack(masks, axis=0)
+        xyxy = mask_to_xyxy(masks_np)
+
+        return cls(
+            xyxy=xyxy.astype(np.float32),
+            mask=masks_np,
+            confidence=np.array(confidences, dtype=np.float32),
+            class_id=np.array(class_ids, dtype=int),
+        )
+
     @classmethod
     def from_azure_analyze_image(
         cls, azure_result: dict, class_map: dict[int, str] | None = None
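
The heart of `from_sam3` is the per-prediction mask handling: every polygon listed under a prediction's `masks` key is rasterized and OR-ed into a single boolean mask, and the bounding box is then recovered from that union. A small sketch of this step, using the same converter helpers that `core.py` imports above; the 20x20 resolution and the two square polygons are made up for illustration.

```python
import numpy as np

# Same helpers imported by core.py in this commit.
from supervision.detection.utils.converters import mask_to_xyxy, polygon_to_mask

resolution_wh = (20, 20)  # (width, height), as passed to from_sam3
polygons = [
    np.array([[0, 0], [5, 0], [5, 5], [0, 5]], dtype=np.int32),
    np.array([[10, 10], [15, 10], [15, 15], [10, 15]], dtype=np.int32),
]

# A single prediction may carry several polygons; they are merged into one mask.
full_mask = np.zeros((20, 20), dtype=bool)  # (height, width)
for polygon in polygons:
    mask = polygon_to_mask(polygon=polygon, resolution_wh=resolution_wh)
    np.logical_or(full_mask, mask.astype(bool), out=full_mask)

# mask_to_xyxy takes an (N, H, W) stack and returns one box per mask, so the
# detection's box spans the union of its polygons.
print(mask_to_xyxy(full_mask[np.newaxis]))  # roughly [[ 0  0 15 15]]
```

One `Detections` row is produced per prediction, with `class_id` set to the prompt index, which is why the serverless fixture in the tests below expects a `class_id` of `[0, 1]`.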

tests/detection/test_from_sam.py

Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from supervision.detection.core import Detections
+
+SERVERLESS_SAM3_DICT = {
+    "prompt_results": [
+        {
+            "prompt_index": 0,
+            "echo": {
+                "prompt_index": 0,
+                "type": "text",
+                "text": "person",
+                "num_boxes": 0,
+            },
+            "predictions": [
+                {
+                    "masks": [[[295, 675], [294, 676]], [[496, 617], [495, 618]]],
+                    "confidence": 0.94921875,
+                    "format": "polygon",
+                }
+            ],
+        },
+        {
+            "prompt_index": 1,
+            "echo": {"prompt_index": 1, "type": "text", "text": "dog", "num_boxes": 0},
+            "predictions": [
+                {
+                    "masks": [[[316, 561], [316, 562]], [[345, 251], [344, 252]]],
+                    "confidence": 0.89453125,
+                    "format": "polygon",
+                }
+            ],
+        },
+    ],
+    "time": 0.14756996370851994,
+}
+HOSTED_SAM3_DICT = {
+    "prompt_results": [
+        {
+            "prompt_index": 0,
+            "echo": {
+                "prompt_index": 0,
+                "type": "text",
+                "text": "bottle",
+                "num_boxes": 0,
+            },
+            "predictions": [
+                {
+                    "masks": [[[1364, 200], [1365, 201]]],
+                    "confidence": 0.8984375,
+                    "format": "polygon",
+                },
+                {
+                    "masks": [[[1140, 171], [1139, 170]]],
+                    "confidence": 0.94140625,
+                    "format": "polygon",
+                },
+            ],
+        }
+    ],
+    "time": 0.7277156260097399,
+}
+
+
+@pytest.mark.parametrize(
+    ("sam_result", "expected_xyxy", "expected_mask_shape"),
+    [
+        (
+            [
+                {
+                    "segmentation": np.ones((10, 10), dtype=bool),
+                    "bbox": [0, 0, 10, 10],
+                    "area": 100,
+                }
+            ],
+            np.array([[0, 0, 10, 10]], dtype=np.float32),
+            (1, 10, 10),
+        ),
+        ([], np.empty((0, 4), dtype=np.float32), None),
+    ],
+)
+def test_from_sam(
+    sam_result: list[dict],
+    expected_xyxy: np.ndarray,
+    expected_mask_shape: tuple[int, ...] | None,
+) -> None:
+    detections = Detections.from_sam(sam_result=sam_result)
+
+    assert np.array_equal(detections.xyxy, expected_xyxy)
+    if expected_mask_shape is not None:
+        assert detections.mask.shape == expected_mask_shape
+    else:
+        assert detections.mask is None
+
+
+@pytest.mark.parametrize(
+    (
+        "sam3_result",
+        "resolution_wh",
+        "expected_xyxy",
+        "expected_confidence",
+        "expected_class_id",
+    ),
+    [
+        (
+            {
+                "prompt_results": [
+                    {
+                        "predictions": [
+                            {
+                                "format": "polygon",
+                                "masks": [[[0, 0], [10, 0], [10, 10], [0, 10]]],
+                                "confidence": 0.9,
+                            }
+                        ],
+                        "prompt_index": 0,
+                    }
+                ]
+            },
+            (100, 100),
+            np.array([[0, 0, 10, 10]], dtype=np.float32),
+            np.array([0.9], dtype=np.float32),
+            np.array([0], dtype=int),
+        ),
+        (
+            {"prompt_results": []},
+            (100, 100),
+            np.empty((0, 4), dtype=np.float32),
+            np.empty((0,), dtype=np.float32),
+            np.empty((0,), dtype=int),
+        ),
+        (
+            SERVERLESS_SAM3_DICT,
+            (1000, 1000),
+            np.array(
+                [[294.0, 617.0, 496.0, 676.0], [316.0, 251.0, 345.0, 562.0]],
+                dtype=np.float32,
+            ),
+            np.array([0.94921875, 0.89453125], dtype=np.float32),
+            np.array([0, 1], dtype=int),
+        ),
+        (
+            HOSTED_SAM3_DICT,
+            (2000, 2000),
+            np.array(
+                [[1364.0, 200.0, 1365.0, 201.0], [1139.0, 170.0, 1140.0, 171.0]],
+                dtype=np.float32,
+            ),
+            np.array([0.898438, 0.941406], dtype=np.float32),
+            np.array([0, 0], dtype=int),
+        ),
+    ],
+)
+def test_from_sam3(
+    sam3_result: dict,
+    resolution_wh: tuple[int, int],
+    expected_xyxy: np.ndarray,
+    expected_confidence: np.ndarray,
+    expected_class_id: np.ndarray,
+) -> None:
+    detections = Detections.from_sam3(
+        sam3_result=sam3_result, resolution_wh=resolution_wh
+    )
+
+    np.testing.assert_allclose(detections.xyxy, expected_xyxy, atol=1e-5)
+    np.testing.assert_allclose(detections.confidence, expected_confidence, atol=1e-5)
+    np.testing.assert_array_equal(detections.class_id, expected_class_id)
+
+
+def test_from_sam3_invalid_resolution() -> None:
+    sam3_result = {"prompt_results": []}
+    with pytest.raises(
+        ValueError, match=r"Both dimensions in resolution must be positive\."
+    ):
+        Detections.from_sam3(sam3_result=sam3_result, resolution_wh=(-100, 100))
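
Beyond the parametrized happy paths, the tests pin down two edge cases: an empty `prompt_results` list falls back to `Detections.empty()`, and `validate_resolution` rejects non-positive dimensions before any polygon is processed. A quick sketch of both, mirroring the fixtures above:

```python
import supervision as sv

# Empty prompt_results -> Detections.empty(): zero detections.
empty = sv.Detections.from_sam3(
    sam3_result={"prompt_results": []}, resolution_wh=(100, 100)
)
assert len(empty) == 0

# Non-positive resolution is rejected up front by validate_resolution.
try:
    sv.Detections.from_sam3(
        sam3_result={"prompt_results": []}, resolution_wh=(-100, 100)
    )
except ValueError as error:
    print(error)  # "Both dimensions in resolution must be positive."
```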
