Merge pull request #2015 from roboflow/feature/support_for_qwen_3_vl

SkalskiP · web-flow · commit 373fbae1fa71 · 2025-11-16T01:02:29.000+01:00
Qwen3-VL support
diff --git a/supervision/detection/core.py b/supervision/detection/core.py
@@ -47,6 +47,7 @@
     from_moondream,
     from_paligemma,
     from_qwen_2_5_vl,
+    from_qwen_3_vl,
     validate_vlm_parameters,
 )
 from supervision.geometry.core import Position
@@ -951,6 +952,36 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio
             # array([0, 1])
             ```
 
+        !!! example "Qwen3-VL"
+
+            ```python
+            import supervision as sv
+
+            qwen_3_vl_result = \"\"\"```json
+            [
+                {"bbox_2d": [139, 768, 315, 954], "label": "cat"},
+                {"bbox_2d": [366, 679, 536, 849], "label": "dog"}
+            ]
+            ```\"\"\"
+            detections = sv.Detections.from_lmm(
+                sv.LMM.QWEN_3_VL,
+                qwen_3_vl_result,
+                resolution_wh=(1000, 1000),
+                classes=['cat', 'dog'],
+            )
+            detections.xyxy
+            # array([[139., 768., 315., 954.], [366., 679., 536., 849.]])
+
+            detections.class_id
+            # array([0, 1])
+
+            detections.data
+            # {'class_name': array(['cat', 'dog'], dtype='<U10')}
+
+            detections.confidence
+            # array([1., 1.])
+            ```
+
         !!! example "Gemini 2.0"
             ```python
             import supervision as sv
@@ -1211,6 +1242,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
         | PaliGemma           | `PALIGEMMA`          | detection               | `resolution_wh`             | `classes`           |
         | PaliGemma 2         | `PALIGEMMA`          | detection               | `resolution_wh`             | `classes`           |
         | Qwen2.5-VL          | `QWEN_2_5_VL`        | detection               | `resolution_wh`, `input_wh` | `classes`           |
+        | Qwen3-VL            | `QWEN_3_VL`          | detection               | `resolution_wh`             | `classes`           |
         | Google Gemini 2.0   | `GOOGLE_GEMINI_2_0`  | detection               | `resolution_wh`             | `classes`           |
         | Google Gemini 2.5   | `GOOGLE_GEMINI_2_5`  | detection, segmentation | `resolution_wh`             | `classes`           |
         | Moondream           | `MOONDREAM`          | detection               | `resolution_wh`             |                     |
@@ -1328,6 +1360,36 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
             # array([0, 1])
             ```
 
+        !!! example "Qwen3-VL"
+
+            ```python
+            import supervision as sv
+
+            qwen_3_vl_result = \"\"\"```json
+            [
+                {"bbox_2d": [139, 768, 315, 954], "label": "cat"},
+                {"bbox_2d": [366, 679, 536, 849], "label": "dog"}
+            ]
+            ```\"\"\"
+            detections = sv.Detections.from_vlm(
+                sv.VLM.QWEN_3_VL,
+                qwen_3_vl_result,
+                resolution_wh=(1000, 1000),
+                classes=['cat', 'dog'],
+            )
+            detections.xyxy
+            # array([[139., 768., 315., 954.], [366., 679., 536., 849.]])
+
+            detections.class_id
+            # array([0, 1])
+
+            detections.data
+            # {'class_name': array(['cat', 'dog'], dtype='<U10')}
+
+            detections.confidence
+            # array([1., 1.])
+            ```
+
         !!! example "Gemini 2.0"
             ```python
             import supervision as sv
@@ -1556,7 +1618,14 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
         if vlm == VLM.QWEN_2_5_VL:
             xyxy, class_id, class_name = from_qwen_2_5_vl(result, **kwargs)
             data = {CLASS_NAME_DATA_FIELD: class_name}
-            return cls(xyxy=xyxy, class_id=class_id, data=data)
+            confidence = np.ones(len(xyxy), dtype=float)
+            return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data)
+
+        if vlm == VLM.QWEN_3_VL:
+            xyxy, class_id, class_name = from_qwen_3_vl(result, **kwargs)
+            data = {CLASS_NAME_DATA_FIELD: class_name}
+            confidence = np.ones(len(xyxy), dtype=float)
+            return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data)
 
         if vlm == VLM.DEEPSEEK_VL_2:
             xyxy, class_id, class_name = from_deepseek_vl_2(result, **kwargs)
diff --git a/supervision/detection/vlm.py b/supervision/detection/vlm.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import ast
 import base64
 import io
 import json
@@ -27,7 +28,8 @@ class LMM(Enum):
     Attributes:
         PALIGEMMA: Google's PaliGemma vision-language model.
         FLORENCE_2: Microsoft's Florence-2 vision-language model.
-        QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
+        QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
+        QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
         GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
         GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
         MOONDREAM: The Moondream vision-language model.
@@ -36,6 +38,7 @@ class LMM(Enum):
     PALIGEMMA = "paligemma"
     FLORENCE_2 = "florence_2"
     QWEN_2_5_VL = "qwen_2_5_vl"
+    QWEN_3_VL = "qwen_3_vl"
     DEEPSEEK_VL_2 = "deepseek_vl_2"
     GOOGLE_GEMINI_2_0 = "gemini_2_0"
     GOOGLE_GEMINI_2_5 = "gemini_2_5"
@@ -69,6 +72,7 @@ class VLM(Enum):
         PALIGEMMA: Google's PaliGemma vision-language model.
         FLORENCE_2: Microsoft's Florence-2 vision-language model.
         QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
+        QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
         GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
         GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
         MOONDREAM: The Moondream vision-language model.
@@ -77,6 +81,7 @@ class VLM(Enum):
     PALIGEMMA = "paligemma"
     FLORENCE_2 = "florence_2"
     QWEN_2_5_VL = "qwen_2_5_vl"
+    QWEN_3_VL = "qwen_3_vl"
     DEEPSEEK_VL_2 = "deepseek_vl_2"
     GOOGLE_GEMINI_2_0 = "gemini_2_0"
     GOOGLE_GEMINI_2_5 = "gemini_2_5"
@@ -106,6 +111,7 @@ def from_value(cls, value: VLM | str) -> VLM:
     VLM.PALIGEMMA: str,
     VLM.FLORENCE_2: dict,
     VLM.QWEN_2_5_VL: str,
+    VLM.QWEN_3_VL: str,
     VLM.DEEPSEEK_VL_2: str,
     VLM.GOOGLE_GEMINI_2_0: str,
     VLM.GOOGLE_GEMINI_2_5: str,
@@ -116,6 +122,7 @@ def from_value(cls, value: VLM | str) -> VLM:
     VLM.PALIGEMMA: ["resolution_wh"],
     VLM.FLORENCE_2: ["resolution_wh"],
     VLM.QWEN_2_5_VL: ["input_wh", "resolution_wh"],
+    VLM.QWEN_3_VL: ["resolution_wh"],
     VLM.DEEPSEEK_VL_2: ["resolution_wh"],
     VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"],
     VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"],
@@ -126,6 +133,7 @@ def from_value(cls, value: VLM | str) -> VLM:
     VLM.PALIGEMMA: ["resolution_wh", "classes"],
     VLM.FLORENCE_2: ["resolution_wh"],
     VLM.QWEN_2_5_VL: ["input_wh", "resolution_wh", "classes"],
+    VLM.QWEN_3_VL: ["resolution_wh", "classes"],
     VLM.DEEPSEEK_VL_2: ["resolution_wh", "classes"],
     VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"],
     VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"],
@@ -235,14 +243,59 @@ def from_paligemma(
     return xyxy, class_id, class_name
 
 
+def recover_truncated_qwen_2_5_vl_response(text: str) -> Any | None:
+    """
+    Attempt to recover and parse a truncated or malformed JSON snippet from Qwen-2.5-VL
+    output.
+
+    This utility extracts a JSON-like portion from a string that may be truncated or
+    malformed, cleans trailing commas, and attempts to parse it into a Python object.
+
+    Args:
+        text (str): Raw text containing the JSON snippet possibly truncated or
+            incomplete.
+
+    Returns:
+        Parsed Python object (usually list) if recovery and parsing succeed;
+            otherwise `None`.
+    """
+    try:
+        first_bracket = text.find("[")  # start of the JSON array, if any
+        if first_bracket == -1:
+            return None
+        snippet = text[first_bracket:]
+
+        last_brace = snippet.rfind("}")  # end of the last closed object
+        if last_brace == -1:
+            return None
+
+        snippet = snippet[: last_brace + 1]  # drop whatever follows that brace
+
+        prefix_end = snippet.find("[")  # 0 here: snippet begins at the first "["
+        if prefix_end == -1:
+            return None
+
+        prefix = snippet[: prefix_end + 1]
+        body = snippet[prefix_end + 1 :].rstrip()
+
+        if body.endswith(","):  # NOTE(review): body ends with "}"; looks unreachable
+            body = body[:-1].rstrip()
+
+        repaired = prefix + body + "]"  # re-close the array that was cut short
+
+        return json.loads(repaired)
+    except Exception:  # best-effort helper: any failure means "not recoverable"
+        return None
+
+
 def from_qwen_2_5_vl(
     result: str,
     input_wh: tuple[int, int],
     resolution_wh: tuple[int, int],
     classes: list[str] | None = None,
 ) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
     """
-    Parse and scale bounding boxes from Qwen-2.5-VL style JSON output.
+    Parse and rescale bounding boxes and class labels from Qwen-2.5-VL JSON output.
 
     The JSON is expected to be enclosed in triple backticks with the format:
       ```json
@@ -253,38 +306,52 @@ def from_qwen_2_5_vl(
       ```
 
     Args:
-        result: String containing the JSON snippet enclosed by triple backticks.
-        input_wh: (input_width, input_height) describing the original bounding box
-            scale.
-        resolution_wh: (output_width, output_height) to which we rescale the boxes.
-        classes: Optional list of valid class names. If provided, returned boxes/labels
-            are filtered to only those classes found here.
+        result (str): String containing Qwen-2.5-VL JSON bounding box and label data.
+        input_wh (tuple[int, int]): Width and height of the coordinate space where boxes
+            are normalized.
+        resolution_wh (tuple[int, int]): Target width and height to scale bounding
+            boxes.
+        classes (list[str] or None): Optional list of valid class names to filter
+            results. If provided, only boxes with labels in this list are returned.
 
     Returns:
-        xyxy (np.ndarray): An array of shape `(n, 4)` containing
-            the bounding boxes coordinates in format `[x1, y1, x2, y2]`
-        class_id (Optional[np.ndarray]): An array of shape `(n,)` containing
-            the class indices for each bounding box (or None if `classes` is not
-            provided)
-        class_name (np.ndarray): An array of shape `(n,)` containing
-            the class labels for each bounding box
+        xyxy (np.ndarray): Array of shape `(N, 4)` with rescaled bounding boxes in
+            `(x_min, y_min, x_max, y_max)` format.
+        class_id (np.ndarray or None): Array of shape `(N,)` with class indices; empty
+            when no boxes are parsed, `None` when boxes exist but `classes` is unset.
+        class_name (np.ndarray): Array of shape `(N,)` with class names as strings.
     """
 
     in_w, in_h = validate_resolution(input_wh)
     out_w, out_h = validate_resolution(resolution_wh)
 
-    pattern = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)
-
-    match = pattern.search(result)
-    if not match:
-        return np.empty((0, 4)), None, np.empty((0,), dtype=str)
+    text = result.strip()
+    text = re.sub(r"^```(json)?", "", text, flags=re.IGNORECASE).strip()
+    text = re.sub(r"```$", "", text).strip()
 
-    json_snippet = match.group(1)
+    start = text.find("[")
+    end = text.rfind("]")
+    if start != -1 and end != -1 and end > start:
+        text = text[start : end + 1].strip()
 
     try:
-        data = json.loads(json_snippet)
+        data = json.loads(text)
     except json.JSONDecodeError:
-        return np.empty((0, 4)), None, np.empty((0,), dtype=str)
+        repaired = recover_truncated_qwen_2_5_vl_response(text)
+        if repaired is not None:
+            data = repaired
+        else:
+            try:
+                data = ast.literal_eval(text)
+            except (ValueError, SyntaxError, TypeError):
+                return (
+                    np.empty((0, 4)),
+                    np.empty((0,), dtype=int),
+                    np.empty((0,), dtype=str),
+                )
+
+    if not isinstance(data, list):
+        return (np.empty((0, 4)), np.empty((0,), dtype=int), np.empty((0,), dtype=str))
 
     boxes_list = []
     labels_list = []
@@ -296,7 +363,7 @@ def from_qwen_2_5_vl(
         labels_list.append(item["label"])
 
     if not boxes_list:
-        return np.empty((0, 4)), None, np.empty((0,), dtype=str)
+        return (np.empty((0, 4)), np.empty((0,), dtype=int), np.empty((0,), dtype=str))
 
     xyxy = np.array(boxes_list, dtype=float)
     class_name = np.array(labels_list, dtype=str)
@@ -315,6 +382,36 @@ def from_qwen_2_5_vl(
     return xyxy, class_id, class_name
 
 
+def from_qwen_3_vl(
+    result: str,
+    resolution_wh: tuple[int, int],
+    classes: list[str] | None = None,
+) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
+    """
+    Parse and scale bounding boxes from Qwen-3-VL style JSON output.
+
+    Args:
+        result (str): String containing the Qwen-3-VL JSON output.
+        resolution_wh (tuple[int, int]): Target resolution `(width, height)` to
+            scale bounding boxes.
+        classes (list[str] or None): Optional list of valid classes to filter
+            results.
+
+    Returns:
+        xyxy (np.ndarray): Array of bounding boxes with shape `(N, 4)` in
+            `(x_min, y_min, x_max, y_max)` format scaled to `resolution_wh`.
+        class_id (np.ndarray or None): Array of class indices for each box, or
+            None if no filtering by classes.
+        class_name (np.ndarray): Array of class names as strings.
+    """
+    return from_qwen_2_5_vl(
+        result=result,
+        input_wh=(1000, 1000),  # boxes are treated as lying on a 1000x1000 grid
+        resolution_wh=resolution_wh,
+        classes=classes,
+    )
+
+
 def from_deepseek_vl_2(
     result: str, resolution_wh: tuple[int, int], classes: list[str] | None = None
 ) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
diff --git a/test/detection/test_vlm.py b/test/detection/test_vlm.py