Skip to content

Commit 373fbae

Browse files
authored
Merge pull request #2015 from roboflow/feature/support_for_qwen_3_vl
Qwen3-VL support
2 parents 25c2f5c + 790ccac commit 373fbae

File tree

3 files changed

+232
-29
lines changed

3 files changed

+232
-29
lines changed

supervision/detection/core.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from_moondream,
4848
from_paligemma,
4949
from_qwen_2_5_vl,
50+
from_qwen_3_vl,
5051
validate_vlm_parameters,
5152
)
5253
from supervision.geometry.core import Position
@@ -951,6 +952,36 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio
951952
# array([0, 1])
952953
```
953954
955+
!!! example "Qwen3-VL"
956+
957+
```python
958+
import supervision as sv
959+
960+
qwen_3_vl_result = \"\"\"```json
961+
[
962+
{"bbox_2d": [139, 768, 315, 954], "label": "cat"},
963+
{"bbox_2d": [366, 679, 536, 849], "label": "dog"}
964+
]
965+
```\"\"\"
966+
detections = sv.Detections.from_lmm(
967+
sv.LMM.QWEN_3_VL,
968+
qwen_3_vl_result,
969+
resolution_wh=(1000, 1000),
970+
classes=['cat', 'dog'],
971+
)
972+
detections.xyxy
973+
# array([[139., 768., 315., 954.], [366., 679., 536., 849.]])
974+
975+
detections.class_id
976+
# array([0, 1])
977+
978+
detections.data
979+
# {'class_name': array(['cat', 'dog'], dtype='<U10')}
980+
981+
detections.class_id
982+
# array([0, 1])
983+
```
984+
954985
!!! example "Gemini 2.0"
955986
```python
956987
import supervision as sv
@@ -1211,6 +1242,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
12111242
| PaliGemma | `PALIGEMMA` | detection | `resolution_wh` | `classes` |
12121243
| PaliGemma 2 | `PALIGEMMA` | detection | `resolution_wh` | `classes` |
12131244
| Qwen2.5-VL | `QWEN_2_5_VL` | detection | `resolution_wh`, `input_wh` | `classes` |
1245+
| Qwen3-VL | `QWEN_3_VL` | detection | `resolution_wh` | `classes` |
12141246
| Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` |
12151247
| Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` |
12161248
| Moondream | `MOONDREAM` | detection | `resolution_wh` | |
@@ -1328,6 +1360,36 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
13281360
# array([0, 1])
13291361
```
13301362
1363+
!!! example "Qwen3-VL"
1364+
1365+
```python
1366+
import supervision as sv
1367+
1368+
qwen_3_vl_result = \"\"\"```json
1369+
[
1370+
{"bbox_2d": [139, 768, 315, 954], "label": "cat"},
1371+
{"bbox_2d": [366, 679, 536, 849], "label": "dog"}
1372+
]
1373+
```\"\"\"
1374+
detections = sv.Detections.from_vlm(
1375+
sv.VLM.QWEN_3_VL,
1376+
qwen_3_vl_result,
1377+
resolution_wh=(1000, 1000),
1378+
classes=['cat', 'dog'],
1379+
)
1380+
detections.xyxy
1381+
# array([[139., 768., 315., 954.], [366., 679., 536., 849.]])
1382+
1383+
detections.class_id
1384+
# array([0, 1])
1385+
1386+
detections.data
1387+
# {'class_name': array(['cat', 'dog'], dtype='<U10')}
1388+
1389+
detections.class_id
1390+
# array([0, 1])
1391+
```
1392+
13311393
!!! example "Gemini 2.0"
13321394
```python
13331395
import supervision as sv
@@ -1556,7 +1618,14 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
15561618
if vlm == VLM.QWEN_2_5_VL:
15571619
xyxy, class_id, class_name = from_qwen_2_5_vl(result, **kwargs)
15581620
data = {CLASS_NAME_DATA_FIELD: class_name}
1559-
return cls(xyxy=xyxy, class_id=class_id, data=data)
1621+
confidence = np.ones(len(xyxy), dtype=float)
1622+
return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data)
1623+
1624+
if vlm == VLM.QWEN_3_VL:
1625+
xyxy, class_id, class_name = from_qwen_3_vl(result, **kwargs)
1626+
data = {CLASS_NAME_DATA_FIELD: class_name}
1627+
confidence = np.ones(len(xyxy), dtype=float)
1628+
return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data)
15601629

15611630
if vlm == VLM.DEEPSEEK_VL_2:
15621631
xyxy, class_id, class_name = from_deepseek_vl_2(result, **kwargs)

supervision/detection/vlm.py

Lines changed: 121 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import ast
34
import base64
45
import io
56
import json
@@ -27,7 +28,8 @@ class LMM(Enum):
2728
Attributes:
2829
PALIGEMMA: Google's PaliGemma vision-language model.
2930
FLORENCE_2: Microsoft's Florence-2 vision-language model.
30-
QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
31+
QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
32+
QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
3133
GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
3234
GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
3335
MOONDREAM: The Moondream vision-language model.
@@ -36,6 +38,7 @@ class LMM(Enum):
3638
PALIGEMMA = "paligemma"
3739
FLORENCE_2 = "florence_2"
3840
QWEN_2_5_VL = "qwen_2_5_vl"
41+
QWEN_3_VL = "qwen_3_vl"
3942
DEEPSEEK_VL_2 = "deepseek_vl_2"
4043
GOOGLE_GEMINI_2_0 = "gemini_2_0"
4144
GOOGLE_GEMINI_2_5 = "gemini_2_5"
@@ -69,6 +72,7 @@ class VLM(Enum):
6972
PALIGEMMA: Google's PaliGemma vision-language model.
7073
FLORENCE_2: Microsoft's Florence-2 vision-language model.
7174
QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
75+
QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
7276
GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
7377
GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
7478
MOONDREAM: The Moondream vision-language model.
@@ -77,6 +81,7 @@ class VLM(Enum):
7781
PALIGEMMA = "paligemma"
7882
FLORENCE_2 = "florence_2"
7983
QWEN_2_5_VL = "qwen_2_5_vl"
84+
QWEN_3_VL = "qwen_3_vl"
8085
DEEPSEEK_VL_2 = "deepseek_vl_2"
8186
GOOGLE_GEMINI_2_0 = "gemini_2_0"
8287
GOOGLE_GEMINI_2_5 = "gemini_2_5"
@@ -106,6 +111,7 @@ def from_value(cls, value: VLM | str) -> VLM:
106111
VLM.PALIGEMMA: str,
107112
VLM.FLORENCE_2: dict,
108113
VLM.QWEN_2_5_VL: str,
114+
VLM.QWEN_3_VL: str,
109115
VLM.DEEPSEEK_VL_2: str,
110116
VLM.GOOGLE_GEMINI_2_0: str,
111117
VLM.GOOGLE_GEMINI_2_5: str,
@@ -116,6 +122,7 @@ def from_value(cls, value: VLM | str) -> VLM:
116122
VLM.PALIGEMMA: ["resolution_wh"],
117123
VLM.FLORENCE_2: ["resolution_wh"],
118124
VLM.QWEN_2_5_VL: ["input_wh", "resolution_wh"],
125+
VLM.QWEN_3_VL: ["resolution_wh"],
119126
VLM.DEEPSEEK_VL_2: ["resolution_wh"],
120127
VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"],
121128
VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"],
@@ -126,6 +133,7 @@ def from_value(cls, value: VLM | str) -> VLM:
126133
VLM.PALIGEMMA: ["resolution_wh", "classes"],
127134
VLM.FLORENCE_2: ["resolution_wh"],
128135
VLM.QWEN_2_5_VL: ["input_wh", "resolution_wh", "classes"],
136+
VLM.QWEN_3_VL: ["resolution_wh", "classes"],
129137
VLM.DEEPSEEK_VL_2: ["resolution_wh", "classes"],
130138
VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"],
131139
VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"],
@@ -235,14 +243,59 @@ def from_paligemma(
235243
return xyxy, class_id, class_name
236244

237245

246+
def recover_truncated_qwen_2_5_vl_response(text: str) -> Any | None:
247+
"""
248+
Attempt to recover and parse a truncated or malformed JSON snippet from Qwen-2.5-VL
249+
output.
250+
251+
This utility extracts a JSON-like portion from a string that may be truncated or
252+
malformed, cleans trailing commas, and attempts to parse it into a Python object.
253+
254+
Args:
255+
text (str): Raw text containing the JSON snippet possibly truncated or
256+
incomplete.
257+
258+
Returns:
259+
Parsed Python object (usually list) if recovery and parsing succeed;
260+
otherwise `None`.
261+
"""
262+
try:
263+
first_bracket = text.find("[")
264+
if first_bracket == -1:
265+
return None
266+
snippet = text[first_bracket:]
267+
268+
last_brace = snippet.rfind("}")
269+
if last_brace == -1:
270+
return None
271+
272+
snippet = snippet[: last_brace + 1]
273+
274+
prefix_end = snippet.find("[")
275+
if prefix_end == -1:
276+
return None
277+
278+
prefix = snippet[: prefix_end + 1]
279+
body = snippet[prefix_end + 1 :].rstrip()
280+
281+
if body.endswith(","):
282+
body = body[:-1].rstrip()
283+
284+
repaired = prefix + body + "]"
285+
286+
return json.loads(repaired)
287+
except Exception:
288+
return None
289+
290+
238291
def from_qwen_2_5_vl(
239292
result: str,
240293
input_wh: tuple[int, int],
241294
resolution_wh: tuple[int, int],
242295
classes: list[str] | None = None,
243296
) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
244297
"""
245-
Parse and scale bounding boxes from Qwen-2.5-VL style JSON output.
298+
Parse and rescale bounding boxes and class labels from Qwen-2.5-VL JSON output.
246299
247300
The JSON is expected to be enclosed in triple backticks with the format:
248301
```json
@@ -253,38 +306,52 @@ def from_qwen_2_5_vl(
253306
```
254307
255308
Args:
256-
result: String containing the JSON snippet enclosed by triple backticks.
257-
input_wh: (input_width, input_height) describing the original bounding box
258-
scale.
259-
resolution_wh: (output_width, output_height) to which we rescale the boxes.
260-
classes: Optional list of valid class names. If provided, returned boxes/labels
261-
are filtered to only those classes found here.
309+
result (str): String containing Qwen-2.5-VL JSON bounding box and label data.
310+
input_wh (tuple[int, int]): Width and height of the coordinate space where boxes
311+
are normalized.
312+
resolution_wh (tuple[int, int]): Target width and height to scale bounding
313+
boxes.
314+
classes (list[str] or None): Optional list of valid class names to filter
315+
results. If provided, only boxes with labels in this list are returned.
262316
263317
Returns:
264-
xyxy (np.ndarray): An array of shape `(n, 4)` containing
265-
the bounding boxes coordinates in format `[x1, y1, x2, y2]`
266-
class_id (Optional[np.ndarray]): An array of shape `(n,)` containing
267-
the class indices for each bounding box (or None if `classes` is not
268-
provided)
269-
class_name (np.ndarray): An array of shape `(n,)` containing
270-
the class labels for each bounding box
318+
xyxy (np.ndarray): Array of shape `(N, 4)` with rescaled bounding boxes in
319+
`(x_min, y_min, x_max, y_max)` format.
320+
class_id (np.ndarray or None): Array of shape `(N,)` with indices of classes,
321+
or `None` if no filtering applied.
322+
class_name (np.ndarray): Array of shape `(N,)` with class names as strings.
271323
"""
272324

273325
in_w, in_h = validate_resolution(input_wh)
274326
out_w, out_h = validate_resolution(resolution_wh)
275327

276-
pattern = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)
277-
278-
match = pattern.search(result)
279-
if not match:
280-
return np.empty((0, 4)), None, np.empty((0,), dtype=str)
328+
text = result.strip()
329+
text = re.sub(r"^```(json)?", "", text, flags=re.IGNORECASE).strip()
330+
text = re.sub(r"```$", "", text).strip()
281331

282-
json_snippet = match.group(1)
332+
start = text.find("[")
333+
end = text.rfind("]")
334+
if start != -1 and end != -1 and end > start:
335+
text = text[start : end + 1].strip()
283336

284337
try:
285-
data = json.loads(json_snippet)
338+
data = json.loads(text)
286339
except json.JSONDecodeError:
287-
return np.empty((0, 4)), None, np.empty((0,), dtype=str)
340+
repaired = recover_truncated_qwen_2_5_vl_response(text)
341+
if repaired is not None:
342+
data = repaired
343+
else:
344+
try:
345+
data = ast.literal_eval(text)
346+
except (ValueError, SyntaxError, TypeError):
347+
return (
348+
np.empty((0, 4)),
349+
np.empty((0,), dtype=int),
350+
np.empty((0,), dtype=str),
351+
)
352+
353+
if not isinstance(data, list):
354+
return (np.empty((0, 4)), np.empty((0,), dtype=int), np.empty((0,), dtype=str))
288355

289356
boxes_list = []
290357
labels_list = []
@@ -296,7 +363,7 @@ def from_qwen_2_5_vl(
296363
labels_list.append(item["label"])
297364

298365
if not boxes_list:
299-
return np.empty((0, 4)), None, np.empty((0,), dtype=str)
366+
return (np.empty((0, 4)), np.empty((0,), dtype=int), np.empty((0,), dtype=str))
300367

301368
xyxy = np.array(boxes_list, dtype=float)
302369
class_name = np.array(labels_list, dtype=str)
@@ -315,6 +382,36 @@ def from_qwen_2_5_vl(
315382
return xyxy, class_id, class_name
316383

317384

385+
def from_qwen_3_vl(
    result: str,
    resolution_wh: tuple[int, int],
    classes: list[str] | None = None,
) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
    """
    Parse and scale bounding boxes from Qwen-3-VL style JSON output.

    This is a thin wrapper around `from_qwen_2_5_vl` that fixes the source
    coordinate space (`input_wh`) to `(1000, 1000)`, so callers only supply the
    target resolution.

    Args:
        result (str): String containing the Qwen-3-VL JSON output.
        resolution_wh (tuple[int, int]): Target resolution `(width, height)` to
            scale bounding boxes to.
        classes (list[str] or None): Optional list of valid classes to filter
            results.

    Returns:
        The `(xyxy, class_id, class_name)` tuple produced by
        `from_qwen_2_5_vl`: bounding boxes in `(x_min, y_min, x_max, y_max)`
        format scaled to `resolution_wh`, class indices, and class names.
    """
    # Coordinate space handed to the Qwen-2.5-VL parser as `input_wh`.
    source_grid_wh = (1000, 1000)

    return from_qwen_2_5_vl(
        result,
        input_wh=source_grid_wh,
        resolution_wh=resolution_wh,
        classes=classes,
    )
413+
414+
318415
def from_deepseek_vl_2(
319416
result: str, resolution_wh: tuple[int, int], classes: list[str] | None = None
320417
) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:

0 commit comments

Comments
 (0)