11from __future__ import annotations
22
3+ import ast
34import base64
45import io
56import json
@@ -27,7 +28,8 @@ class LMM(Enum):
2728 Attributes:
2829 PALIGEMMA: Google's PaliGemma vision-language model.
2930 FLORENCE_2: Microsoft's Florence-2 vision-language model.
30- QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
31+ QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
32+ QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
3133 GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
3234 GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
3335 MOONDREAM: The Moondream vision-language model.
@@ -36,6 +38,7 @@ class LMM(Enum):
3638 PALIGEMMA = "paligemma"
3739 FLORENCE_2 = "florence_2"
3840 QWEN_2_5_VL = "qwen_2_5_vl"
41+ QWEN_3_VL = "qwen_3_vl"
3942 DEEPSEEK_VL_2 = "deepseek_vl_2"
4043 GOOGLE_GEMINI_2_0 = "gemini_2_0"
4144 GOOGLE_GEMINI_2_5 = "gemini_2_5"
@@ -69,6 +72,7 @@ class VLM(Enum):
6972 PALIGEMMA: Google's PaliGemma vision-language model.
7073 FLORENCE_2: Microsoft's Florence-2 vision-language model.
7174 QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
75+ QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
7276 GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
7377 GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
7478 MOONDREAM: The Moondream vision-language model.
@@ -77,6 +81,7 @@ class VLM(Enum):
7781 PALIGEMMA = "paligemma"
7882 FLORENCE_2 = "florence_2"
7983 QWEN_2_5_VL = "qwen_2_5_vl"
84+ QWEN_3_VL = "qwen_3_vl"
8085 DEEPSEEK_VL_2 = "deepseek_vl_2"
8186 GOOGLE_GEMINI_2_0 = "gemini_2_0"
8287 GOOGLE_GEMINI_2_5 = "gemini_2_5"
@@ -106,6 +111,7 @@ def from_value(cls, value: VLM | str) -> VLM:
106111 VLM .PALIGEMMA : str ,
107112 VLM .FLORENCE_2 : dict ,
108113 VLM .QWEN_2_5_VL : str ,
114+ VLM .QWEN_3_VL : str ,
109115 VLM .DEEPSEEK_VL_2 : str ,
110116 VLM .GOOGLE_GEMINI_2_0 : str ,
111117 VLM .GOOGLE_GEMINI_2_5 : str ,
@@ -116,6 +122,7 @@ def from_value(cls, value: VLM | str) -> VLM:
116122 VLM .PALIGEMMA : ["resolution_wh" ],
117123 VLM .FLORENCE_2 : ["resolution_wh" ],
118124 VLM .QWEN_2_5_VL : ["input_wh" , "resolution_wh" ],
125+ VLM .QWEN_3_VL : ["resolution_wh" ],
119126 VLM .DEEPSEEK_VL_2 : ["resolution_wh" ],
120127 VLM .GOOGLE_GEMINI_2_0 : ["resolution_wh" ],
121128 VLM .GOOGLE_GEMINI_2_5 : ["resolution_wh" ],
@@ -126,6 +133,7 @@ def from_value(cls, value: VLM | str) -> VLM:
126133 VLM .PALIGEMMA : ["resolution_wh" , "classes" ],
127134 VLM .FLORENCE_2 : ["resolution_wh" ],
128135 VLM .QWEN_2_5_VL : ["input_wh" , "resolution_wh" , "classes" ],
136+ VLM .QWEN_3_VL : ["resolution_wh" , "classes" ],
129137 VLM .DEEPSEEK_VL_2 : ["resolution_wh" , "classes" ],
130138 VLM .GOOGLE_GEMINI_2_0 : ["resolution_wh" , "classes" ],
131139 VLM .GOOGLE_GEMINI_2_5 : ["resolution_wh" , "classes" ],
@@ -235,14 +243,59 @@ def from_paligemma(
235243 return xyxy , class_id , class_name
236244
237245
def recover_truncated_qwen_2_5_vl_response(text: str) -> Any | None:
    """
    Attempt to recover and parse a truncated JSON list from Qwen-2.5-VL output.

    The text is cut from its first `[` up to and including the last `}` that
    follows it, a closing `]` is appended, and the result is parsed as JSON.
    Anything after the last closing brace (for example a partially generated
    object or a dangling comma) is therefore discarded.

    Args:
        text (str): Raw text containing a JSON list that may be truncated or
            incomplete.

    Returns:
        Parsed Python object (usually a list) if recovery and parsing succeed;
        otherwise `None`. `None` is also returned when `text` is not a string,
        contains no `[`, or has no `}` after the first `[`.
    """
    # Non-string input cannot be searched or parsed; report it as unrecoverable
    # instead of raising, so callers can fall through to their next strategy.
    if not isinstance(text, str):
        return None

    list_start = text.find("[")
    if list_start == -1:
        return None

    # Only a closing brace located after the opening bracket can terminate a
    # complete object inside the list. `rfind` returns -1 when there is none,
    # which is also smaller than `list_start`.
    last_object_end = text.rfind("}")
    if last_object_end < list_start:
        return None

    repaired = text[list_start : last_object_end + 1] + "]"

    try:
        return json.loads(repaired)
    except (ValueError, RecursionError):
        # `json.JSONDecodeError` is a `ValueError`; pathological nesting raises
        # `RecursionError`. Both mean the snippet could not be repaired.
        return None
289+
290+
238291def from_qwen_2_5_vl (
239292 result : str ,
240293 input_wh : tuple [int , int ],
241294 resolution_wh : tuple [int , int ],
242295 classes : list [str ] | None = None ,
243296) -> tuple [np .ndarray , np .ndarray | None , np .ndarray ]:
244297 """
245- Parse and scale bounding boxes from Qwen-2.5-VL style JSON output.
298+ Parse and rescale bounding boxes and class labels from Qwen-2.5-VL JSON output.
246299
247300 The JSON is expected to be enclosed in triple backticks with the format:
248301 ```json
@@ -253,38 +306,52 @@ def from_qwen_2_5_vl(
253306 ```
254307
255308 Args:
256- result: String containing the JSON snippet enclosed by triple backticks.
257- input_wh: (input_width, input_height) describing the original bounding box
258- scale.
259- resolution_wh: (output_width, output_height) to which we rescale the boxes.
260- classes: Optional list of valid class names. If provided, returned boxes/labels
261- are filtered to only those classes found here.
309+ result (str): String containing Qwen-2.5-VL JSON bounding box and label data.
310+ input_wh (tuple[int, int]): Width and height of the coordinate space where boxes
311+ are normalized.
312+ resolution_wh (tuple[int, int]): Target width and height to scale bounding
313+ boxes.
314+ classes (list[str] or None): Optional list of valid class names to filter
315+ results. If provided, only boxes with labels in this list are returned.
262316
263317 Returns:
264- xyxy (np.ndarray): An array of shape `(n, 4)` containing
265- the bounding boxes coordinates in format `[x1, y1, x2, y2]`
266- class_id (Optional[np.ndarray]): An array of shape `(n,)` containing
267- the class indices for each bounding box (or None if `classes` is not
268- provided)
269- class_name (np.ndarray): An array of shape `(n,)` containing
270- the class labels for each bounding box
318+ xyxy (np.ndarray): Array of shape `(N, 4)` with rescaled bounding boxes in
319+ `(x_min, y_min, x_max, y_max)` format.
320+ class_id (np.ndarray or None): Array of shape `(N,)` with indices of classes,
321+ or `None` if no filtering applied.
322+ class_name (np.ndarray): Array of shape `(N,)` with class names as strings.
271323 """
272324
273325 in_w , in_h = validate_resolution (input_wh )
274326 out_w , out_h = validate_resolution (resolution_wh )
275327
276- pattern = re .compile (r"```json\s*(.*?)\s*```" , re .DOTALL )
277-
278- match = pattern .search (result )
279- if not match :
280- return np .empty ((0 , 4 )), None , np .empty ((0 ,), dtype = str )
328+ text = result .strip ()
329+ text = re .sub (r"^```(json)?" , "" , text , flags = re .IGNORECASE ).strip ()
330+ text = re .sub (r"```$" , "" , text ).strip ()
281331
282- json_snippet = match .group (1 )
332+ start = text .find ("[" )
333+ end = text .rfind ("]" )
334+ if start != - 1 and end != - 1 and end > start :
335+ text = text [start : end + 1 ].strip ()
283336
284337 try :
285- data = json .loads (json_snippet )
338+ data = json .loads (text )
286339 except json .JSONDecodeError :
287- return np .empty ((0 , 4 )), None , np .empty ((0 ,), dtype = str )
340+ repaired = recover_truncated_qwen_2_5_vl_response (text )
341+ if repaired is not None :
342+ data = repaired
343+ else :
344+ try :
345+ data = ast .literal_eval (text )
346+ except (ValueError , SyntaxError , TypeError ):
347+ return (
348+ np .empty ((0 , 4 )),
349+ np .empty ((0 ,), dtype = int ),
350+ np .empty ((0 ,), dtype = str ),
351+ )
352+
353+ if not isinstance (data , list ):
354+ return (np .empty ((0 , 4 )), np .empty ((0 ,), dtype = int ), np .empty ((0 ,), dtype = str ))
288355
289356 boxes_list = []
290357 labels_list = []
@@ -296,7 +363,7 @@ def from_qwen_2_5_vl(
296363 labels_list .append (item ["label" ])
297364
298365 if not boxes_list :
299- return np .empty ((0 , 4 )), None , np .empty ((0 ,), dtype = str )
366+ return ( np .empty ((0 , 4 )), np . empty (( 0 ,), dtype = int ), np .empty ((0 ,), dtype = str ) )
300367
301368 xyxy = np .array (boxes_list , dtype = float )
302369 class_name = np .array (labels_list , dtype = str )
@@ -315,6 +382,36 @@ def from_qwen_2_5_vl(
315382 return xyxy , class_id , class_name
316383
317384
def from_qwen_3_vl(
    result: str,
    resolution_wh: tuple[int, int],
    classes: list[str] | None = None,
) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
    """
    Parse Qwen-3-VL style JSON output into boxes, class ids and class names.

    This is a thin wrapper around `from_qwen_2_5_vl`: the model output is
    forwarded unchanged, with the source coordinate space fixed to
    `(1000, 1000)` so that boxes are rescaled from that space to
    `resolution_wh`.

    Args:
        result (str): String containing the Qwen-3-VL JSON output.
        resolution_wh (tuple[int, int]): Target resolution `(width, height)`
            that bounding boxes are scaled to.
        classes (list[str] or None): Optional list of valid class names,
            passed through to `from_qwen_2_5_vl`.

    Returns:
        The `(xyxy, class_id, class_name)` tuple produced by
        `from_qwen_2_5_vl` for the same `result`, `resolution_wh` and
        `classes`, using `input_wh=(1000, 1000)`.
    """
    # Width and height of the coordinate space handed to the Qwen-2.5-VL parser.
    source_wh = (1000, 1000)
    parsed = from_qwen_2_5_vl(
        result=result,
        input_wh=source_wh,
        resolution_wh=resolution_wh,
        classes=classes,
    )
    return parsed
413+
414+
318415def from_deepseek_vl_2 (
319416 result : str , resolution_wh : tuple [int , int ], classes : list [str ] | None = None
320417) -> tuple [np .ndarray , np .ndarray | None , np .ndarray ]:
0 commit comments