@@ -51,7 +51,7 @@ class Model:
51
51
current_image : Image or None = None
52
52
current_description : str
53
53
54
- def __init__ (self , model_directory ):
54
+ def __init__ (self , model_directory , bbox_mode : str ):
55
55
self .model_directory = model_directory
56
56
self .config = None
57
57
self .vision_model = None
@@ -61,17 +61,22 @@ def __init__(self, model_directory):
61
61
self .current_image = None
62
62
self .current_emb = None
63
63
self .current_description = ""
64
+ bbox_funcs = {
65
+ "qwen2" : self .get_grounding_bb_qwen2 ,
66
+ "qwen25" : self .get_grounding_bb_qwen25 ,
67
+ }
68
+ self .bbox_func = bbox_funcs [bbox_mode ]
64
69
65
70
def load (self ):
66
71
"""Load and initialize the things"""
67
72
self .config = ExLlamaV2Config (self .model_directory )
68
- self .config .max_seq_len = 16384
73
+ self .config .max_seq_len = 8192
69
74
70
75
self .vision_model = ExLlamaV2VisionTower (self .config )
71
76
self .vision_model .load (progress = True )
72
77
73
78
self .model = ExLlamaV2 (self .config )
74
- self .cache = ExLlamaV2Cache (self .model , lazy = True , max_seq_len = 16384 )
79
+ self .cache = ExLlamaV2Cache (self .model , lazy = True , max_seq_len = 32768 )
75
80
self .model .load_autosplit (self .cache , progress = True )
76
81
self .tokenizer = ExLlamaV2Tokenizer (self .config )
77
82
@@ -148,14 +153,21 @@ def inference(self, settext_fn, update_fn):
148
153
lastupdate = time .time ()
149
154
settext_fn (text )
150
155
update_fn ()
156
+ #
157
+ # text = \
158
+ # """And you may find yourself living in a shotgun shack
159
+ # And you may find yourself in another part of the world
160
+ # And you may find yourself behind the wheel of a large automobile
161
+ # And you may find yourself in a beautiful house, with a beautiful wife
162
+ # And you may ask yourself, "Well, how did I get here?\""""
151
163
152
164
settext_fn (text )
153
165
update_fn ()
154
166
self .current_description = text
155
167
print ("Image description from model:" )
156
168
print (text )
157
169
158
- def get_grounding_bb (self , start , end ) -> tuple :
170
+ def get_grounding_bb_qwen2 (self , start , end ) -> tuple :
159
171
"""
160
172
Prompt the model again and try to extract the bounding box of the image details indicated by the selected portion
161
173
of the description. We do this by repeating the exact same prompt up to and including the selected text, but
@@ -209,6 +221,55 @@ def get_grounding_bb(self, start, end) -> tuple:
209
221
210
222
return a , b
211
223
224
def get_grounding_bb_qwen25(self, start, end) -> tuple:
    """
    Ask the model for the bounding box of the selected span of the description (Qwen2.5 format).

    Qwen2.5 works the same way as Qwen2, except the coordinates are no longer normalized and the format is:
    "(x0,y0,x1,y1)"

    The original prompt is repeated up to and including the selected text, with the selection wrapped in
    object-ref tokens and an opening box token appended, so the model completes the coordinates itself.

    :param start:
        Index into self.current_description where the selection begins

    :param end:
        Index into self.current_description where the selection ends (exclusive)

    :return:
        Tuple (a, b) where a = (x0, y0) and b = (x1, y1), each divided by the corresponding dimension of
        self.current_image.size. Returns (None, None) if the selection is empty or the model output could
        not be parsed as a bounding box.
    """

    # Empty or inverted selection, nothing to ground
    if start >= end:
        return None, None

    # Including leading space
    if start > 0 and self.current_description[start - 1] == " ":
        start -= 1

    # Repeat the same prompt up to the selection, with grounding tokens added. The trailing "(" is part of the
    # prompt, so the completion starts at the first coordinate
    prompt = (
        self.get_prompt()
        + self.current_description[:start]
        + "<|object_ref_start|>"
        + self.current_description[start:end]
        + "<|object_ref_end|><|box_start|>("
    )

    bb_string, res = self.generator.generate(
        prompt = prompt,
        add_bos = True,
        max_new_tokens = 28,
        stop_conditions = [self.tokenizer.single_id("<|box_end|>")],
        gen_settings = ExLlamaV2Sampler.Settings.greedy(),
        embeddings = [self.current_emb],
        completion_only = True,
        return_last_results = True,  # debug purposes
    )

    # Restore the opening parenthesis that was supplied as part of the prompt
    bb_string = "(" + bb_string

    print(f"Generation: {bb_string}")
    pprint.pprint(res, indent = 4)

    # BB string is in the format "(x0,y0,x1,y1)" with integer coordinates
    # NOTE(review): assumes the coordinates are in the pixel space of self.current_image - TODO confirm

    size = self.current_image.size
    try:
        coords = tuple(map(int, bb_string.strip("()").split(",")))
        a = (coords[0] / size[0], coords[1] / size[1])
        b = (coords[2] / size[0], coords[3] / size[1])
    except (ValueError, IndexError, ZeroDivisionError):
        # ValueError: non-integer field, IndexError: fewer than four fields, ZeroDivisionError: degenerate
        # image size. Anything else (including KeyboardInterrupt) is deliberately not swallowed
        print("No bounding box could be determined")
        a, b = None, None

    return a, b
271
+
272
+
212
273
213
274
class GroundingDemo (QMainWindow ):
214
275
@@ -472,7 +533,7 @@ def on_selection_made(self, pos):
472
533
473
534
print (f"Selected span: { start } , { end } " )
474
535
print (f"Selected text: { repr (self .model .current_description [start :end ])} " )
475
- a , b = self .model .get_grounding_bb (start , end )
536
+ a , b = self .model .bbox_func (start , end )
476
537
self .image_label .set_bounding_box (a , b )
477
538
478
539
@@ -481,9 +542,14 @@ def on_selection_made(self, pos):
481
542
# https://huggingface.co/turboderp/Qwen2-VL-7B-Instruct-exl2
482
543
483
544
def main ():
484
- model_dir = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
545
+
546
+ # model_dir = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
547
+ # bbox_mode = "qwen2"
548
+ model_dir = "/mnt/str/models/qwen2.5-vl-7b-instruct-exl2/6.0bpw"
549
+ bbox_mode = "qwen25"
550
+
485
551
app = QApplication (sys .argv )
486
- model = Model (model_dir )
552
+ model = Model (model_dir , bbox_mode )
487
553
model .load ()
488
554
window = GroundingDemo (model , model_dir )
489
555
window .show ()
0 commit comments