Skip to content

Commit 96b2f9d

Browse files
committed
Add Qwen2.5 mode to grounding demo
1 parent cce6f95 commit 96b2f9d

File tree

1 file changed

+73
-7
lines changed

1 file changed

+73
-7
lines changed

examples/multimodal_grounding_qwen.py

Lines changed: 73 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ class Model:
5151
current_image: Image or None = None
5252
current_description: str
5353

54-
def __init__(self, model_directory):
54+
def __init__(self, model_directory, bbox_mode: str):
5555
self.model_directory = model_directory
5656
self.config = None
5757
self.vision_model = None
@@ -61,17 +61,22 @@ def __init__(self, model_directory):
6161
self.current_image = None
6262
self.current_emb = None
6363
self.current_description = ""
64+
bbox_funcs = {
65+
"qwen2": self.get_grounding_bb_qwen2,
66+
"qwen25": self.get_grounding_bb_qwen25,
67+
}
68+
self.bbox_func = bbox_funcs[bbox_mode]
6469

6570
def load(self):
6671
"""Load and initialize the things"""
6772
self.config = ExLlamaV2Config(self.model_directory)
68-
self.config.max_seq_len = 16384
73+
self.config.max_seq_len = 8192
6974

7075
self.vision_model = ExLlamaV2VisionTower(self.config)
7176
self.vision_model.load(progress = True)
7277

7378
self.model = ExLlamaV2(self.config)
74-
self.cache = ExLlamaV2Cache(self.model, lazy = True, max_seq_len = 16384)
79+
self.cache = ExLlamaV2Cache(self.model, lazy = True, max_seq_len = 32768)
7580
self.model.load_autosplit(self.cache, progress = True)
7681
self.tokenizer = ExLlamaV2Tokenizer(self.config)
7782

@@ -148,14 +153,21 @@ def inference(self, settext_fn, update_fn):
148153
lastupdate = time.time()
149154
settext_fn(text)
150155
update_fn()
156+
#
157+
# text = \
158+
# """And you may find yourself living in a shotgun shack
159+
# And you may find yourself in another part of the world
160+
# And you may find yourself behind the wheel of a large automobile
161+
# And you may find yourself in a beautiful house, with a beautiful wife
162+
# And you may ask yourself, "Well, how did I get here?\""""
151163

152164
settext_fn(text)
153165
update_fn()
154166
self.current_description = text
155167
print("Image description from model:")
156168
print(text)
157169

158-
def get_grounding_bb(self, start, end) -> tuple:
170+
def get_grounding_bb_qwen2(self, start, end) -> tuple:
159171
"""
160172
Prompt the model again and try to extract the bounding box of the image details indicated by the selected portion
161173
of the description. We do this by repeating the exact same prompt up to and including the selected text, but
@@ -209,6 +221,55 @@ def get_grounding_bb(self, start, end) -> tuple:
209221

210222
return a, b
211223

224+
def get_grounding_bb_qwen25(self, start, end) -> tuple:
    """
    Prompt the model for the bounding box of the selected part of the description (Qwen2.5 variant).

    Qwen2.5 works the same way as the Qwen2 version, except the coordinates are no longer normalized and the
    format is:
    "(x0,y0,x1,y1)"

    The integer coordinates are divided by the dimensions of self.current_image before being returned.
    NOTE(review): this assumes the model reports coordinates in the pixel space of self.current_image - confirm
    against the preprocessing done by the vision tower.

    :param start:
        Index into self.current_description of the first selected character

    :param end:
        Index into self.current_description one past the last selected character

    :return:
        Tuple (a, b) where a = (x0, y0) and b = (x1, y1) scaled by the image size, or (None, None) if the
        selection is empty or the model output could not be parsed as a bounding box
    """

    # Nothing selected
    if start >= end:
        return None, None

    # Including leading space
    if start > 0 and self.current_description[start - 1] == " ":
        start -= 1

    # Repeat the same prompt up to the selection, with grounding tokens added. The opening parenthesis is
    # supplied as part of the prompt so the model only has to complete the coordinates
    prompt = self.get_prompt()
    prompt += self.current_description[:start]
    prompt += "<|object_ref_start|>"
    prompt += self.current_description[start:end]
    prompt += "<|object_ref_end|><|box_start|>("

    bb_string, res = self.generator.generate(
        prompt = prompt,
        add_bos = True,
        max_new_tokens = 28,
        stop_conditions = [self.tokenizer.single_id("<|box_end|>")],
        gen_settings = ExLlamaV2Sampler.Settings.greedy(),
        embeddings = [self.current_emb],
        completion_only = True,
        return_last_results = True,  # debug purposes
    )

    # Only the completion is returned, so restore the parenthesis that was part of the prompt
    bb_string = "(" + bb_string

    print(f"Generation: {bb_string}")
    pprint.pprint(res, indent = 4)

    # BB string is in the format "(x0,y0,x1,y1)" with integer coordinates

    s = self.current_image.size
    try:
        # Drop surrounding whitespace first, otherwise the parentheses are not at the ends of the string and
        # strip("()") leaves them in place
        d = tuple(map(int, bb_string.strip().strip("()").split(",")))
        a = (d[0] / s[0], d[1] / s[1])
        b = (d[2] / s[0], d[3] / s[1])
    except (ValueError, IndexError, ZeroDivisionError):
        # ValueError: non-integer field, IndexError: fewer than four fields, ZeroDivisionError: empty image.
        # Anything else (e.g. KeyboardInterrupt) is deliberately allowed to propagate
        print("No bounding box could be determined")
        a, b = None, None

    return a, b
271+
272+
212273

213274
class GroundingDemo(QMainWindow):
214275

@@ -472,7 +533,7 @@ def on_selection_made(self, pos):
472533

473534
print(f"Selected span: {start}, {end}")
474535
print(f"Selected text: {repr(self.model.current_description[start:end])}")
475-
a, b = self.model.get_grounding_bb(start, end)
536+
a, b = self.model.bbox_func(start, end)
476537
self.image_label.set_bounding_box(a, b)
477538

478539

@@ -481,9 +542,14 @@ def on_selection_made(self, pos):
481542
# https://huggingface.co/turboderp/Qwen2-VL-7B-Instruct-exl2
482543

483544
def main():
484-
model_dir = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
545+
546+
# model_dir = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
547+
# bbox_mode = "qwen2"
548+
model_dir = "/mnt/str/models/qwen2.5-vl-7b-instruct-exl2/6.0bpw"
549+
bbox_mode = "qwen25"
550+
485551
app = QApplication(sys.argv)
486-
model = Model(model_dir)
552+
model = Model(model_dir, bbox_mode)
487553
model.load()
488554
window = GroundingDemo(model, model_dir)
489555
window.show()

0 commit comments

Comments
 (0)