Add Qwen2.5 mode to grounding demo

turboderp · turboderp · commit 96b2f9df7711 · 2025-01-29T22:41:36.000+01:00
diff --git a/examples/multimodal_grounding_qwen.py b/examples/multimodal_grounding_qwen.py
@@ -51,7 +51,7 @@ class Model:
     current_image: Image or None = None
     current_description: str
 
-    def __init__(self, model_directory):
+    def __init__(self, model_directory, bbox_mode: str):
         self.model_directory = model_directory
         self.config = None
         self.vision_model = None
@@ -61,17 +61,22 @@ def __init__(self, model_directory):
         self.current_image = None
         self.current_emb = None
         self.current_description = ""
+        bbox_funcs = {
+            "qwen2": self.get_grounding_bb_qwen2,
+            "qwen25": self.get_grounding_bb_qwen25,
+        }
+        self.bbox_func = bbox_funcs[bbox_mode]
 
     def load(self):
         """Load and initialize the things"""
         self.config = ExLlamaV2Config(self.model_directory)
-        self.config.max_seq_len = 16384
+        self.config.max_seq_len = 8192
 
         self.vision_model = ExLlamaV2VisionTower(self.config)
         self.vision_model.load(progress = True)
 
         self.model = ExLlamaV2(self.config)
-        self.cache = ExLlamaV2Cache(self.model, lazy = True, max_seq_len = 16384)
+        self.cache = ExLlamaV2Cache(self.model, lazy = True, max_seq_len = 32768)
         self.model.load_autosplit(self.cache, progress = True)
         self.tokenizer = ExLlamaV2Tokenizer(self.config)
 
@@ -148,14 +153,21 @@ def inference(self, settext_fn, update_fn):
                 lastupdate = time.time()
                 settext_fn(text)
                 update_fn()
+#
+#         text = \
+# """And you may find yourself living in a shotgun shack
+# And you may find yourself in another part of the world
+# And you may find yourself behind the wheel of a large automobile
+# And you may find yourself in a beautiful house, with a beautiful wife
+# And you may ask yourself, "Well, how did I get here?\""""
 
         settext_fn(text)
         update_fn()
         self.current_description = text
         print("Image description from model:")
         print(text)
 
-    def get_grounding_bb(self, start, end) -> tuple:
+    def get_grounding_bb_qwen2(self, start, end) -> tuple:
         """
         Prompt the model again and try to extraxt the bounding box of the image details indicated by selected portion
         of the description. We do this by repeating the exact same prompt up to and including the selected text, but
@@ -209,6 +221,55 @@ def get_grounding_bb(self, start, end) -> tuple:
 
         return a, b
 
+    def get_grounding_bb_qwen25(self, start, end) -> tuple:
+        """
+        Qwen2.5 works the same way, except the coordinates are no longer normalized and the format is:
+        "(x0,y0,x1,y1)"
+        """
+
+        if start >= end:
+            return None, None
+
+        # Including leading space
+        if start > 0 and self.current_description[start - 1] == " ":
+            start -= 1
+
+        # Repeat the same prompt up to the selection, with grounding tokens added
+        prompt = self.get_prompt()
+        prompt += self.current_description[:start]
+        prompt += "<|object_ref_start|>"
+        prompt += self.current_description[start:end]
+        prompt += "<|object_ref_end|><|box_start|>("
+
+        bb_string, res = self.generator.generate(
+            prompt = prompt,
+            add_bos = True,
+            max_new_tokens = 28,
+            stop_conditions = [self.tokenizer.single_id("<|box_end|>")],
+            gen_settings = ExLlamaV2Sampler.Settings.greedy(),
+            embeddings = [self.current_emb],
+            completion_only = True,
+            return_last_results = True,  # debug purposes
+        )
+        bb_string = "(" + bb_string
+
+        print(f"Generation: {bb_string}")
+        pprint.pprint(res, indent = 4)
+
+        # BB string is in the format "(x0,y0,x1,y1)" with integer coordinates
+
+        s = self.current_image.size
+        try:
+            d = tuple(map(int, bb_string.strip("()").split(",")))
+            a = (d[0] / s[0], d[1] / s[1])
+            b = (d[2] / s[0], d[3] / s[1])
+        except:
+            print("No bounding box could be determined")
+            a, b = None, None
+
+        return a, b
+
+
 
 class GroundingDemo(QMainWindow):
 
@@ -472,7 +533,7 @@ def on_selection_made(self, pos):
 
         print(f"Selected span: {start}, {end}")
         print(f"Selected text: {repr(self.model.current_description[start:end])}")
-        a, b = self.model.get_grounding_bb(start, end)
+        a, b = self.model.bbox_func(start, end)
         self.image_label.set_bounding_box(a, b)
 
 
@@ -481,9 +542,14 @@ def on_selection_made(self, pos):
 #   https://huggingface.co/turboderp/Qwen2-VL-7B-Instruct-exl2
 
 def main():
-    model_dir = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
+
+    # model_dir = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
+    # bbox_mode = "qwen25"
+    model_dir = "/mnt/str/models/qwen2.5-vl-7b-instruct-exl2/6.0bpw"
+    bbox_mode = "qwen25"
+
     app = QApplication(sys.argv)
-    model = Model(model_dir)
+    model = Model(model_dir, bbox_mode)
     model.load()
     window = GroundingDemo(model, model_dir)
     window.show()