
Commit 2d48ccd
Merge branch 'dev'
2 parents 09c18e9 + 9244003
33 files changed: +958 / -233 lines

examples/chat.py

Lines changed: 23 additions & 1 deletion
@@ -72,6 +72,7 @@

 parser.add_argument("-ngram", "--ngram_decoding", action = "store_true", help = "Use n-gram speculative decoding")

+parser.add_argument("-mli", "--mli", action = "store_true", help = "Enable multi line input")
 parser.add_argument("-pt", "--print_timings", action = "store_true", help = "Output timings/stats after each prompt")
 parser.add_argument("-amnesia", "--amnesia", action = "store_true", help = "Forget context after every response")

@@ -301,7 +302,22 @@ def get_tokenized_context(max_len):
 # Get user prompt

 print()
-up = input(col_user + username + ": " + col_default).strip()
+print(col_user + username + ": " + col_default, end='', flush=True)
+
+# multi-line input support
+if args.mli:
+    content = sys.stdin.read().rstrip()
+else:
+    content = input().strip()
+
+# clear context
+if content == "clear":
+    user_prompts = []
+    responses_ids = []
+    print(col_user + "Context cleared." + col_default, end='', flush=True)
+    continue
+
+up = username + ": " + content
 print()

 # Add to context
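
For reference, a standalone sketch of the input pattern this hunk adds: read all of stdin when multi-line input is enabled, otherwise a single line. The prompt text and variable names here are illustrative, not taken from chat.py.

import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument("-mli", "--mli", action = "store_true", help = "Enable multi line input")
args = parser.parse_args()

print("User: ", end = '', flush = True)
if args.mli:
    # Read until EOF (Ctrl+D on Linux/macOS, Ctrl+Z then Enter on Windows)
    content = sys.stdin.read().rstrip()
else:
    content = input().strip()
print(f"Received {len(content)} characters")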
@@ -337,6 +353,12 @@ def get_tokenized_context(max_len):
 tokens = res["chunk_token_ids"]

 if len(response_text) == 0: chunk = chunk.lstrip()
+
+# trim thinking from context for qwq model
+if args.mode == "qwq" and chunk == "</think>":
+    chunk = "end of thinking"
+    responses_ids[-1] = torch.empty((1, 0), dtype = torch.long)
+
 response_text += chunk
 responses_ids[-1] = torch.cat([responses_ids[-1], tokens], dim = -1)

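The hunk above streams the model's reasoning to the screen but resets the stored response, so the thinking does not accumulate in the rolling context. A standalone sketch of the same idea applied to a finished response string rather than to streamed chunks; the helper name is illustrative:

def strip_thinking(response: str) -> str:
    # Keep only the text after the closing </think> tag before the
    # response is stored in chat history; pass through if no tag is found.
    marker = "</think>"
    idx = response.find(marker)
    if idx == -1:
        return response
    return response[idx + len(marker):].lstrip()

full = "<think>\nReasoning goes here...\n</think>\nThe answer is 42."
print(strip_thinking(full))  # -> "The answer is 42."
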
examples/chat_prompts.py

Lines changed: 93 additions & 2 deletions
@@ -210,6 +210,51 @@ def default_system_prompt(self):
         """You are a helpful coding assistant. Always answer as helpfully as possible."""


+class PromptFormat_qwq(PromptFormat):
+
+    description = "Qwen QwQ format"
+
+    def __init__(self):
+        super().__init__()
+        pass
+
+    def default_system_prompt(self):
+        return \
+            f"""You are a useful coding assistant, who thinks before answering."""
+
+    def first_prompt(self, sysprompt):
+        r = ""
+        if sysprompt:
+            r += \
+                """<|im_start|>system\n""" + \
+                """<|system_prompt|>""" + \
+                """<|im_end|>\n"""
+        r += \
+            """<|im_start|>user\n""" + \
+            """<|user_prompt|><|im_end|>\n""" + \
+            """<|im_start|>assistant\n<think>\n"""
+        return r
+
+    def subs_prompt(self):
+        return \
+            """<|im_end|>\n""" + \
+            """<|im_start|>user\n""" + \
+            """<|user_prompt|><|im_end|>\n""" + \
+            """<|im_start|>assistant\n<think>\n"""
+
+    def stop_conditions(self, tokenizer):
+        return \
+            [tokenizer.eos_token_id,
+             tokenizer.single_id("<|im_end|>"),
+             """<|im_end|>"""]
+
+    def encoding_options(self):
+        return False, False, True
+
+    def print_extra_newline(self):
+        return True
+
+
 class PromptFormat_chatml(PromptFormat):

     description = "ChatML format, as used by e.g. (Mistral)Orca"
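
The <|system_prompt|> and <|user_prompt|> markers emitted by first_prompt() and subs_prompt() are placeholders, presumably substituted by the chat example before tokenizing. A minimal sketch of that substitution under that assumption; format_first_turn is an illustrative helper, not part of the repo:

def format_first_turn(template: str, system_prompt: str, user_prompt: str) -> str:
    # Replace the placeholders with the actual system and user text
    return (template
            .replace("<|system_prompt|>", system_prompt)
            .replace("<|user_prompt|>", user_prompt))

template = (
    "<|im_start|>system\n" + "<|system_prompt|>" + "<|im_end|>\n" +
    "<|im_start|>user\n" + "<|user_prompt|><|im_end|>\n" +
    "<|im_start|>assistant\n<think>\n"
)
print(format_first_turn(
    template,
    "You are a useful coding assistant, who thinks before answering.",
    "Write a FizzBuzz function."
))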
@@ -494,8 +539,8 @@ def subs_prompt(self):
     def stop_conditions(self, tokenizer):
         return \
             [tokenizer.eos_token_id,
-             """</s>""",
-             """<end_of_turn>""",
+             tokenizer.single_id("<end_of_turn>"),
+             tokenizer.single_id("<start_of_turn>"),
            ]

     def encoding_options(self):
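
This change swaps the Gemma stop conditions from literal strings to token IDs obtained via tokenizer.single_id, so generation stops on the exact special token rather than on a decoded substring. A standalone sketch of an ID-based stop check; the numeric IDs below are made up for illustration and would normally come from the tokenizer:

EOS_TOKEN_ID = 2        # placeholder for tokenizer.eos_token_id
END_OF_TURN_ID = 107    # placeholder for tokenizer.single_id("<end_of_turn>")
stop_token_ids = {EOS_TOKEN_ID, END_OF_TURN_ID}

sampled = [3021, 818, 107]  # made-up stream of sampled token IDs
for token_id in sampled:
    if token_id in stop_token_ids:
        print(f"stop token {token_id} reached, ending generation")
        break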
@@ -629,12 +674,57 @@ def print_extra_newline(self):
         return True


+class PromptFormat_glm(PromptFormat):
+    description = "GLM4"
+
+    def __init__(self):
+        super().__init__()
+        pass
+
+    def default_system_prompt(self):
+        return \
+            f"""You are a helpful AI assistant."""
+
+    def first_prompt(self, sysprompt):
+        r = """[gMASK]<sop>"""
+        if sysprompt:
+            r += \
+                """<|system|>\n""" + \
+                """<|system_prompt|>"""
+        r += \
+            """<|user|>\n""" + \
+            """<|user_prompt|>""" + \
+            """<|assistant|>\n"""
+        return r
+
+    def subs_prompt(self):
+        return \
+            """<|user|>\n""" + \
+            """<|user_prompt|>""" + \
+            """<|assistant|>\n"""
+
+    def stop_conditions(self, tokenizer):
+        return \
+            [tokenizer.eos_token_id,
+             tokenizer.single_id("<|user|>"),
+             """<|user|>""",
+            ]
+
+    def encoding_options(self):
+        return True, False, True
+
+    def print_extra_newline(self):
+        return True
+
+
+
 prompt_formats = \
 {
     "raw": PromptFormat_raw,
     "llama": PromptFormat_llama,
     "llama3": PromptFormat_llama3,
     "codellama": PromptFormat_codellama,
+    "qwq": PromptFormat_qwq,
     "chatml": PromptFormat_chatml,
     "tinyllama": PromptFormat_tinyllama,
     "zephyr": PromptFormat_zephyr,
@@ -647,4 +737,5 @@ def print_extra_newline(self):
     "phi3": PromptFormat_phi3,
     "granite": PromptFormat_granite,
     "granite3": PromptFormat_granite3,
+    "glm": PromptFormat_glm
 }
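
With the two new entries in place, a format is looked up from prompt_formats by name, typically from the chat example's mode argument. A minimal sketch of that lookup; the stub classes below stand in for the real ones so the snippet runs on its own:

class PromptFormat_qwq:
    description = "Qwen QwQ format"

class PromptFormat_glm:
    description = "GLM4"

prompt_formats = {
    "qwq": PromptFormat_qwq,
    "glm": PromptFormat_glm,
}

mode = "glm"  # would come from args.mode in chat.py
prompt_format = prompt_formats[mode]()  # instantiate the selected format
print(type(prompt_format).__name__, prompt_format.description)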

examples/multimodal.py

Lines changed: 36 additions & 12 deletions
@@ -26,37 +26,48 @@
 # Pixtral:
 # https://huggingface.co/mistral-community/pixtral-12b/
 # https://huggingface.co/turboderp/pixtral-12b-exl2
+# Mistral-Small 3.1:
+# https://huggingface.co/prince-canuma/Mistral-Small-3.1-24B-Instruct-2503
 # Qwen2-VL:
 # https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
 # https://huggingface.co/turboderp/Qwen2-VL-7B-Instruct-exl2
+# Gemma3:
+# https://huggingface.co/google/gemma-3-27b-it
+# https://huggingface.co/turboderp/gemma-3-27b-it-exl2

 # mode = "pixtral"
-mode = "qwen2"
+mode = "mistral3"
+# mode = "qwen2"
+# mode = "gemma3"

 streaming = True
 greedy = True

 if mode == "pixtral":
     model_directory = "/mnt/str/models/pixtral-12b-exl2/6.0bpw"
 elif mode == "qwen2":
-    model_directory = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
+    model_directory = "/mnt/str/models/qwen2.5-vl-7b-instruct-exl2/5.0bpw"
+elif mode == "gemma3":
+    model_directory = "/mnt/str/models/gemma3-12b-it-exl2/6.0bpw"
+elif mode == "mistral3":
+    model_directory = "/mnt/str/models/mistral-small-3.1-24b-instruct/exl2/4.5bpw"

 images = [
-    {"file": "media/test_image_1.jpg"},
-    {"file": "media/test_image_2.jpg"},
-    # {"url": "https://media.istockphoto.com/id/1212540739/photo/mom-cat-with-kitten.jpg?s=612x612&w=0&k=20&c=RwoWm5-6iY0np7FuKWn8FTSieWxIoO917FF47LfcBKE="},
+    # {"file": "media/test_image_1.jpg"},
+    # {"file": "media/test_image_2.jpg"},
+    {"url": "https://media.istockphoto.com/id/1212540739/photo/mom-cat-with-kitten.jpg?s=612x612&w=0&k=20&c=RwoWm5-6iY0np7FuKWn8FTSieWxIoO917FF47LfcBKE="},
     # {"url": "https://i.dailymail.co.uk/1s/2023/07/10/21/73050285-12283411-Which_way_should_I_go_One_lady_from_the_US_shared_this_incredibl-a-4_1689019614007.jpg"},
     # {"url": "https://images.fineartamerica.com/images-medium-large-5/metal-household-objects-trevor-clifford-photography.jpg"}
 ]

-instruction = "Compare and contrast the two experiments."
-# instruction = "Describe the image."
+# instruction = "Compare and contrast the two experiments."
+instruction = "Describe the image."
 # instruction = "Find the alarm clock."  # Qwen2 seems to support this but unsure of how to prompt correctly

 # Initialize model

 config = ExLlamaV2Config(model_directory)
-config.max_seq_len = 16384  # Pixtral default is 1M
+config.max_seq_len = 8192  # Pixtral default is 1M

 # Load vision model and multimodal projector and initialize preprocessor

@@ -66,8 +77,8 @@
 # Load EXL2 model

 model = ExLlamaV2(config)
-cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
-model.load_autosplit(cache, progress = True)
+cache = ExLlamaV2Cache(model, max_seq_len = 8192, lazy = True)
+model.load_autosplit(progress = True, cache = cache)
 tokenizer = ExLlamaV2Tokenizer(config)

 # Create generator
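
Assembled only from the calls visible in this hunk, a sketch of the load path with the cache length kept in sync with config.max_seq_len; the model path is a placeholder and the exllamav2 API is assumed to behave as used here:

from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Tokenizer

model_directory = "/path/to/model-exl2"   # placeholder path

config = ExLlamaV2Config(model_directory)
config.max_seq_len = 8192                 # keep equal to the cache length below

model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, max_seq_len = 8192, lazy = True)
model.load_autosplit(progress = True, cache = cache)
tokenizer = ExLlamaV2Tokenizer(config)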
@@ -115,13 +126,14 @@ def get_image(file = None, url = None):
 # Image token IDs are assigned sequentially, however, so two ExLlamaV2Embedding objects created from the same
 # source image will not be recognized as the same image for purposes of prompt caching etc.

-if mode == "pixtral":
+if mode in ["pixtral", "mistral3"]:
     prompt = (
         "[INST]" +
         placeholders +
         instruction +
         "[/INST]"
     )
+    stop_conditions = [tokenizer.eos_token_id]

 elif mode == "qwen2":
     prompt = (
@@ -133,6 +145,18 @@ def get_image(file = None, url = None):
         "<|im_end|>\n" +
         "<|im_start|>assistant\n"
     )
+    stop_conditions = [tokenizer.eos_token_id]
+
+elif mode == "gemma3":
+    prompt = (
+        "<start_of_turn>user\nYou are a helpful assistant.\n\n\n\n" +
+        placeholders +
+        "\n" +
+        instruction +
+        "<end_of_turn>\n" +
+        "<start_of_turn>model\n"
+    )
+    stop_conditions = [tokenizer.single_id("<end_of_turn>")]

 # Generate

@@ -149,7 +173,7 @@ def get_image(file = None, url = None):
     input_ids = input_ids,
     max_new_tokens = 500,
     decode_special_tokens = True,
-    stop_conditions = [tokenizer.eos_token_id],
+    stop_conditions = stop_conditions,
     gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
     embeddings = image_embeddings,
 )
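
Taken together, these hunks pair each mode with its own prompt template and stop conditions, and the final hunk passes those stop conditions to the generator. A compact standalone sketch of the same per-mode dispatch; the templates are copied from the diff, while the numeric stop IDs are placeholders for what tokenizer.eos_token_id and tokenizer.single_id("<end_of_turn>") would return:

def build_prompt(mode: str, placeholders: str, instruction: str, eos_id: int, end_of_turn_id: int):
    # Returns (prompt, stop_conditions) for the chosen model family
    if mode in ["pixtral", "mistral3"]:
        return "[INST]" + placeholders + instruction + "[/INST]", [eos_id]
    elif mode == "gemma3":
        prompt = (
            "<start_of_turn>user\nYou are a helpful assistant.\n\n\n\n" +
            placeholders + "\n" + instruction + "<end_of_turn>\n" +
            "<start_of_turn>model\n"
        )
        return prompt, [end_of_turn_id]
    raise ValueError(f"unknown mode: {mode}")

prompt, stop_conditions = build_prompt(
    "gemma3", "<image>", "Describe the image.", eos_id = 1, end_of_turn_id = 107)
print(prompt)
print(stop_conditions)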
