
Commit 2d48ccd
Merge branch 'dev'
2 parents 09c18e9 + 9244003
33 files changed: +958 / -233 lines

examples/chat.py

Lines changed: 23 additions & 1 deletion
@@ -72,6 +72,7 @@

 parser.add_argument("-ngram", "--ngram_decoding", action = "store_true", help = "Use n-gram speculative decoding")

+parser.add_argument("-mli", "--mli", action = "store_true", help = "Enable multi line input")
 parser.add_argument("-pt", "--print_timings", action = "store_true", help = "Output timings/stats after each prompt")
 parser.add_argument("-amnesia", "--amnesia", action = "store_true", help = "Forget context after every response")

@@ -301,7 +302,22 @@ def get_tokenized_context(max_len):
 # Get user prompt

 print()
-up = input(col_user + username + ": " + col_default).strip()
+print(col_user + username + ": " + col_default, end='', flush=True)
+
+# multi-line input support
+if args.mli:
+    content = sys.stdin.read().rstrip()
+else:
+    content = input().strip()
+
+# clear context
+if content == "clear":
+    user_prompts = []
+    responses_ids = []
+    print(col_user + "Context cleared." + col_default, end='', flush=True)
+    continue
+
+up = username + ": " + content
 print()

 # Add to context
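
For reference, a standalone sketch of the input pattern this hunk adds: read all of stdin when multi-line input is enabled, otherwise a single line. The prompt text and variable names here are illustrative, not taken from chat.py.

import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument("-mli", "--mli", action = "store_true", help = "Enable multi line input")
args = parser.parse_args()

print("User: ", end = '', flush = True)
if args.mli:
    # Read until EOF (Ctrl+D on Linux/macOS, Ctrl+Z then Enter on Windows)
    content = sys.stdin.read().rstrip()
else:
    content = input().strip()
print(f"Received {len(content)} characters")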
@@ -337,6 +353,12 @@ def get_tokenized_context(max_len):
 tokens = res["chunk_token_ids"]

 if len(response_text) == 0: chunk = chunk.lstrip()
+
+# trim thinking from context for qwq model
+if args.mode == "qwq" and chunk == "</think>":
+    chunk = "end of thinking"
+    responses_ids[-1] = torch.empty((1, 0), dtype = torch.long)
+
 response_text += chunk
 responses_ids[-1] = torch.cat([responses_ids[-1], tokens], dim = -1)

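The hunk above streams the model's reasoning to the screen but resets the stored response, so the thinking does not accumulate in the rolling context. A standalone sketch of the same idea applied to a finished response string rather than to streamed chunks; the helper name is illustrative:

def strip_thinking(response: str) -> str:
    # Keep only the text after the closing </think> tag before the
    # response is stored in chat history; pass through if no tag is found.
    marker = "</think>"
    idx = response.find(marker)
    if idx == -1:
        return response
    return response[idx + len(marker):].lstrip()

full = "<think>\nReasoning goes here...\n</think>\nThe answer is 42."
print(strip_thinking(full))  # -> "The answer is 42."
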
examples/chat_prompts.py

Lines changed: 93 additions & 2 deletions
@@ -210,6 +210,51 @@ def default_system_prompt(self):
         """You are a helpful coding assistant. Always answer as helpfully as possible."""


+class PromptFormat_qwq(PromptFormat):
+
+    description = "Qwen QwQ format"
+
+    def __init__(self):
+        super().__init__()
+        pass
+
+    def default_system_prompt(self):
+        return \
+            f"""You are a useful coding assistant, who thinks before answering."""
+
+    def first_prompt(self, sysprompt):
+        r = ""
+        if sysprompt:
+            r += \
+                """<|im_start|>system\n""" + \
+                """<|system_prompt|>""" + \
+                """<|im_end|>\n"""
+        r += \
+            """<|im_start|>user\n""" + \
+            """<|user_prompt|><|im_end|>\n""" + \
+            """<|im_start|>assistant\n<think>\n"""
+        return r
+
+    def subs_prompt(self):
+        return \
+            """<|im_end|>\n""" + \
+            """<|im_start|>user\n""" + \
+            """<|user_prompt|><|im_end|>\n""" + \
+            """<|im_start|>assistant\n<think>\n"""
+
+    def stop_conditions(self, tokenizer):
+        return \
+            [tokenizer.eos_token_id,
+             tokenizer.single_id("<|im_end|>"),
+             """<|im_end|>"""]
+
+    def encoding_options(self):
+        return False, False, True
+
+    def print_extra_newline(self):
+        return True
+
+
 class PromptFormat_chatml(PromptFormat):

     description = "ChatML format, as used by e.g. (Mistral)Orca"
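
The <|system_prompt|> and <|user_prompt|> markers emitted by first_prompt() and subs_prompt() are placeholders, presumably substituted by the chat example before tokenizing. A minimal sketch of that substitution under that assumption; format_first_turn is an illustrative helper, not part of the repo:

def format_first_turn(template: str, system_prompt: str, user_prompt: str) -> str:
    # Replace the placeholders with the actual system and user text
    return (template
            .replace("<|system_prompt|>", system_prompt)
            .replace("<|user_prompt|>", user_prompt))

template = (
    "<|im_start|>system\n" + "<|system_prompt|>" + "<|im_end|>\n" +
    "<|im_start|>user\n" + "<|user_prompt|><|im_end|>\n" +
    "<|im_start|>assistant\n<think>\n"
)
print(format_first_turn(
    template,
    "You are a useful coding assistant, who thinks before answering.",
    "Write a FizzBuzz function."
))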
@@ -494,8 +539,8 @@ def subs_prompt(self):
     def stop_conditions(self, tokenizer):
         return \
             [tokenizer.eos_token_id,
-             """</s>""",
-             """<end_of_turn>""",
+             tokenizer.single_id("<end_of_turn>"),
+             tokenizer.single_id("<start_of_turn>"),
            ]

     def encoding_options(self):
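
This change swaps the Gemma stop conditions from literal strings to token IDs obtained via tokenizer.single_id, so generation stops on the exact special token rather than on a decoded substring. A standalone sketch of an ID-based stop check; the numeric IDs below are made up for illustration and would normally come from the tokenizer:

EOS_TOKEN_ID = 2        # placeholder for tokenizer.eos_token_id
END_OF_TURN_ID = 107    # placeholder for tokenizer.single_id("<end_of_turn>")
stop_token_ids = {EOS_TOKEN_ID, END_OF_TURN_ID}

sampled = [3021, 818, 107]  # made-up stream of sampled token IDs
for token_id in sampled:
    if token_id in stop_token_ids:
        print(f"stop token {token_id} reached, ending generation")
        break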
@@ -629,12 +674,57 @@ def print_extra_newline(self):
         return True


+class PromptFormat_glm(PromptFormat):
+    description = "GLM4"
+
+    def __init__(self):
+        super().__init__()
+        pass
+
+    def default_system_prompt(self):
+        return \
+            f"""You are a helpful AI assistant."""
+
+    def first_prompt(self, sysprompt):
+        r = """[gMASK]<sop>"""
+        if sysprompt:
+            r += \
+                """<|system|>\n""" + \
+                """<|system_prompt|>"""
+        r += \
+            """<|user|>\n""" + \
+            """<|user_prompt|>""" + \
+            """<|assistant|>\n"""
+        return r
+
+    def subs_prompt(self):
+        return \
+            """<|user|>\n""" + \
+            """<|user_prompt|>""" + \
+            """<|assistant|>\n"""
+
+    def stop_conditions(self, tokenizer):
+        return \
+            [tokenizer.eos_token_id,
+             tokenizer.single_id("<|user|>"),
+             """<|user|>""",
+            ]
+
+    def encoding_options(self):
+        return True, False, True
+
+    def print_extra_newline(self):
+        return True
+
+
+
 prompt_formats = \
 {
     "raw": PromptFormat_raw,
     "llama": PromptFormat_llama,
     "llama3": PromptFormat_llama3,
     "codellama": PromptFormat_codellama,
+    "qwq": PromptFormat_qwq,
     "chatml": PromptFormat_chatml,
     "tinyllama": PromptFormat_tinyllama,
     "zephyr": PromptFormat_zephyr,
@@ -647,4 +737,5 @@ def print_extra_newline(self):
     "phi3": PromptFormat_phi3,
     "granite": PromptFormat_granite,
     "granite3": PromptFormat_granite3,
+    "glm": PromptFormat_glm
 }
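
With the two new entries in place, a format is looked up from prompt_formats by name, typically from the chat example's mode argument. A minimal sketch of that lookup; the stub classes below stand in for the real ones so the snippet runs on its own:

class PromptFormat_qwq:
    description = "Qwen QwQ format"

class PromptFormat_glm:
    description = "GLM4"

prompt_formats = {
    "qwq": PromptFormat_qwq,
    "glm": PromptFormat_glm,
}

mode = "glm"  # would come from args.mode in chat.py
prompt_format = prompt_formats[mode]()  # instantiate the selected format
print(type(prompt_format).__name__, prompt_format.description)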

examples/multimodal.py

Lines changed: 36 additions & 12 deletions
@@ -26,37 +26,48 @@
 # Pixtral:
 # https://huggingface.co/mistral-community/pixtral-12b/
 # https://huggingface.co/turboderp/pixtral-12b-exl2
+# Mistral-Small 3.1:
+# https://huggingface.co/prince-canuma/Mistral-Small-3.1-24B-Instruct-2503
 # Qwen2-VL:
 # https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
 # https://huggingface.co/turboderp/Qwen2-VL-7B-Instruct-exl2
+# Gemma3:
+# https://huggingface.co/google/gemma-3-27b-it
+# https://huggingface.co/turboderp/gemma-3-27b-it-exl2

 # mode = "pixtral"
-mode = "qwen2"
+mode = "mistral3"
+# mode = "qwen2"
+# mode = "gemma3"

 streaming = True
 greedy = True

 if mode == "pixtral":
     model_directory = "/mnt/str/models/pixtral-12b-exl2/6.0bpw"
 elif mode == "qwen2":
-    model_directory = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
+    model_directory = "/mnt/str/models/qwen2.5-vl-7b-instruct-exl2/5.0bpw"
+elif mode == "gemma3":
+    model_directory = "/mnt/str/models/gemma3-12b-it-exl2/6.0bpw"
+elif mode == "mistral3":
+    model_directory = "/mnt/str/models/mistral-small-3.1-24b-instruct/exl2/4.5bpw"

 images = [
-    {"file": "media/test_image_1.jpg"},
-    {"file": "media/test_image_2.jpg"},
-    # {"url": "https://media.istockphoto.com/id/1212540739/photo/mom-cat-with-kitten.jpg?s=612x612&w=0&k=20&c=RwoWm5-6iY0np7FuKWn8FTSieWxIoO917FF47LfcBKE="},
+    # {"file": "media/test_image_1.jpg"},
+    # {"file": "media/test_image_2.jpg"},
+    {"url": "https://media.istockphoto.com/id/1212540739/photo/mom-cat-with-kitten.jpg?s=612x612&w=0&k=20&c=RwoWm5-6iY0np7FuKWn8FTSieWxIoO917FF47LfcBKE="},
     # {"url": "https://i.dailymail.co.uk/1s/2023/07/10/21/73050285-12283411-Which_way_should_I_go_One_lady_from_the_US_shared_this_incredibl-a-4_1689019614007.jpg"},
     # {"url": "https://images.fineartamerica.com/images-medium-large-5/metal-household-objects-trevor-clifford-photography.jpg"}
 ]

-instruction = "Compare and contrast the two experiments."
-# instruction = "Describe the image."
+# instruction = "Compare and contrast the two experiments."
+instruction = "Describe the image."
 # instruction = "Find the alarm clock."  # Qwen2 seems to support this but unsure of how to prompt correctly

 # Initialize model

 config = ExLlamaV2Config(model_directory)
-config.max_seq_len = 16384  # Pixtral default is 1M
+config.max_seq_len = 8192  # Pixtral default is 1M

 # Load vision model and multimodal projector and initialize preprocessor

@@ -66,8 +77,8 @@
 # Load EXL2 model

 model = ExLlamaV2(config)
-cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
-model.load_autosplit(cache, progress = True)
+cache = ExLlamaV2Cache(model, max_seq_len = 8192, lazy = True)
+model.load_autosplit(progress = True, cache = cache)
 tokenizer = ExLlamaV2Tokenizer(config)

 # Create generator
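
Assembled only from the calls visible in this hunk, a sketch of the load path with the cache length kept in sync with config.max_seq_len; the model path is a placeholder and the exllamav2 API is assumed to behave as used here:

from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Tokenizer

model_directory = "/path/to/model-exl2"   # placeholder path

config = ExLlamaV2Config(model_directory)
config.max_seq_len = 8192                 # keep equal to the cache length below

model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, max_seq_len = 8192, lazy = True)
model.load_autosplit(progress = True, cache = cache)
tokenizer = ExLlamaV2Tokenizer(config)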
@@ -115,13 +126,14 @@ def get_image(file = None, url = None):
 # Image token IDs are assigned sequentially, however, so two ExLlamaV2Embedding objects created from the same
 # source image will not be recognized as the same image for purposes of prompt caching etc.

-if mode == "pixtral":
+if mode in ["pixtral", "mistral3"]:
     prompt = (
         "[INST]" +
         placeholders +
         instruction +
         "[/INST]"
     )
+    stop_conditions = [tokenizer.eos_token_id]

 elif mode == "qwen2":
     prompt = (
@@ -133,6 +145,18 @@ def get_image(file = None, url = None):
         "<|im_end|>\n" +
         "<|im_start|>assistant\n"
     )
+    stop_conditions = [tokenizer.eos_token_id]
+
+elif mode == "gemma3":
+    prompt = (
+        "<start_of_turn>user\nYou are a helpful assistant.\n\n\n\n" +
+        placeholders +
+        "\n" +
+        instruction +
+        "<end_of_turn>\n" +
+        "<start_of_turn>model\n"
+    )
+    stop_conditions = [tokenizer.single_id("<end_of_turn>")]

 # Generate

@@ -149,7 +173,7 @@ def get_image(file = None, url = None):
     input_ids = input_ids,
     max_new_tokens = 500,
     decode_special_tokens = True,
-    stop_conditions = [tokenizer.eos_token_id],
+    stop_conditions = stop_conditions,
     gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
     embeddings = image_embeddings,
 )
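
Taken together, these hunks pair each mode with its own prompt template and stop conditions, and the final hunk passes those stop conditions to the generator. A compact standalone sketch of the same per-mode dispatch; the templates are copied from the diff, while the numeric stop IDs are placeholders for what tokenizer.eos_token_id and tokenizer.single_id("<end_of_turn>") would return:

def build_prompt(mode: str, placeholders: str, instruction: str, eos_id: int, end_of_turn_id: int):
    # Returns (prompt, stop_conditions) for the chosen model family
    if mode in ["pixtral", "mistral3"]:
        return "[INST]" + placeholders + instruction + "[/INST]", [eos_id]
    elif mode == "gemma3":
        prompt = (
            "<start_of_turn>user\nYou are a helpful assistant.\n\n\n\n" +
            placeholders + "\n" + instruction + "<end_of_turn>\n" +
            "<start_of_turn>model\n"
        )
        return prompt, [end_of_turn_id]
    raise ValueError(f"unknown mode: {mode}")

prompt, stop_conditions = build_prompt(
    "gemma3", "<image>", "Describe the image.", eos_id = 1, end_of_turn_id = 107)
print(prompt)
print(stop_conditions)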
