
Commit 90ad7f4

Small fixes and updated recommended models
* Mistral Nemo is crazily good
1 parent b011eab commit 90ad7f4

5 files changed: +42 -29 lines


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -37,5 +37,6 @@
 *.ini
 *.7z
 *.zip
+*.rar
 /base/common копия.cpp
 /base/ggml-opencl-bu.cpp

README.md

Lines changed: 4 additions & 3 deletions
@@ -62,10 +62,11 @@ Tested on Windows only for now. AVX2 releases only for now, older releases were
 * [llama.cpp](https://github.com/ggerganov/llama.cpp)
 * [imgui](https://github.com/ocornut/imgui)
 * Retro theme based on https://github.com/ocornut/imgui/issues/707#issuecomment-254610737
-* Vulkan experimental build uses [this PR](https://github.com/ggerganov/llama.cpp/pull/2059)
-* [redmond-puffin-13b](https://huggingface.co/TheBloke/Redmond-Puffin-13B-GGUF) from config.json (q4_K_S version works faster)
-* [mistral-7b-instruct](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF) from config.json (q4_K_S version works faster)
+* [mistral-7b-instruct-v0.1](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF) from config.json (q4_K_S version works faster)
+* [Mistral-Nemo-Instruct-2407-GGUF](https://huggingface.co/ZeroWw/Mistral-Nemo-Instruct-2407-GGUF) from config.json (specific quants with output and embed tensors quantized to f16, q5 is the smallest)
+* [redmond-puffin-13b (previously recommended)](https://huggingface.co/TheBloke/Redmond-Puffin-13B-GGUF) from config.json (q4_K_S version works faster)
 
 ### Additional notes
 
+* Vulkan experimental build used [this PR](https://github.com/ggerganov/llama.cpp/pull/2059)
 * ggmlv3 version is very old and almost deprecated for now, as almost no new models are using the old format

chat_plain.h

Lines changed: 16 additions & 13 deletions
@@ -773,7 +773,8 @@ class chat
         }
 
         embd_inp.clear();
-        embd_inp.push_back(decoder_start_token_id);
+        //embd_inp.push_back(decoder_start_token_id);
+        embd_inp.emplace_back(decoder_start_token_id);
     }
 
     return 0;
@@ -973,8 +974,8 @@ class chat
         }
 
         if (embd_inp.empty()) {
-            //embd_inp.emplace_back(llama_token_bos(model));
-            embd_inp.push_back(llama_token_bos(model));
+            embd_inp.emplace_back(llama_token_bos(model));
+            //embd_inp.push_back(llama_token_bos(model));
         }
 
         // Tokenize negative prompt
@@ -1323,8 +1324,8 @@ class chat
 
             // add it to the context
             //embd.emplace_back(id);
-            //embd.emplace_back(id);
-            embd.push_back(id);
+            embd.emplace_back(id);
+            //embd.push_back(id);
             ++n_last_message;
             ++n_last_message_past;
 
@@ -1502,8 +1503,8 @@ class chat
         // some user input remains from prompt or interaction, forward it to processing
         while ((int) embd_inp.size() > n_consumed) {
             //fprintf(stderr, ">");
-            //embd.emplace_back(embd_inp[n_consumed]);
-            embd.push_back(embd_inp[n_consumed]);
+            embd.emplace_back(embd_inp[n_consumed]);
+            //embd.push_back(embd_inp[n_consumed]);
             //last_n_tokens.erase(last_n_tokens.begin());
             //last_n_tokens.emplace_back(embd_inp[n_consumed]);
             //last_tokens.erase(last_tokens.begin());
@@ -1552,8 +1553,8 @@ class chat
 
     void appendPrefixBos(){
         if (params.input_prefix_bos) {
-            //embd_inp.emplace_back(llama_token_bos(model));
-            embd_inp.push_back(llama_token_bos(model));
+            embd_inp.emplace_back(llama_token_bos(model));
+            //embd_inp.push_back(llama_token_bos(model));
         }
     }
 
@@ -1624,8 +1625,8 @@ class chat
 
         for (size_t i = original_size; i < embd_inp.size(); ++i) {
             const llama_token token = embd_inp[i];
-            //output_tokens.emplace_back(token);
-            output_tokens.push_back(token);
+            output_tokens.emplace_back(token);
+            //output_tokens.push_back(token);
             //output_ss << llama_token_to_piece(ctx, token);
             //std::cout << "tkns = " << embd_inp.size() << std::endl;
         }
@@ -1798,14 +1799,16 @@ class chat
 
         // some user input remains from prompt or interaction, forward it to processing
         while ((int) embd_inp.size() > n_consumed) {
-            embd.push_back(embd_inp[n_consumed]);
+            //embd.push_back(embd_inp[n_consumed]);
+            embd.emplace_back(embd_inp[n_consumed]);
 
             // GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context
             // Most likely will remove this in the future to avoid exposing "prev"
            // Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition
             // penalty will be applied only based on the tokens generated by the model.
             ctx_sampling->prev.erase(ctx_sampling->prev.begin());
-            ctx_sampling->prev.push_back(embd_inp[n_consumed]);
+            //ctx_sampling->prev.push_back(embd_inp[n_consumed]);
+            ctx_sampling->prev.emplace_back(embd_inp[n_consumed]);
 
             ++n_consumed;
             if ((int) embd.size() >= params.n_batch) {
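
Every chat_plain.h hunk above replaces push_back with emplace_back and keeps the old call as a comment. Since llama_token is a plain integer id (int32_t) in llama.cpp, the two calls do the same copy of an int, so this is a stylistic cleanup rather than a behavioral change. A minimal standalone sketch of the pattern, using an int32_t stand-in for llama_token and a made-up BOS id:

#include <cstdint>
#include <iostream>
#include <vector>

using token = std::int32_t;       // stand-in for llama_token (illustration only)

int main() {
    std::vector<token> embd_inp;

    const token bos = 1;          // hypothetical BOS token id
    embd_inp.push_back(bos);      // copies the int into the vector
    embd_inp.emplace_back(bos);   // constructs in place; identical cost for an int

    std::cout << "tokens stored: " << embd_inp.size() << '\n';   // prints 2
    return 0;
}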

config.json

Lines changed: 20 additions & 12 deletions
@@ -1,5 +1,5 @@
 {
-    "model": "models/redmond-puffin-13b.Q4_K_S.gguf",
+    "model": "models/Mistral-Nemo-Instruct-2407.q5_k.gguf",
     "prompt": "TASK:",
     "input_suffix": "RESULT:",
     "reverse-prompt": "TASK:",
@@ -28,21 +28,29 @@
     "mirostat_tau": 5.0,
     "mirostat_eta": 0.1,
     "samplers_sequence": "tkm",
-    "models/redmond-puffin-13b.Q4_K_S.gguf":
+    "models/Mistral-Nemo-Instruct-2407.q5_k.gguf":
     {
-        "prompt": "### SYSTEM: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\n### USER:",
-        "reverse-prompt": "### USER:",
-        "input_suffix": "### ASSISTANT:",
-        "ctx-size": 4096,
-        "preset": "Midnight Enigma"
+        "ctx-size": 8192,
+        "n_gpu_layers_vk": 6,
+        "n_gpu_layers_clblast": 7,
+        "samplers_sequence": "ts",
+        "smoothing_factor": 0.1,
+        "smoothing_curve": 1.1,
+        "temp": 0.2,
+        "dynatemp_range": 0.2,
+        "group":
+        {
+            "prompt": " ",
+            "reverse-prompt": "\n[INST] ",
+            "input_suffix": "[/INST]\n"
+        }
     },
     "models/mistral-7b-instruct-v0.2.Q4_K_S.gguf":
     {
-        "prompt": "Below is an instruction that describes a task. Write an accurate response that appropriately completes the request.\n\n### Instruction:\n",
-        "reverse-prompt": "### Instruction:\n",
-        "input_suffix": "\n### Response:\n",
-        "n_gpu_layers": 8,
+        "n_gpu_layers_vk": 8,
+        "n_gpu_layers_clblast": 9,
         "ctx-size": 8192,
-        "temp": 0.9
+        "preset": "Midnight Enigma",
+        "format_file": "formats/Mistral.txt"
     }
 }
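
The new Mistral-Nemo entry in config.json drops the old ### USER / ### ASSISTANT prompt strings and instead nests the chat markers in a "group" object that uses Mistral's [INST] ... [/INST] wrapping. As a rough sketch of how such a prefix/suffix pair can be stitched around one user message (the helper below is hypothetical and only mirrors the values shown in the diff; the app's actual assembly code is not part of this commit):

#include <iostream>
#include <string>

// Hypothetical helper: wrap a single user turn with the markers taken
// from the "group" block of the per-model config.
std::string buildTurn(const std::string& reverse_prompt,
                      const std::string& user_text,
                      const std::string& input_suffix) {
    return reverse_prompt + user_text + input_suffix;
}

int main() {
    // Values mirror the Mistral-Nemo "group" entry above.
    const std::string turn = buildTurn("\n[INST] ", "Summarize this file.", "[/INST]\n");
    std::cout << turn;   // prints: \n[INST] Summarize this file.[/INST]\n
    return 0;
}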

thread_chat.h

Lines changed: 1 addition & 1 deletion
@@ -239,7 +239,7 @@ struct modelThread{
         if (penalize_nl) std::cout << "penalize_nl = true" << '\n' << std::endl;
         //#endif
         //std::cout << lastTimings << std::endl;
-        std::cout << std::format("Eval speed: {:.3f} t/s", lastSpeedPrompt) << std::format(" | Gen speed: {:.3f} t/s", lastSpeed) << std::endl;
+        std::cout << std::format("Eval speed: {:.3f} t/s | Gen speed: {:.3f} t/s", lastSpeedPrompt, lastSpeed) << std::endl;
         std::cout << "----------------------------------------\n"<< std::endl;
 
         for (auto r : resultsStringPairs){
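
The thread_chat.h change merges two std::format calls into a single format string with two placeholders; the printed text stays the same. A small sketch of the before/after pattern (requires C++20 <format>; the speed values are made up for illustration):

#include <format>
#include <iostream>

int main() {
    const double lastSpeedPrompt = 152.417;   // sample prompt-eval speed, t/s
    const double lastSpeed = 23.908;          // sample generation speed, t/s

    // Before: two separate format calls chained on the stream.
    std::cout << std::format("Eval speed: {:.3f} t/s", lastSpeedPrompt)
              << std::format(" | Gen speed: {:.3f} t/s", lastSpeed) << '\n';

    // After: one call with two placeholders; same output, one temporary string instead of two.
    std::cout << std::format("Eval speed: {:.3f} t/s | Gen speed: {:.3f} t/s",
                             lastSpeedPrompt, lastSpeed) << '\n';
    return 0;
}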
