
Commit 90ad7f4

Small fixes and updated recommended models
* Mistral Nemo is crazily good
1 parent b011eab commit 90ad7f4

5 files changed: +42 -29 lines


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -37,5 +37,6 @@
 *.ini
 *.7z
 *.zip
+*.rar
 /base/common копия.cpp
 /base/ggml-opencl-bu.cpp

README.md

Lines changed: 4 additions & 3 deletions
@@ -62,10 +62,11 @@ Tested on Windows only for now. AVX2 releases only for now, older releases were
 * [llama.cpp](https://github.com/ggerganov/llama.cpp)
 * [imgui](https://github.com/ocornut/imgui)
 * Retro theme based on https://github.com/ocornut/imgui/issues/707#issuecomment-254610737
-* Vulkan experimental build uses [this PR](https://github.com/ggerganov/llama.cpp/pull/2059)
-* [redmond-puffin-13b](https://huggingface.co/TheBloke/Redmond-Puffin-13B-GGUF) from config.json (q4_K_S version works faster)
-* [mistral-7b-instruct](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF) from config.json (q4_K_S version works faster)
+* [mistral-7b-instruct-v0.1](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF) from config.json (q4_K_S version works faster)
+* [Mistral-Nemo-Instruct-2407-GGUF](https://huggingface.co/ZeroWw/Mistral-Nemo-Instruct-2407-GGUF) from config.json (specific quants with output and embed tensors quantized to f16, q5 is the smallest)
+* [redmond-puffin-13b (previously recommended)](https://huggingface.co/TheBloke/Redmond-Puffin-13B-GGUF) from config.json (q4_K_S version works faster)
 
 ### Additional notes
 
+* Vulkan experimental build used [this PR](https://github.com/ggerganov/llama.cpp/pull/2059)
 * ggmlv3 version is very old and almost deprecated for now, as almost no new models are using the old format

chat_plain.h

Lines changed: 16 additions & 13 deletions
@@ -773,7 +773,8 @@ class chat
         }
 
         embd_inp.clear();
-        embd_inp.push_back(decoder_start_token_id);
+        //embd_inp.push_back(decoder_start_token_id);
+        embd_inp.emplace_back(decoder_start_token_id);
     }
 
     return 0;
@@ -973,8 +974,8 @@ class chat
         }
 
         if (embd_inp.empty()) {
-            //embd_inp.emplace_back(llama_token_bos(model));
-            embd_inp.push_back(llama_token_bos(model));
+            embd_inp.emplace_back(llama_token_bos(model));
+            //embd_inp.push_back(llama_token_bos(model));
         }
 
         // Tokenize negative prompt
@@ -1323,8 +1324,8 @@ class chat
 
             // add it to the context
             //embd.emplace_back(id);
-            //embd.emplace_back(id);
-            embd.push_back(id);
+            embd.emplace_back(id);
+            //embd.push_back(id);
             ++n_last_message;
             ++n_last_message_past;
 
@@ -1502,8 +1503,8 @@ class chat
         // some user input remains from prompt or interaction, forward it to processing
         while ((int) embd_inp.size() > n_consumed) {
             //fprintf(stderr, ">");
-            //embd.emplace_back(embd_inp[n_consumed]);
-            embd.push_back(embd_inp[n_consumed]);
+            embd.emplace_back(embd_inp[n_consumed]);
+            //embd.push_back(embd_inp[n_consumed]);
             //last_n_tokens.erase(last_n_tokens.begin());
             //last_n_tokens.emplace_back(embd_inp[n_consumed]);
             //last_tokens.erase(last_tokens.begin());
@@ -1552,8 +1553,8 @@ class chat
 
     void appendPrefixBos(){
         if (params.input_prefix_bos) {
-            //embd_inp.emplace_back(llama_token_bos(model));
-            embd_inp.push_back(llama_token_bos(model));
+            embd_inp.emplace_back(llama_token_bos(model));
+            //embd_inp.push_back(llama_token_bos(model));
         }
     }
 
@@ -1624,8 +1625,8 @@ class chat
 
         for (size_t i = original_size; i < embd_inp.size(); ++i) {
             const llama_token token = embd_inp[i];
-            //output_tokens.emplace_back(token);
-            output_tokens.push_back(token);
+            output_tokens.emplace_back(token);
+            //output_tokens.push_back(token);
             //output_ss << llama_token_to_piece(ctx, token);
             //std::cout << "tkns = " << embd_inp.size() << std::endl;
         }
@@ -1798,14 +1799,16 @@ class chat
 
         // some user input remains from prompt or interaction, forward it to processing
         while ((int) embd_inp.size() > n_consumed) {
-            embd.push_back(embd_inp[n_consumed]);
+            //embd.push_back(embd_inp[n_consumed]);
+            embd.emplace_back(embd_inp[n_consumed]);
 
             // GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context
             // Most likely will remove this in the future to avoid exposing "prev"
            // Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition
             // penalty will be applied only based on the tokens generated by the model.
             ctx_sampling->prev.erase(ctx_sampling->prev.begin());
-            ctx_sampling->prev.push_back(embd_inp[n_consumed]);
+            //ctx_sampling->prev.push_back(embd_inp[n_consumed]);
+            ctx_sampling->prev.emplace_back(embd_inp[n_consumed]);
 
             ++n_consumed;
             if ((int) embd.size() >= params.n_batch) {
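
Every chat_plain.h hunk above replaces push_back with emplace_back and keeps the old call as a comment. Since llama_token is a plain integer id (int32_t) in llama.cpp, the two calls do the same copy of an int, so this is a stylistic cleanup rather than a behavioral change. A minimal standalone sketch of the pattern, using an int32_t stand-in for llama_token and a made-up BOS id:

#include <cstdint>
#include <iostream>
#include <vector>

using token = std::int32_t;       // stand-in for llama_token (illustration only)

int main() {
    std::vector<token> embd_inp;

    const token bos = 1;          // hypothetical BOS token id
    embd_inp.push_back(bos);      // copies the int into the vector
    embd_inp.emplace_back(bos);   // constructs in place; identical cost for an int

    std::cout << "tokens stored: " << embd_inp.size() << '\n';   // prints 2
    return 0;
}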

config.json

Lines changed: 20 additions & 12 deletions
@@ -1,5 +1,5 @@
 {
-    "model": "models/redmond-puffin-13b.Q4_K_S.gguf",
+    "model": "models/Mistral-Nemo-Instruct-2407.q5_k.gguf",
     "prompt": "TASK:",
     "input_suffix": "RESULT:",
     "reverse-prompt": "TASK:",
@@ -28,21 +28,29 @@
     "mirostat_tau": 5.0,
     "mirostat_eta": 0.1,
     "samplers_sequence": "tkm",
-    "models/redmond-puffin-13b.Q4_K_S.gguf":
+    "models/Mistral-Nemo-Instruct-2407.q5_k.gguf":
     {
-        "prompt": "### SYSTEM: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\n### USER:",
-        "reverse-prompt": "### USER:",
-        "input_suffix": "### ASSISTANT:",
-        "ctx-size": 4096,
-        "preset": "Midnight Enigma"
+        "ctx-size": 8192,
+        "n_gpu_layers_vk": 6,
+        "n_gpu_layers_clblast": 7,
+        "samplers_sequence": "ts",
+        "smoothing_factor": 0.1,
+        "smoothing_curve": 1.1,
+        "temp": 0.2,
+        "dynatemp_range": 0.2,
+        "group":
+        {
+            "prompt": " ",
+            "reverse-prompt": "\n[INST] ",
+            "input_suffix": "[/INST]\n"
+        }
     },
     "models/mistral-7b-instruct-v0.2.Q4_K_S.gguf":
     {
-        "prompt": "Below is an instruction that describes a task. Write an accurate response that appropriately completes the request.\n\n### Instruction:\n",
-        "reverse-prompt": "### Instruction:\n",
-        "input_suffix": "\n### Response:\n",
-        "n_gpu_layers": 8,
+        "n_gpu_layers_vk": 8,
+        "n_gpu_layers_clblast": 9,
         "ctx-size": 8192,
-        "temp": 0.9
+        "preset": "Midnight Enigma",
+        "format_file": "formats/Mistral.txt"
     }
 }
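
The new Mistral-Nemo entry in config.json drops the old ### USER / ### ASSISTANT prompt strings and instead nests the chat markers in a "group" object that uses Mistral's [INST] ... [/INST] wrapping. As a rough sketch of how such a prefix/suffix pair can be stitched around one user message (the helper below is hypothetical and only mirrors the values shown in the diff; the app's actual assembly code is not part of this commit):

#include <iostream>
#include <string>

// Hypothetical helper: wrap a single user turn with the markers taken
// from the "group" block of the per-model config.
std::string buildTurn(const std::string& reverse_prompt,
                      const std::string& user_text,
                      const std::string& input_suffix) {
    return reverse_prompt + user_text + input_suffix;
}

int main() {
    // Values mirror the Mistral-Nemo "group" entry above.
    const std::string turn = buildTurn("\n[INST] ", "Summarize this file.", "[/INST]\n");
    std::cout << turn;   // prints: \n[INST] Summarize this file.[/INST]\n
    return 0;
}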

thread_chat.h

Lines changed: 1 addition & 1 deletion
@@ -239,7 +239,7 @@ struct modelThread{
         if (penalize_nl) std::cout << "penalize_nl = true" << '\n' << std::endl;
         //#endif
         //std::cout << lastTimings << std::endl;
-        std::cout << std::format("Eval speed: {:.3f} t/s", lastSpeedPrompt) << std::format(" | Gen speed: {:.3f} t/s", lastSpeed) << std::endl;
+        std::cout << std::format("Eval speed: {:.3f} t/s | Gen speed: {:.3f} t/s", lastSpeedPrompt, lastSpeed) << std::endl;
         std::cout << "----------------------------------------\n"<< std::endl;
 
         for (auto r : resultsStringPairs){
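
The thread_chat.h change merges two std::format calls into a single format string with two placeholders; the printed text stays the same. A small sketch of the before/after pattern (requires C++20 <format>; the speed values are made up for illustration):

#include <format>
#include <iostream>

int main() {
    const double lastSpeedPrompt = 152.417;   // sample prompt-eval speed, t/s
    const double lastSpeed = 23.908;          // sample generation speed, t/s

    // Before: two separate format calls chained on the stream.
    std::cout << std::format("Eval speed: {:.3f} t/s", lastSpeedPrompt)
              << std::format(" | Gen speed: {:.3f} t/s", lastSpeed) << '\n';

    // After: one call with two placeholders; same output, one temporary string instead of two.
    std::cout << std::format("Eval speed: {:.3f} t/s | Gen speed: {:.3f} t/s",
                             lastSpeedPrompt, lastSpeed) << '\n';
    return 0;
}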
