Skip to content

Commit 95d7cc7

Browse files
committed
Merge branch 'concedo_experimental' into croco_nex_0
2 parents 32439d2 + fb1274e commit 95d7cc7

File tree

6 files changed

+83
-26
lines changed

6 files changed

+83
-26
lines changed

gpttype_adapter.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -601,16 +601,23 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll
601601
{
602602
printf("WARNING: Draft model vocab of (%d) does not match base vocab of (%d).\nIn debug mode, this restriction is bypassed. However, speculative decoding may malfunction!\n",draftvocab,base_n_vocab);
603603
}
604-
else if((draftvocab >= base_n_vocab-512) || (draftvocab <= base_n_vocab+512))
604+
else if((draftvocab >= base_n_vocab-128) || (draftvocab <= base_n_vocab+128))
605605
{
606-
printf("WARNING: Draft model vocab of (%d) does not match base vocab of (%d).\nIn Croco.Cpp, a tolerance of +/- 512 tokens is allowed to account for some variations between the base models and their finetunes/updates and other self-merged frankenmodels + eventual finetunes of those.\nHowever, speculative decoding may malfunction in such cases if the difference between their vocab/tokenizers is too big!\n",draftvocab,base_n_vocab);
606+
printf("WARNING: Draft model vocab of (%d) does not match base vocab of (%d).\nIn Croco.Cpp, a tolerance of +/- 128 tokens is allowed to account for some variations between the base models and their finetunes/updates and other self-merged frankenmodels + eventual finetunes of those.\nHowever, speculative decoding may malfunction in such cases if the difference between their vocab/tokenizers is too big!\n",draftvocab,base_n_vocab);
607607
}
608608
else
609609
{
610-
printf("Error: Draft model vocab of (%d) does not match base vocab of (%d), or is above 512 tokens of difference. Speculative decoding cannot be used!\n",draftvocab,base_n_vocab);
611-
printf("If you REALLY want to override this, run in --debugmode and this restriction will be completely disabled. However, you might encounter unwanted results!\n");
612-
llama_free(draft_ctx);
613-
draft_ctx = nullptr;
610+
int diff = abs(draftvocab-base_n_vocab);
611+
if(diff <= 256)
612+
{
613+
//allow small differences to work
614+
printf("WARNING: Draft model vocab of (%d) does not match base vocab of (%d).\nIn KoboldCpp, a tolerance of +/- 256 tokens is allowed.\nSpeculative decoding may malfunction!\n",draftvocab,base_n_vocab);
615+
} else {
616+
printf("Error: Draft model vocab of (%d) is too different from base vocab of (%d). Speculative decoding cannot be used!\n",draftvocab,base_n_vocab);
617+
printf("If you REALLY want to override this, run in --debugmode and this restriction will be disabled. However, you might encounter unwanted results!\n");
618+
llama_free(draft_ctx);
619+
draft_ctx = nullptr;
620+
}
614621
}
615622
}
616623
}

kcpp_adapters/DeepSeek-V2.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"system_start": "",
3+
"system_end": "",
4+
"user_start": "<|User|>",
5+
"user_end": "",
6+
"assistant_start": "<|Assistant|>",
7+
"assistant_end": "<|end▁of▁sentence|>"
8+
}

koboldcpp.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,10 @@
6666
modelbusy = threading.Lock()
6767
requestsinqueue = 0
6868
defaultport = 5001
69-
KcppVersion = "1.83001"
69+
KcppVersion = "1.83002"
7070
LcppVersion = "b4517"
7171
CudaSpecifics = "Cu124_Ar6175_SMC2_DmmvX32Y1"
72-
ReleaseDate = "2025/01/22"
72+
ReleaseDate = "2025/01/23"
7373
showdebug = True
7474
guimode = False
7575
showsamplerwarning = True
@@ -4063,7 +4063,7 @@ def auto_set_backend_gui(manual_select=False):
40634063
def on_picked_model_file(filepath):
40644064
if filepath.lower().endswith('.kcpps') or filepath.lower().endswith('.kcppt'):
40654065
#load it as a config file instead
4066-
with open(filepath, 'r') as f:
4066+
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
40674067
dict = json.load(f)
40684068
import_vars(dict)
40694069

@@ -4712,7 +4712,7 @@ def export_vars():
47124712
try:
47134713
if kcpp_exporting_template and isinstance(args.chatcompletionsadapter, str) and args.chatcompletionsadapter!="" and os.path.exists(args.chatcompletionsadapter):
47144714
print("Embedding chat completions adapter...") # parse and save embedded preload story
4715-
with open(args.chatcompletionsadapter, 'r') as f:
4715+
with open(args.chatcompletionsadapter, 'r', encoding='utf-8', errors='ignore') as f:
47164716
args.chatcompletionsadapter = json.load(f)
47174717
except Exception:
47184718
pass
@@ -4723,7 +4723,7 @@ def export_vars():
47234723
try:
47244724
if kcpp_exporting_template and isinstance(args.preloadstory, str) and args.preloadstory!="" and os.path.exists(args.preloadstory):
47254725
print("Embedding preload story...") # parse and save embedded preload story
4726-
with open(args.preloadstory, 'r') as f:
4726+
with open(args.preloadstory, 'r', encoding='utf-8', errors='ignore') as f:
47274727
args.preloadstory = json.load(f)
47284728
except Exception:
47294729
pass
@@ -4993,7 +4993,7 @@ def load_config_gui(): #this is used to populate the GUI with a config file, whe
49934993
if not filename or filename=="":
49944994
return
49954995
runmode_untouched = False
4996-
with open(filename, 'r') as f:
4996+
with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
49974997
dict = json.load(f)
49984998
import_vars(dict)
49994999
pass
@@ -5479,7 +5479,7 @@ def unload_libs():
54795479

54805480
def load_config_cli(filename):
54815481
print("Loading .kcpps configuration file...")
5482-
with open(filename, 'r') as f:
5482+
with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
54835483
config = json.load(f)
54845484
args.istemplate = False
54855485
raw_args = (sys.argv[1:]) #a lousy hack to allow for overriding kcpps
@@ -5718,7 +5718,7 @@ def main(launch_args,start_server=True):
57185718
ccadapter_path = os.path.abspath(premade_adapt_path)
57195719
if ccadapter_path:
57205720
print(f"Loading Chat Completions Adapter: {ccadapter_path}")
5721-
with open(ccadapter_path, 'r') as f:
5721+
with open(ccadapter_path, 'r', encoding='utf-8', errors='replace') as f:
57225722
chatcompl_adapter = json.load(f)
57235723
canload = True
57245724
else:

otherarch/tts_adapter.cpp

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -756,8 +756,9 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
756756

757757
//use creative settings to generate speakers
758758
const int topk = 20;
759+
const float top_p = 1.0f;
759760
const float temp = 1.2f;
760-
llama_token new_token_id = kcpp_quick_sample(logits,ttc_n_vocab,topk,temp,speaker_rng);
761+
llama_token new_token_id = kcpp_quick_sample(logits,ttc_n_vocab,std::vector<int32_t>(),1.0,top_p,topk,temp,speaker_rng);
761762

762763
//guide tokens help prevent hallucinations by forcing the TTS to use the correct word
763764
if(next_token_uses_guide_token && !llama_vocab_is_control(ttcvocab, new_token_id) && !llama_vocab_is_eog(ttcvocab, new_token_id))
@@ -878,7 +879,8 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
878879
//use predictable settings to generate voice
879880
const int topk = 4;
880881
const float temp = 0.75f;
881-
llama_token new_token_id = kcpp_quick_sample(logits,ttc_n_vocab,topk,temp,tts_rng);
882+
const float top_p = 1.0f;
883+
llama_token new_token_id = kcpp_quick_sample(logits,ttc_n_vocab,std::vector<int32_t>(),1.0,top_p,topk,temp,speaker_rng);
882884

883885
//guide tokens help prevent hallucinations by forcing the TTS to use the correct word
884886
if(next_token_uses_guide_token && !llama_vocab_is_control(ttcvocab, new_token_id) && !llama_vocab_is_eog(ttcvocab, new_token_id))
@@ -935,7 +937,7 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
935937
const int n_codes = codes.size();
936938
if(n_codes<=1)
937939
{
938-
printf("\nWarning: TTS vocoder generated nothing!\n");
940+
printf("\nWarning: No Audio Tokens Produced!\n");
939941
last_generated_audio = "";
940942
output.data = last_generated_audio.c_str();
941943
output.status = 1;
@@ -965,12 +967,23 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
965967

966968
//audio = resample_wav(audio,n_sr,t_sr); //resample to 16k
967969

968-
for (int i = 0; i < cutout; ++i) {
969-
audio[i] = 0.0f;
970+
if(audio.size()>cutout+16)
971+
{
972+
for (int i = 0; i < cutout; ++i) {
973+
audio[i] = 0.0f;
974+
}
975+
//add some silence at the end
976+
for (int i = 0; i < cutout; ++i) {
977+
audio.push_back(0.0f);
978+
}
970979
}
971-
//add some silence at the end
972-
for (int i = 0; i < cutout; ++i) {
973-
audio.push_back(0.0f);
980+
else
981+
{
982+
printf("\nWarning: TTS vocoder generated nothing!\n");
983+
last_generated_audio = "";
984+
output.data = last_generated_audio.c_str();
985+
output.status = 1;
986+
return output;
974987
}
975988

976989
last_generated_audio = save_wav16_base64(audio, t_sr);

otherarch/utils.cpp

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -369,9 +369,9 @@ std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_
369369
}
370370

371371
//a very rudimentary all in one sampling function which has no dependencies
372-
int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng)
372+
int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector<int32_t> & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng)
373373
{
374-
if (temp <= 0 || top_k==1) {
374+
if (temp <= 0) {
375375
// select the token with the highest logit directly
376376
float max_logit = logits[0];
377377
int32_t max_id = 0;
@@ -392,8 +392,19 @@ int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float t
392392

393393
//temperature sample
394394
const float scale = 1.0f/temp;
395+
396+
//sample rep pen
395397
for (int i = 0; i < n_logits; ++i) {
396-
logits_id.push_back(std::make_pair(logits[i]*scale, i));
398+
if (rep_pen>1.0f && std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
399+
// if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
400+
if (logits[i] < 0.0f) {
401+
logits_id.push_back(std::make_pair(logits[i]*scale*rep_pen, i));
402+
} else {
403+
logits_id.push_back(std::make_pair(logits[i]*scale/rep_pen, i));
404+
}
405+
} else {
406+
logits_id.push_back(std::make_pair(logits[i]*scale, i));
407+
}
397408
}
398409

399410
//sample top_k
@@ -421,6 +432,24 @@ int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float t
421432
p /= sum;
422433
}
423434

435+
//apply top p
436+
if (top_p < 1.0) {
437+
double cumsum = 0.0;
438+
for (int i = 0; i < (int) probs.size(); i++) {
439+
cumsum += probs[i];
440+
if (cumsum >= top_p) {
441+
probs.resize(i + 1);
442+
logits_id.resize(i + 1);
443+
break;
444+
}
445+
}
446+
}
447+
448+
// normalize the probs
449+
for (auto & p : probs) {
450+
p /= sum;
451+
}
452+
424453
std::discrete_distribution<> dist(probs.begin(), probs.end());
425454
int idx = dist(rng);
426455

otherarch/utils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ std::string kcpp_base64_encode(const std::string &data);
6363
std::string get_timestamp_str();
6464
std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate);
6565

66-
int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng);
66+
int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector<int32_t> & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng);
6767

6868
struct kcpp_embd_batch { //duplicated from llava_embd_batch
6969
std::vector<int32_t> pos;

0 commit comments

Comments
 (0)