Merge branch 'concedo_experimental' into croco_nex_0

Nexesenex · Nexesenex · commit 4b0b976b9690 · 2025-01-22T20:07:11.000+01:00
diff --git a/colab.ipynb b/colab.ipynb
@@ -122,7 +122,7 @@
     "if TTSCommand:\n",
     "  !aria2c -x 10 -o ttsmodel.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $TTSModel\n",
     "  !aria2c -x 10 -o ttswavtok.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $WavTokModel\n",
-    "!./koboldcpp_linux model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --websearch --quiet --remotetunnel $FACommand $MPCommand $VCommand $SCommand $WCommand $TTSCommand\n"
+    "!./koboldcpp_linux model.gguf --usecublas 0 mmq --chatcompletionsadapter AutoGuess --multiuser --gpulayers $Layers --contextsize $ContextSize --websearch --quiet --remotetunnel $FACommand $MPCommand $VCommand $SCommand $WCommand $TTSCommand\n"
    ]
   }
  ],
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
@@ -108,6 +108,7 @@ static kcpp_params * kcpp_data = nullptr;
 static int max_context_limit_at_load = 0;
 static int n_past = 0;
 static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
+static bool quiet = false;
 static std::vector<gpt_vocab::id> last_n_tokens;
 static std::vector<gpt_vocab::id> current_context_tokens;
 static size_t mem_per_token = 0;
@@ -936,12 +937,12 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
 
     if(last_idx>1) //if there are 2 or more viable candidates
     {
-        if (debugmode==1) {
+        if (debugmode==1 && !quiet) {
             printf("XTC penalties [");
         }
         // then remove all other tokens above threshold EXCEPT the least likely one
         for (size_t i = 0; i < last_idx - 1; ++i) {
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 gpt_vocab::id token = candidates->data[i].id;
                 std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
@@ -950,7 +951,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
             }
             candidates->data[i].logit -= 999.0f; //infinity gets wonky results downstream, this hack works well enough
         }
-        if (debugmode==1) {
+        if (debugmode==1 && !quiet) {
             printf("]\n");
         }
         candidates->sorted = false;
@@ -1139,7 +1140,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
         max_exponent = FLOAT_MAX_LOG / std::log(penalty_base);
     }
 
-    if (debugmode==1 && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
         printf("DRY penalties [");
     }
     size_t count = 0;
@@ -1150,7 +1151,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
             repeat_exp = max_exponent;
         }
         float penalty = penalty_multiplier * pow(penalty_base, repeat_exp);
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
             ::utreplace(tokenizedstr, "\n", "\\n");
@@ -1163,7 +1164,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
     {
         candidates->sorted = false;
     }
-    if (debugmode==1 && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
         printf("]\n");
     }
 }
@@ -1694,7 +1695,7 @@ static void load_grammar(const std::string & gammarstr)
             printf("\nIgnored invalid grammar sampler.");
             return;
         }
-        if(debugmode==1)
+        if(debugmode==1 && !quiet)
         {
             parsed_grammar.print(stderr);
         }
@@ -1864,7 +1865,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
         printf("Trained max context length (value:%.d).\n", n_ctx_train);
         printf("Desired context length (value:%.d).\n", n_ctx_desired);
 
-        if(debugmode==1)
+        if(debugmode==1 && !quiet)
         {
             printf("Solar context multiplier (value:%.3f).\n", ctx_multiplier);
             printf("Chi context train (value:%.3f).\n", chi_ctx_train_value);
@@ -1880,7 +1881,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
         {
             float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));
             float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
-            // if(debugmode==1)
+            // if(debugmode==1 && !quiet)
             // {
             printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
             printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
@@ -3034,13 +3035,13 @@ std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
         printf("\nWarning: KCPP text generation not initialized!\n");
         return toks;
     }
-    if(debugmode==1)
+    if(debugmode==1 && !quiet)
     {
         printf("\nFileFormat: %d, Tokenizing: %s",file_format ,input.c_str());
     }
     TokenizeString(input, toks, file_format,addbos);
     int tokcount = toks.size();
-    if(debugmode==1)
+    if(debugmode==1 && !quiet)
     {
         printf("\nTokens Counted: %d\n",tokcount);
     }
@@ -3125,6 +3126,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         llama_perf_context_reset(llama_ctx_v4);
     }
 
+    quiet = inputs.quiet;
     generation_finished = false; // Set current generation status
     generated_tokens.clear(); // New Generation, new tokens
     delayed_generated_tokens.clear();
@@ -3203,7 +3205,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     banned_token_ids.clear();
     if(banned_tokens.size()>0)
     {
-        // if(debugmode==1)
+        // if(debugmode==1 && !quiet)
         // {
         printf("\nBanning %zu single character sequences...",banned_tokens.size());
         // }
@@ -3220,16 +3222,18 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 }
             }
         }
-        // if(debugmode==1)
+        // if(debugmode==1 && !quiet)
         // {
         printf("\nBanned a total of %zu individual tokens.\n",banned_token_ids.size());
         // }
     }
 
-    // if(debugmode==1 && banned_phrases.size()>0)
     if(banned_phrases.size()>0)
     {
+        // if(debugmode==1 && !quiet)
+        // {
         printf("\nBanned a total of %zu phrases, with max token count of %d.\n",banned_phrases.size(),delayed_generated_tokens_limit);
+        // }
     }
 
     logit_biases.clear();
@@ -3272,7 +3276,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         //images have changed. swap identifiers to force reprocessing
         current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);
         llava_composite_image_signature = new_llava_composite;
-        if(debugmode==1)
+        if(debugmode==1 && !quiet)
         {
             printf("\nLLAVA images changed, existing cache invalidated");
         }
@@ -3328,10 +3332,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             const int MAX_CHAR_LEN = 40;
             const int MAX_SEQ_LEN = 20;
 
-            // if (debugmode == 1)
-
+            // if (debugmode == 1 && !quiet)
+            {
             printf("\nProcessing %zu dry break strings...", kcpp_data->dry_sequence_breakers.size());
-
+            }
             for (auto sequence_break : kcpp_data->dry_sequence_breakers)
             {
                 if (sequence_break.size() > MAX_CHAR_LEN)
@@ -3340,7 +3344,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 }
                 GetOverlappingTokenSequences(sequence_break, dry_sequence_breakers, MAX_SEQ_LEN);
             }
-            if (debugmode == 1)
+            if (debugmode == 1 && !quiet)
             {
                 int trivial = 0, non_trivial = 0;
                 for (const auto &seq : dry_sequence_breakers)
@@ -3360,9 +3364,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }
 
     bool stream_sse = inputs.stream_sse;
-
-    bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
-
+    bool allow_regular_prints = (!quiet && debugmode!=-1);
 
     std::string grammarstr = inputs.grammar;
     bool grammar_retain_state = inputs.grammar_retain_state;
@@ -3395,7 +3397,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     if (kcpp_data->seed <= 0 || kcpp_data->seed==0xFFFFFFFF)
     {
         kcpp_data->seed = (((uint32_t)time(NULL)) % 1000000u);
-        if(debugmode==1)
+        if(debugmode==1 && !quiet)
         {
             printf("\nUsing Seed: %d",kcpp_data->seed);
         }
@@ -3427,15 +3429,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             }
             else
             {
-                if(debugmode==1)
+                if(debugmode==1 && !quiet)
                 {
                     printf("\nCreating clip image embed...");
                 }
                 llava_images[i].clp_image_tokens = 0;
                 if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_data->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {
                     printf("\nError: Clip image %d failed to create embd!",i);
                 }
-                if(debugmode==1)
+                if(debugmode==1 && !quiet)
                 {
                     printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);
                 }
@@ -3558,7 +3560,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         std::string outstr = "";
         printf("\n\n[Debug: Dump Raw Input Tokens, format: %d]\n", file_format);
@@ -3703,7 +3705,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         printf("\n");
     }
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         std::string outstr = "";
         printf("\n[Debug: Dump Forwarded Input Tokens, format: %d]\n", file_format);
@@ -3757,7 +3759,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                     draft_used = true;
                     draft_results = speculative_decoding_eval_chunk(draft_ctx, llama_ctx_v4, embd, n_vocab, n_past);
                     evalres = draft_results.draft_success;
-                    if(debugmode==1)
+                    if(debugmode==1 && !quiet)
                     {
                         std::string draftedtoks = get_tok_vec_str(draft_results.draftids);
                         printf("\nDrafted %d Tokens: [%s]\n",speculative_chunk_amt,draftedtoks.c_str());
@@ -4052,7 +4054,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 if(draft_used)
                 {
                     int32_t draftedid = draft_results.draftids[logits_sampled];
-                    if(debugmode==1)
+                    if(debugmode==1 && !quiet)
                     {
                         std::string drafttok = FileFormatTokenizeID(draftedid, file_format, true);
                         std::string realtok = FileFormatTokenizeID(id, file_format, true);
@@ -4105,7 +4107,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 {
                     printf("\rGenerating (%d / %d tokens)", (kcpp_data->n_predict - remaining_tokens), kcpp_data->n_predict);
                 }
-                if(debugmode==1 && top_picks_history.size()>0)
+                if(debugmode==1 && !quiet && top_picks_history.size()>0)
                 {
                     printf(" [");
                     bool firstloop = true;
@@ -4370,7 +4372,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         delayed_generated_tokens.pop_front();
     }
 
-    if(debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
+    if(debugmode==1 && !quiet && file_format == FileFormat::GGUF_GENERIC)
     {
         printf("\n");
         llama_perf_context_print(llama_ctx_v4);
diff --git a/kcpp_adapters/AutoGuess.json b/kcpp_adapters/AutoGuess.json
@@ -109,5 +109,16 @@
         "assistant_start": "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
         "assistant_end": "<|END_OF_TURN_TOKEN|>"
     }
+}, {
+    "search": ["<｜User｜>"],
+    "name": "DeepSeek V2.5",
+    "adapter": {
+        "system_start": "",
+        "system_end": "",
+        "user_start": "<｜User｜>",
+        "user_end": "",
+        "assistant_start": "<｜Assistant｜>",
+        "assistant_end": "<｜end▁of▁sentence｜>"
+    }
 }
 ]
diff --git a/koboldcpp.py b/koboldcpp.py
@@ -66,10 +66,10 @@
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.83000"
+KcppVersion = "1.83001"
 LcppVersion = "b4517"
 CudaSpecifics = "Cu124_Ar6175_SMC2_DmmvX32Y1"
-ReleaseDate = "2025/01/20"
+ReleaseDate = "2025/01/22"
 showdebug = True
 guimode = False
 showsamplerwarning = True
@@ -400,7 +400,7 @@ def pick_existant_file(ntoption,nonntoption):
     (lib_vulkan_noavx2, "Use Vulkan (Old CPU)"),
     (lib_clblast_noavx2, "Use CLBlast (Older CPU)"),
     (lib_failsafe, "Failsafe Mode (Older CPU)")]
-default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
+default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, vulkan_noavx2_option, clblast_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
 runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib)]
 
 def init_library():
@@ -668,6 +668,8 @@ def exit_with_error(code, message, title="Error"):
     sys.exit(code)
 
 def utfprint(str, importance = 2): #0 = only debugmode, 1 = except quiet, 2 = always print
+    if args.quiet and importance<2: #quiet overrides debugmode
+        return
     if args.debugmode < 1:
         if importance==1 and (args.debugmode == -1 or args.quiet):
             return
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
@@ -154,9 +154,10 @@ static std::vector<float> embd_to_audio(
         const int n_codes,
         const int n_embd,
         const int n_thread) {
-    const int n_hop = 600;
-    const int n_fft = n_hop*4; //its 1280 at 320, or 2400 at 600
-    const int n_win = n_hop*4;
+
+    const int n_fft = 1280; //its 1280 at 320, or 2400 at 600
+    const int n_hop = 320;
+    const int n_win = 1280;
     const int n_pad = (n_win - n_hop)/2;
     const int n_out = (n_codes - 1)*n_hop + n_win;
 
@@ -624,7 +625,7 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
     {
         audio_seed = (((uint32_t)time(NULL)) % 1000000u);
     }
-    if(ttsdebugmode==1)
+    if(ttsdebugmode==1 && !inputs.quiet)
     {
         printf("\nUsing Speaker Seed: %d", speaker_seed);
         printf("\nUsing Audio Seed: %d", audio_seed);
@@ -640,13 +641,12 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
     && last_generated_audio!=""
     && last_generation_settings_prompt == std::string(inputs.prompt))
     {
-        if(ttsdebugmode==1 || !inputs.quiet)
-        {
+        if (ttsdebugmode == 1 && !inputs.quiet) {
             printf("\nReusing Cached Audio.\n");
-            output.data = last_generated_audio.c_str();
-            output.status = 1;
-            return output;
         }
+        output.data = last_generated_audio.c_str();
+        output.status = 1;
+        return output;
     }
 
 
diff --git a/otherarch/whispercpp/whisper_adapter.cpp b/otherarch/whispercpp/whisper_adapter.cpp

Original file line number	Diff line number	Diff line change
`@@ -122,7 +122,7 @@`
`122`	`122`	`"if TTSCommand:\n",`
`123`	`123`	`" !aria2c -x 10 -o ttsmodel.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $TTSModel\n",`
`124`	`124`	`" !aria2c -x 10 -o ttswavtok.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $WavTokModel\n",`
`125`		`- "!./koboldcpp_linux model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --websearch --quiet --remotetunnel $FACommand $MPCommand $VCommand $SCommand $WCommand $TTSCommand\n"`
	`125`	`+ "!./koboldcpp_linux model.gguf --usecublas 0 mmq --chatcompletionsadapter AutoGuess --multiuser --gpulayers $Layers --contextsize $ContextSize --websearch --quiet --remotetunnel $FACommand $MPCommand $VCommand $SCommand $WCommand $TTSCommand\n"`
`126`	`126`	`]`
`127`	`127`	`}`
`128`	`128`	`],`
Original file line number	Diff line number	Diff line change
`@@ -108,6 +108,7 @@ static kcpp_params * kcpp_data = nullptr;`
`108`	`108`	`static int max_context_limit_at_load = 0;`
`109`	`109`	`static int n_past = 0;`
`110`	`110`	`static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall`
	`111`	`+static bool quiet = false;`
`111`	`112`	`static std::vector<gpt_vocab::id> last_n_tokens;`
`112`	`113`	`static std::vector<gpt_vocab::id> current_context_tokens;`
`113`	`114`	`static size_t mem_per_token = 0;`
`@@ -936,12 +937,12 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float`
`936`	`937`
`937`	`938`	`if(last_idx>1) //if there are 2 or more viable candidates`
`938`	`939`	`{`
`939`		`- if (debugmode==1) {`
	`940`	`+ if (debugmode==1 && !quiet) {`
`940`	`941`	`printf("XTC penalties [");`
`941`	`942`	`}`
`942`	`943`	`// then remove all other tokens above threshold EXCEPT the least likely one`
`943`	`944`	`for (size_t i = 0; i < last_idx - 1; ++i) {`
`944`		`- if (debugmode==1)`
	`945`	`+ if (debugmode==1 && !quiet)`
`945`	`946`	`{`
`946`	`947`	`gpt_vocab::id token = candidates->data[i].id;`
`947`	`948`	`std::string tokenizedstr = FileFormatTokenizeID(token, file_format);`
`@@ -950,7 +951,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float`
`950`	`951`	`}`
`951`	`952`	`candidates->data[i].logit -= 999.0f; //infinity gets wonky results downstream, this hack works well enough`
`952`	`953`	`}`
`953`		`- if (debugmode==1) {`
	`954`	`+ if (debugmode==1 && !quiet) {`
`954`	`955`	`printf("]\n");`
`955`	`956`	`}`
`956`	`957`	`candidates->sorted = false;`
`@@ -1139,7 +1140,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe`
`1139`	`1140`	`max_exponent = FLOAT_MAX_LOG / std::log(penalty_base);`
`1140`	`1141`	`}`
`1141`	`1142`
`1142`		`- if (debugmode==1 && !dry_max_token_repeat.empty()) {`
	`1143`	`+ if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {`
`1143`	`1144`	`printf("DRY penalties [");`
`1144`	`1145`	`}`
`1145`	`1146`	`size_t count = 0;`
`@@ -1150,7 +1151,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe`
`1150`	`1151`	`repeat_exp = max_exponent;`
`1151`	`1152`	`}`
`1152`	`1153`	`float penalty = penalty_multiplier * pow(penalty_base, repeat_exp);`
`1153`		`- if (debugmode==1)`
	`1154`	`+ if (debugmode==1 && !quiet)`
`1154`	`1155`	`{`
`1155`	`1156`	`std::string tokenizedstr = FileFormatTokenizeID(token, file_format);`
`1156`	`1157`	`::utreplace(tokenizedstr, "\n", "\\n");`
`@@ -1163,7 +1164,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe`
`1163`	`1164`	`{`
`1164`	`1165`	`candidates->sorted = false;`
`1165`	`1166`	`}`
`1166`		`- if (debugmode==1 && !dry_max_token_repeat.empty()) {`
	`1167`	`+ if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {`
`1167`	`1168`	`printf("]\n");`
`1168`	`1169`	`}`
`1169`	`1170`	`}`
`@@ -1694,7 +1695,7 @@ static void load_grammar(const std::string & gammarstr)`
`1694`	`1695`	`printf("\nIgnored invalid grammar sampler.");`
`1695`	`1696`	`return;`
`1696`	`1697`	`}`
`1697`		`- if(debugmode==1)`
	`1698`	`+ if(debugmode==1 && !quiet)`
`1698`	`1699`	`{`
`1699`	`1700`	`parsed_grammar.print(stderr);`
`1700`	`1701`	`}`
`@@ -1864,7 +1865,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai`
`1864`	`1865`	`printf("Trained max context length (value:%.d).\n", n_ctx_train);`
`1865`	`1866`	`printf("Desired context length (value:%.d).\n", n_ctx_desired);`
`1866`	`1867`
`1867`		`- if(debugmode==1)`
	`1868`	`+ if(debugmode==1 && !quiet)`
`1868`	`1869`	`{`
`1869`	`1870`	`printf("Solar context multiplier (value:%.3f).\n", ctx_multiplier);`
`1870`	`1871`	`printf("Chi context train (value:%.3f).\n", chi_ctx_train_value);`
`@@ -1880,7 +1881,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai`
`1880`	`1881`	`{`
`1881`	`1882`	`float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));`
`1882`	`1883`	`float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;`
`1883`		`- // if(debugmode==1)`
	`1884`	`+ // if(debugmode==1 && !quiet)`
`1884`	`1885`	`// {`
`1885`	`1886`	`printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);`
`1886`	`1887`	`printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);`
`@@ -3034,13 +3035,13 @@ std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)`
`3034`	`3035`	`printf("\nWarning: KCPP text generation not initialized!\n");`
`3035`	`3036`	`return toks;`
`3036`	`3037`	`}`
`3037`		`- if(debugmode==1)`
	`3038`	`+ if(debugmode==1 && !quiet)`
`3038`	`3039`	`{`
`3039`	`3040`	`printf("\nFileFormat: %d, Tokenizing: %s",file_format ,input.c_str());`
`3040`	`3041`	`}`
`3041`	`3042`	`TokenizeString(input, toks, file_format,addbos);`
`3042`	`3043`	`int tokcount = toks.size();`
`3043`		`- if(debugmode==1)`
	`3044`	`+ if(debugmode==1 && !quiet)`
`3044`	`3045`	`{`
`3045`	`3046`	`printf("\nTokens Counted: %d\n",tokcount);`
`3046`	`3047`	`}`
`@@ -3125,6 +3126,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3125`	`3126`	`llama_perf_context_reset(llama_ctx_v4);`
`3126`	`3127`	`}`
`3127`	`3128`
	`3129`	`+ quiet = inputs.quiet;`
`3128`	`3130`	`generation_finished = false; // Set current generation status`
`3129`	`3131`	`generated_tokens.clear(); // New Generation, new tokens`
`3130`	`3132`	`delayed_generated_tokens.clear();`
`@@ -3203,7 +3205,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3203`	`3205`	`banned_token_ids.clear();`
`3204`	`3206`	`if(banned_tokens.size()>0)`
`3205`	`3207`	`{`
`3206`		`- // if(debugmode==1)`
	`3208`	`+ // if(debugmode==1 && !quiet)`
`3207`	`3209`	`// {`
`3208`	`3210`	`printf("\nBanning %zu single character sequences...",banned_tokens.size());`
`3209`	`3211`	`// }`
`@@ -3220,16 +3222,18 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3220`	`3222`	`}`
`3221`	`3223`	`}`
`3222`	`3224`	`}`
`3223`		`- // if(debugmode==1)`
	`3225`	`+ // if(debugmode==1 && !quiet)`
`3224`	`3226`	`// {`
`3225`	`3227`	`printf("\nBanned a total of %zu individual tokens.\n",banned_token_ids.size());`
`3226`	`3228`	`// }`
`3227`	`3229`	`}`
`3228`	`3230`
`3229`		`- // if(debugmode==1 && banned_phrases.size()>0)`
`3230`	`3231`	`if(banned_phrases.size()>0)`
`3231`	`3232`	`{`
	`3233`	`+ // if(debugmode==1 && !quiet)`
	`3234`	`+ // {`
`3232`	`3235`	`printf("\nBanned a total of %zu phrases, with max token count of %d.\n",banned_phrases.size(),delayed_generated_tokens_limit);`
	`3236`	`+ // }`
`3233`	`3237`	`}`
`3234`	`3238`
`3235`	`3239`	`logit_biases.clear();`
`@@ -3272,7 +3276,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3272`	`3276`	`//images have changed. swap identifiers to force reprocessing`
`3273`	`3277`	`current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);`
`3274`	`3278`	`llava_composite_image_signature = new_llava_composite;`
`3275`		`- if(debugmode==1)`
	`3279`	`+ if(debugmode==1 && !quiet)`
`3276`	`3280`	`{`
`3277`	`3281`	`printf("\nLLAVA images changed, existing cache invalidated");`
`3278`	`3282`	`}`
`@@ -3328,10 +3332,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3328`	`3332`	`const int MAX_CHAR_LEN = 40;`
`3329`	`3333`	`const int MAX_SEQ_LEN = 20;`
`3330`	`3334`
`3331`		`- // if (debugmode == 1)`
`3332`		`-`
	`3335`	`+ // if (debugmode == 1 && !quiet)`
	`3336`	`+ {`
`3333`	`3337`	`printf("\nProcessing %zu dry break strings...", kcpp_data->dry_sequence_breakers.size());`
`3334`		`-`
	`3338`	`+ }`
`3335`	`3339`	`for (auto sequence_break : kcpp_data->dry_sequence_breakers)`
`3336`	`3340`	`{`
`3337`	`3341`	`if (sequence_break.size() > MAX_CHAR_LEN)`
`@@ -3340,7 +3344,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3340`	`3344`	`}`
`3341`	`3345`	`GetOverlappingTokenSequences(sequence_break, dry_sequence_breakers, MAX_SEQ_LEN);`
`3342`	`3346`	`}`
`3343`		`- if (debugmode == 1)`
	`3347`	`+ if (debugmode == 1 && !quiet)`
`3344`	`3348`	`{`
`3345`	`3349`	`int trivial = 0, non_trivial = 0;`
`3346`	`3350`	`for (const auto &seq : dry_sequence_breakers)`
`@@ -3360,9 +3364,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3360`	`3364`	`}`
`3361`	`3365`
`3362`	`3366`	`bool stream_sse = inputs.stream_sse;`
`3363`		`-`
`3364`		`- bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) \|\| debugmode >= 1;`
`3365`		`-`
	`3367`	`+ bool allow_regular_prints = (!quiet && debugmode!=-1);`
`3366`	`3368`
`3367`	`3369`	`std::string grammarstr = inputs.grammar;`
`3368`	`3370`	`bool grammar_retain_state = inputs.grammar_retain_state;`
`@@ -3395,7 +3397,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3395`	`3397`	`if (kcpp_data->seed <= 0 \|\| kcpp_data->seed==0xFFFFFFFF)`
`3396`	`3398`	`{`
`3397`	`3399`	`kcpp_data->seed = (((uint32_t)time(NULL)) % 1000000u);`
`3398`		`- if(debugmode==1)`
	`3400`	`+ if(debugmode==1 && !quiet)`
`3399`	`3401`	`{`
`3400`	`3402`	`printf("\nUsing Seed: %d",kcpp_data->seed);`
`3401`	`3403`	`}`
`@@ -3427,15 +3429,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3427`	`3429`	`}`
`3428`	`3430`	`else`
`3429`	`3431`	`{`
`3430`		`- if(debugmode==1)`
	`3432`	`+ if(debugmode==1 && !quiet)`
`3431`	`3433`	`{`
`3432`	`3434`	`printf("\nCreating clip image embed...");`
`3433`	`3435`	`}`
`3434`	`3436`	`llava_images[i].clp_image_tokens = 0;`
`3435`	`3437`	`if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_data->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {`
`3436`	`3438`	`printf("\nError: Clip image %d failed to create embd!",i);`
`3437`	`3439`	`}`
`3438`		`- if(debugmode==1)`
	`3440`	`+ if(debugmode==1 && !quiet)`
`3439`	`3441`	`{`
`3440`	`3442`	`printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);`
`3441`	`3443`	`}`
`@@ -3558,7 +3560,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3558`	`3560`	`std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);`
`3559`	`3561`	`n_past = 0;`
`3560`	`3562`
`3561`		`- if (debugmode==1)`
	`3563`	`+ if (debugmode==1 && !quiet)`
`3562`	`3564`	`{`
`3563`	`3565`	`std::string outstr = "";`
`3564`	`3566`	`printf("\n\n[Debug: Dump Raw Input Tokens, format: %d]\n", file_format);`
`@@ -3703,7 +3705,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3703`	`3705`	`printf("\n");`
`3704`	`3706`	`}`
`3705`	`3707`
`3706`		`- if (debugmode==1)`
	`3708`	`+ if (debugmode==1 && !quiet)`
`3707`	`3709`	`{`
`3708`	`3710`	`std::string outstr = "";`
`3709`	`3711`	`printf("\n[Debug: Dump Forwarded Input Tokens, format: %d]\n", file_format);`
`@@ -3757,7 +3759,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3757`	`3759`	`draft_used = true;`
`3758`	`3760`	`draft_results = speculative_decoding_eval_chunk(draft_ctx, llama_ctx_v4, embd, n_vocab, n_past);`
`3759`	`3761`	`evalres = draft_results.draft_success;`
`3760`		`- if(debugmode==1)`
	`3762`	`+ if(debugmode==1 && !quiet)`
`3761`	`3763`	`{`
`3762`	`3764`	`std::string draftedtoks = get_tok_vec_str(draft_results.draftids);`
`3763`	`3765`	`printf("\nDrafted %d Tokens: [%s]\n",speculative_chunk_amt,draftedtoks.c_str());`
`@@ -4052,7 +4054,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`4052`	`4054`	`if(draft_used)`
`4053`	`4055`	`{`
`4054`	`4056`	`int32_t draftedid = draft_results.draftids[logits_sampled];`
`4055`		`- if(debugmode==1)`
	`4057`	`+ if(debugmode==1 && !quiet)`
`4056`	`4058`	`{`
`4057`	`4059`	`std::string drafttok = FileFormatTokenizeID(draftedid, file_format, true);`
`4058`	`4060`	`std::string realtok = FileFormatTokenizeID(id, file_format, true);`
`@@ -4105,7 +4107,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`4105`	`4107`	`{`
`4106`	`4108`	`printf("\rGenerating (%d / %d tokens)", (kcpp_data->n_predict - remaining_tokens), kcpp_data->n_predict);`
`4107`	`4109`	`}`
`4108`		`- if(debugmode==1 && top_picks_history.size()>0)`
	`4110`	`+ if(debugmode==1 && !quiet && top_picks_history.size()>0)`
`4109`	`4111`	`{`
`4110`	`4112`	`printf(" [");`
`4111`	`4113`	`bool firstloop = true;`
`@@ -4370,7 +4372,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`4370`	`4372`	`delayed_generated_tokens.pop_front();`
`4371`	`4373`	`}`
`4372`	`4374`
`4373`		`- if(debugmode==1 && file_format == FileFormat::GGUF_GENERIC)`
	`4375`	`+ if(debugmode==1 && !quiet && file_format == FileFormat::GGUF_GENERIC)`
`4374`	`4376`	`{`
`4375`	`4377`	`printf("\n");`
`4376`	`4378`	`llama_perf_context_print(llama_ctx_v4);`
Original file line number	Diff line number	Diff line change
`@@ -109,5 +109,16 @@`
`109`	`109`	`"assistant_start": "<\|START_OF_TURN_TOKEN\|><\|CHATBOT_TOKEN\|>",`
`110`	`110`	`"assistant_end": "<\|END_OF_TURN_TOKEN\|>"`
`111`	`111`	`}`
	`112`	`+}, {`
	`113`	`+ "search": ["<｜User｜>"],`
	`114`	`+ "name": "DeepSeek V2.5",`
	`115`	`+ "adapter": {`
	`116`	`+ "system_start": "",`
	`117`	`+ "system_end": "",`
	`118`	`+ "user_start": "<｜User｜>",`
	`119`	`+ "user_end": "",`
	`120`	`+ "assistant_start": "<｜Assistant｜>",`
	`121`	`+ "assistant_end": "<｜end▁of▁sentence｜>"`
	`122`	`+ }`
`112`	`123`	`}`
`113`	`124`	`]`