
Commit 0e45d3b

quiet flags now set at load time
1 parent bec2314 commit 0e45d3b

7 files changed: +100 -94 lines changed

expose.h

Lines changed: 5 additions & 5 deletions
@@ -54,7 +54,6 @@ struct load_model_inputs
     const int cublas_info = 0;
     const char * vulkan_info = nullptr;
     const int blasbatchsize = 512;
-    const int debugmode = 0;
     const int forceversion = 0;
     const int gpulayers = 0;
     const float rope_freq_scale = 1.0f;
@@ -64,6 +63,8 @@ struct load_model_inputs
     const float tensor_split[tensor_split_max] = {};
     const int quant_k = 0;
     const int quant_v = 0;
+    const bool quiet = false;
+    const int debugmode = 0;
 };
 struct generation_inputs
 {
@@ -97,7 +98,6 @@ struct generation_inputs
     const bool stream_sse = false;
     const char * grammar = nullptr;
     const bool grammar_retain_state = false;
-    const bool quiet = false;
     const float dynatemp_range = 0.0f;
     const float dynatemp_exponent = 1.0f;
     const float smoothing_factor = 0.0f;
@@ -157,6 +157,7 @@ struct sd_load_model_inputs
     const char * vae_filename = nullptr;
     const char * lora_filename = nullptr;
     const float lora_multiplier = 1.0f;
+    const bool quiet = false;
     const int debugmode = 0;
 };
 struct sd_generation_inputs
@@ -172,7 +173,6 @@ struct sd_generation_inputs
     const int seed = 0;
     const char * sample_method = nullptr;
     const int clip_skip = -1;
-    const bool quiet = false;
 };
 struct sd_generation_outputs
 {
@@ -187,6 +187,7 @@ struct whisper_load_model_inputs
     const int clblast_info = 0;
     const int cublas_info = 0;
     const char * vulkan_info = nullptr;
+    const bool quiet = false;
     const int debugmode = 0;
 };
 struct whisper_generation_inputs
@@ -195,7 +196,6 @@ struct whisper_generation_inputs
     const char * audio_data = nullptr;
     const bool suppress_non_speech = false;
     const char * langcode = nullptr;
-    const bool quiet = false;
 };
 struct whisper_generation_outputs
 {
@@ -214,14 +214,14 @@ struct tts_load_model_inputs
     const char * vulkan_info = nullptr;
     const int gpulayers = 0;
     const bool flash_attention = false;
+    const bool quiet = false;
     const int debugmode = 0;
 };
 struct tts_generation_inputs
 {
     const char * prompt = nullptr;
     const int speaker_seed = 0;
     const int audio_seed = 0;
-    const bool quiet = false;
     const bool nocache = false;
 };
 struct tts_generation_outputs
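
Taken together, these hunks move `quiet` out of every per-request input struct and into the matching `*_load_model_inputs` struct, next to `debugmode`. Below is a minimal sketch of the resulting calling pattern, using trimmed stand-ins for the structs (the real ones carry many more fields, and this `main` is hypothetical):

// Sketch only: trimmed stand-ins for the structs in expose.h, to show the
// new flow. Verbosity is now fixed once at load time, not per request.
#include <cstdio>

struct load_model_inputs        // quiet moved here
{
    const bool quiet = false;
    const int debugmode = 0;
};

struct generation_inputs        // quiet removed from per-request inputs
{
    const bool stream_sse = false;
};

static bool is_quiet = false;   // file-scope copy, as in gpttype_adapter.cpp

void gpttype_load_model(const load_model_inputs inputs)
{
    is_quiet = inputs.quiet;    // captured once for the whole session
}

void gpttype_generate(const generation_inputs /*inputs*/)
{
    if (!is_quiet) { printf("routine progress output...\n"); }
}

int main()
{
    gpttype_load_model(load_model_inputs{ .quiet = true }); // C++20 designated init
    gpttype_generate(generation_inputs{});                  // now silent
}

One consequence of this design: a server can no longer toggle verbosity per request, which is the point of the commit title.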

gpttype_adapter.cpp

Lines changed: 29 additions & 29 deletions
@@ -106,7 +106,7 @@ static kcpp_params * kcpp_data = nullptr;
 static int max_context_limit_at_load = 0;
 static int n_past = 0;
 static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
-static bool quiet = false;
+static bool is_quiet = false;
 static std::vector<gpt_vocab::id> last_n_tokens;
 static std::vector<gpt_vocab::id> current_context_tokens;
 static size_t mem_per_token = 0;
@@ -939,12 +939,12 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float

     if(last_idx>1) //if there are 2 or more viable candidates
     {
-        if (debugmode==1 && !quiet) {
+        if (debugmode==1 && !is_quiet) {
             printf("XTC penalties [");
         }
         // then remove all other tokens above threshold EXCEPT the least likely one
         for (size_t i = 0; i < last_idx - 1; ++i) {
-            if (debugmode==1 && !quiet)
+            if (debugmode==1 && !is_quiet)
             {
                 gpt_vocab::id token = candidates->data[i].id;
                 std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
@@ -953,7 +953,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
             }
             candidates->data[i].logit -= 999.0f; //infinity gets wonky results downstream, this hack works well enough
         }
-        if (debugmode==1 && !quiet) {
+        if (debugmode==1 && !is_quiet) {
             printf("]\n");
         }
         candidates->sorted = false;
@@ -1142,7 +1142,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
         max_exponent = FLOAT_MAX_LOG / std::log(penalty_base);
     }

-    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !is_quiet && !dry_max_token_repeat.empty()) {
         printf("DRY penalties [");
     }
     size_t count = 0;
@@ -1153,7 +1153,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
             repeat_exp = max_exponent;
         }
         float penalty = penalty_multiplier * pow(penalty_base, repeat_exp);
-        if (debugmode==1 && !quiet)
+        if (debugmode==1 && !is_quiet)
         {
             std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
             ::utreplace(tokenizedstr, "\n", "\\n");
@@ -1166,7 +1166,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
     {
         candidates->sorted = false;
     }
-    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !is_quiet && !dry_max_token_repeat.empty()) {
         printf("]\n");
     }
 }
@@ -1697,7 +1697,7 @@ static void load_grammar(const std::string & gammarstr)
         printf("\nIgnored invalid grammar sampler.");
         return;
     }
-    if(debugmode==1 && !quiet)
+    if(debugmode==1 && !is_quiet)
     {
         parsed_grammar.print(stderr);
     }
@@ -1840,7 +1840,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
     float gradient_ai_rope_freq_base_value = powf(original_rope_base, log10f(chi_ctx_value) / log10f(chi_ctx_train_value));

-    if(debugmode==1 && !quiet)
+    if(debugmode==1 && !is_quiet)
     {
         printf("Trained max context length (value:%.d).\n", n_ctx_train);
         printf("Desired context length (value:%.d).\n", n_ctx_desired);
@@ -1857,7 +1857,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     {
         float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));
         float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
-        if(debugmode==1 && !quiet)
+        if(debugmode==1 && !is_quiet)
         {
             printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
             printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
@@ -1873,6 +1873,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai

 ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta in_file_format_meta)
 {
+    is_quiet = inputs.quiet;
     ggml_time_init();
     kcpp_data = new kcpp_params(); //allocate on heap to avoid linux segfault. yes this leaks memory.

@@ -2688,13 +2689,13 @@ std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
         printf("\nWarning: KCPP text generation not initialized!\n");
         return toks;
     }
-    if(debugmode==1 && !quiet)
+    if(debugmode==1 && !is_quiet)
     {
         printf("\nFileFormat: %d, Tokenizing: %s",file_format ,input.c_str());
     }
     TokenizeString(input, toks, file_format,addbos);
     int tokcount = toks.size();
-    if(debugmode==1 && !quiet)
+    if(debugmode==1 && !is_quiet)
     {
         printf("\nTokens Counted: %d\n",tokcount);
     }
@@ -2779,7 +2780,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         llama_perf_context_reset(llama_ctx_v4);
     }

-    quiet = inputs.quiet;
     generation_finished = false; // Set current generation status
     generated_tokens.clear(); // New Generation, new tokens
     delayed_generated_tokens.clear();
@@ -2858,7 +2858,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     banned_token_ids.clear();
     if(banned_tokens.size()>0)
     {
-        if(debugmode==1 && !quiet)
+        if(debugmode==1 && !is_quiet)
         {
             printf("\nBanning %zu single character sequences...",banned_tokens.size());
         }
@@ -2875,13 +2875,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             }
         }
     }
-    if(debugmode==1 && !quiet)
+    if(debugmode==1 && !is_quiet)
     {
         printf("\nBanned a total of %zu individual tokens.\n",banned_token_ids.size());
     }
 }

-if(debugmode==1 && !quiet && banned_phrases.size()>0)
+if(debugmode==1 && !is_quiet && banned_phrases.size()>0)
 {
     printf("\nBanned a total of %zu phrases, with max token count of %d.\n",banned_phrases.size(),delayed_generated_tokens_limit);
 }
@@ -2926,7 +2926,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     //images have changed. swap identifiers to force reprocessing
     current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);
     llava_composite_image_signature = new_llava_composite;
-    if(debugmode==1 && !quiet)
+    if(debugmode==1 && !is_quiet)
     {
         printf("\nLLAVA images changed, existing cache invalidated");
     }
@@ -2982,7 +2982,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     const int MAX_CHAR_LEN = 40;
     const int MAX_SEQ_LEN = 20;

-    if (debugmode == 1 && !quiet)
+    if (debugmode == 1 && !is_quiet)
     {
         printf("\nProcessing %zu dry break strings...", kcpp_data->dry_sequence_breakers.size());
     }
@@ -2994,7 +2994,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         }
         GetOverlappingTokenSequences(sequence_break, dry_sequence_breakers, MAX_SEQ_LEN);
     }
-    if (debugmode == 1 && !quiet)
+    if (debugmode == 1 && !is_quiet)
     {
         int trivial = 0, non_trivial = 0;
         for (const auto &seq : dry_sequence_breakers)
@@ -3014,7 +3014,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }

     bool stream_sse = inputs.stream_sse;
-    bool allow_regular_prints = (!quiet && debugmode!=-1);
+    bool allow_regular_prints = (!is_quiet && debugmode!=-1);

     std::string grammarstr = inputs.grammar;
     bool grammar_retain_state = inputs.grammar_retain_state;
@@ -3047,7 +3047,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     if (kcpp_data->seed <= 0 || kcpp_data->seed==0xFFFFFFFF)
     {
         kcpp_data->seed = (((uint32_t)time(NULL)) % 1000000u);
-        if(debugmode==1 && !quiet)
+        if(debugmode==1 && !is_quiet)
         {
             printf("\nUsing Seed: %d",kcpp_data->seed);
         }
@@ -3079,15 +3079,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         }
         else
         {
-            if(debugmode==1 && !quiet)
+            if(debugmode==1 && !is_quiet)
             {
                 printf("\nCreating clip image embed...");
             }
             llava_images[i].clp_image_tokens = 0;
             if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_data->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {
                 printf("\nError: Clip image %d failed to create embd!",i);
             }
-            if(debugmode==1 && !quiet)
+            if(debugmode==1 && !is_quiet)
             {
                 printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);
             }
@@ -3210,7 +3210,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;

-    if (debugmode==1 && !quiet)
+    if (debugmode==1 && !is_quiet)
     {
         std::string outstr = "";
         printf("\n\n[Debug: Dump Raw Input Tokens, format: %d]\n", file_format);
@@ -3355,7 +3355,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         printf("\n");
     }

-    if (debugmode==1 && !quiet)
+    if (debugmode==1 && !is_quiet)
     {
         std::string outstr = "";
         printf("\n[Debug: Dump Forwarded Input Tokens, format: %d]\n", file_format);
@@ -3404,7 +3404,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         draft_used = true;
         draft_results = speculative_decoding_eval_chunk(draft_ctx, llama_ctx_v4, embd, n_vocab, n_past);
         evalres = draft_results.draft_success;
-        if(debugmode==1 && !quiet)
+        if(debugmode==1 && !is_quiet)
         {
             std::string draftedtoks = get_tok_vec_str(draft_results.draftids);
             printf("\nDrafted %d Tokens: [%s]\n",speculative_chunk_amt,draftedtoks.c_str());
@@ -3607,7 +3607,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     if(draft_used)
     {
         int32_t draftedid = draft_results.draftids[logits_sampled];
-        if(debugmode==1 && !quiet)
+        if(debugmode==1 && !is_quiet)
         {
             std::string drafttok = FileFormatTokenizeID(draftedid, file_format, true);
             std::string realtok = FileFormatTokenizeID(id, file_format, true);
@@ -3660,7 +3660,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     {
         printf("\rGenerating (%d / %d tokens)", (kcpp_data->n_predict - remaining_tokens), kcpp_data->n_predict);
     }
-    if(debugmode==1 && !quiet && top_picks_history.size()>0)
+    if(debugmode==1 && !is_quiet && top_picks_history.size()>0)
     {
         printf(" [");
         bool firstloop = true;
@@ -3912,7 +3912,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         delayed_generated_tokens.pop_front();
     }

-    if(debugmode==1 && !quiet && file_format == FileFormat::GGUF_GENERIC)
+    if(debugmode==1 && !is_quiet && file_format == FileFormat::GGUF_GENERIC)
     {
         printf("\n");
         llama_perf_context_print(llama_ctx_v4);
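
The hunks in this file are mechanical: the file-scope flag is renamed from `quiet` to `is_quiet` (avoiding confusion with the removed per-request field), it is assigned once in `gpttype_load_model` instead of at the top of `gpttype_generate`, and every diagnostic keeps the same two-level gate. A small sketch of that gate follows; `debug_prints_enabled` is a hypothetical helper added here for illustration, since the real code repeats the condition inline:

#include <cstdio>

static int debugmode = 0;       // -1 = hide all, 0 = normal, 1 = showall
static bool is_quiet = false;   // set once at model load

// Hypothetical helper; gpttype_adapter.cpp writes this condition inline.
static bool debug_prints_enabled()
{
    return debugmode == 1 && !is_quiet;
}

int main()
{
    if (debug_prints_enabled())
    {
        printf("XTC penalties [...]\n");       // verbose diagnostics, debugmode 1 only
    }
    bool allow_regular_prints = (!is_quiet && debugmode != -1);
    if (allow_regular_prints)
    {
        printf("\rGenerating (1 / 8 tokens)"); // normal progress output
    }
}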

klite.embd

Lines changed: 18 additions & 4 deletions
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->

 <script>
-const LITEVER = 205;
+const LITEVER = 206;
 const urlParams = new URLSearchParams(window.location.search);
 var localflag = true;
 const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -5009,8 +5009,17 @@ initializeInstructUIFunctionality();
 function copyMarkdownCode(btn)
 {
     const codeContainer = btn.parentElement.querySelector('pre code');
-    //selectElementContents(codeContainer);
-    navigator.clipboard.writeText(codeContainer.innerText);
+    let innercode = codeContainer.innerText;
+    //remove common language descriptors from the start
+    let langsmatched = ["matlab","jsonc","powershell","ps1","haskell","hs","vbnet","vb","apache","apacheconf","makefile","mk","ini","protobuf","proto","typescript","tsx","markdown","md","mkdown","mkd","python","py","javascript","js","jsx","html","xhtml","xml","css","json","typescript","ts","tsx","bash","sh","zsh","java","csharp","cs","c","h","cpp","hpp","php","sql","ruby","rb","go","golang","kotlin","kt","swift","rust","rs","r","dart","scala","dockerfile","docker","yaml","yml","ini","toml","perl","pl","shell","console","powershell","ps1","lua","typescript","ts"];
+    for(let i = 0; i < langsmatched.length; ++i) {
+        let matcher = langsmatched[i]+"\n";
+        if (innercode.startsWith(matcher)) {
+            innercode = innercode.substring(matcher.length);
+            break;
+        }
+    }
+    navigator.clipboard.writeText(innercode);
 }

 function simpleMarkdown(text) {
@@ -13469,7 +13478,12 @@ initializeInstructUIFunctionality();
 if (document.getElementById("jailbreakprompt2") && document.getElementById("jailbreakprompt2").checked && document.getElementById("jailbreakprompttext2").value!="") {
     let addrole = document.getElementById("jailbreakprompttext2role").value;
     addrole = ((addrole==2)?"system":(addrole==1?"assistant":"user"));
-    oai_payload.messages.push({ "role": addrole, "content": document.getElementById("jailbreakprompttext2").value });
+    let postmsg = { "role": addrole, "content": document.getElementById("jailbreakprompttext2").value };
+    if(addrole=="assistant" && targetep.toLowerCase().includes("api.deepseek.com"))
+    {
+        postmsg["prefix"] = true;
+    }
+    oai_payload.messages.push(postmsg);
 }

 oaiemulatecompletionscontent = "";
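
The Lite changes are unrelated quality-of-life fixes bundled into the same commit: the copy button now strips a leading language identifier (a stray `python` line, say) that the markdown renderer can leave at the top of a copied code block, and an injected assistant-role message gains `"prefix": true` when the endpoint is api.deepseek.com, which DeepSeek's API uses to mark an assistant message for prefix continuation. Below is a transcription of the stripping idea into C++, kept in the same language as the sketches above; the tag list is abbreviated from the real one:

#include <iostream>
#include <string>
#include <vector>

// Same idea as copyMarkdownCode: if the copied text begins with a bare
// language tag on its own line, drop that line. List abbreviated here.
std::string strip_language_tag(std::string code)
{
    static const std::vector<std::string> langs = {
        "python", "py", "javascript", "js", "cpp", "c", "bash", "sh"
    };
    for (const auto & lang : langs)
    {
        const std::string matcher = lang + "\n";
        if (code.compare(0, matcher.size(), matcher) == 0) // startsWith
        {
            return code.substr(matcher.size());
        }
    }
    return code;
}

int main()
{
    std::cout << strip_language_tag("python\nprint('hi')\n"); // -> print('hi')
}

As in the JavaScript original, longer tags must be checked before their prefixes ("python" before "py") so the first match is the intended one.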
