@@ -106,6 +106,7 @@ static kcpp_params * kcpp_data = nullptr;
 static int max_context_limit_at_load = 0;
 static int n_past = 0;
 static int debugmode = 0; // -1 = hide all, 0 = normal, 1 = showall
+static bool quiet = false;
 static std::vector<gpt_vocab::id> last_n_tokens;
 static std::vector<gpt_vocab::id> current_context_tokens;
 static size_t mem_per_token = 0;
@@ -930,12 +931,12 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
 
     if (last_idx>1) // if there are 2 or more viable candidates
     {
-        if (debugmode==1) {
+        if (debugmode==1 && !quiet) {
             printf("XTC penalties [");
         }
         // then remove all other tokens above threshold EXCEPT the least likely one
         for (size_t i = 0; i < last_idx - 1; ++i) {
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 gpt_vocab::id token = candidates->data[i].id;
                 std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
@@ -944,7 +945,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
             }
             candidates->data[i].logit -= 999.0f; // infinity gets wonky results downstream, this hack works well enough
         }
-        if (debugmode==1) {
+        if (debugmode==1 && !quiet) {
             printf("]\n");
         }
         candidates->sorted = false;
@@ -1133,7 +1134,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
         max_exponent = FLOAT_MAX_LOG / std::log(penalty_base);
     }
 
-    if (debugmode==1 && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
         printf("DRY penalties [");
     }
     size_t count = 0;
@@ -1144,7 +1145,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
             repeat_exp = max_exponent;
         }
         float penalty = penalty_multiplier * pow(penalty_base, repeat_exp);
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
             ::utreplace(tokenizedstr, "\n", "\\n");
@@ -1157,7 +1158,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
     {
         candidates->sorted = false;
     }
-    if (debugmode==1 && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
         printf("]\n");
     }
 }
@@ -1688,7 +1689,7 @@ static void load_grammar(const std::string & gammarstr)
         printf("\nIgnored invalid grammar sampler.");
         return;
     }
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         parsed_grammar.print(stderr);
     }
@@ -1831,7 +1832,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
     float gradient_ai_rope_freq_base_value = powf(original_rope_base, log10f(chi_ctx_value) / log10f(chi_ctx_train_value));
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         printf("Trained max context length (value:%.d).\n", n_ctx_train);
         printf("Desired context length (value:%.d).\n", n_ctx_desired);
@@ -1848,7 +1849,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     {
         float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));
         float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
        {
             printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
             printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
@@ -2679,13 +2680,13 @@ std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
         printf("\nWarning: KCPP text generation not initialized!\n");
         return toks;
     }
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         printf("\nFileFormat: %d, Tokenizing: %s",file_format,input.c_str());
     }
     TokenizeString(input, toks, file_format, addbos);
     int tokcount = toks.size();
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         printf("\nTokens Counted: %d\n",tokcount);
     }
@@ -2770,6 +2771,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         llama_perf_context_reset(llama_ctx_v4);
     }
 
+    quiet = inputs.quiet;
     generation_finished = false; // Set current generation status
     generated_tokens.clear(); // New Generation, new tokens
     delayed_generated_tokens.clear();
@@ -2848,7 +2850,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     banned_token_ids.clear();
     if (banned_tokens.size()>0)
     {
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("\nBanning %zu single character sequences...",banned_tokens.size());
         }
@@ -2865,13 +2867,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 }
             }
         }
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("\nBanned a total of %zu individual tokens.\n",banned_token_ids.size());
         }
     }
 
-    if (debugmode==1 && banned_phrases.size()>0)
+    if (debugmode==1 && !quiet && banned_phrases.size()>0)
     {
         printf("\nBanned a total of %zu phrases, with max token count of %d.\n",banned_phrases.size(),delayed_generated_tokens_limit);
     }
@@ -2916,7 +2918,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         // images have changed. swap identifiers to force reprocessing
         current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);
         llava_composite_image_signature = new_llava_composite;
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("\nLLAVA images changed, existing cache invalidated");
         }
@@ -2972,7 +2974,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     const int MAX_CHAR_LEN = 40;
     const int MAX_SEQ_LEN = 20;
 
-    if (debugmode == 1)
+    if (debugmode == 1 && !quiet)
     {
         printf("\nProcessing %zu dry break strings...", kcpp_data->dry_sequence_breakers.size());
     }
@@ -2984,7 +2986,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         }
         GetOverlappingTokenSequences(sequence_break, dry_sequence_breakers, MAX_SEQ_LEN);
     }
-    if (debugmode == 1)
+    if (debugmode == 1 && !quiet)
     {
         int trivial = 0, non_trivial = 0;
         for (const auto &seq : dry_sequence_breakers)
@@ -3004,9 +3006,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }
 
     bool stream_sse = inputs.stream_sse;
-
-    bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
-
+    bool allow_regular_prints = (!quiet && debugmode!=-1);
 
     std::string grammarstr = inputs.grammar;
     bool grammar_retain_state = inputs.grammar_retain_state;
@@ -3039,7 +3039,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     if (kcpp_data->seed <= 0 || kcpp_data->seed==0xFFFFFFFF)
     {
         kcpp_data->seed = (((uint32_t)time(NULL)) % 1000000u);
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("\nUsing Seed: %d",kcpp_data->seed);
         }
@@ -3071,15 +3071,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         }
         else
         {
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 printf("\nCreating clip image embed...");
             }
             llava_images[i].clp_image_tokens = 0;
             if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_data->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {
                 printf("\nError: Clip image %d failed to create embd!",i);
             }
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);
             }
@@ -3202,7 +3202,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         std::string outstr = "";
         printf("\n\n[Debug: Dump Raw Input Tokens, format: %d]\n", file_format);
@@ -3347,7 +3347,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         printf("\n");
     }
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         std::string outstr = "";
         printf("\n[Debug: Dump Forwarded Input Tokens, format: %d]\n", file_format);
@@ -3396,7 +3396,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             draft_used = true;
             draft_results = speculative_decoding_eval_chunk(draft_ctx, llama_ctx_v4, embd, n_vocab, n_past);
             evalres = draft_results.draft_success;
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 std::string draftedtoks = get_tok_vec_str(draft_results.draftids);
                 printf("\nDrafted %d Tokens: [%s]\n",speculative_chunk_amt,draftedtoks.c_str());
@@ -3599,7 +3599,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         if (draft_used)
         {
             int32_t draftedid = draft_results.draftids[logits_sampled];
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 std::string drafttok = FileFormatTokenizeID(draftedid, file_format, true);
                 std::string realtok = FileFormatTokenizeID(id, file_format, true);
@@ -3652,7 +3652,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         {
             printf("\rGenerating (%d / %d tokens)", (kcpp_data->n_predict - remaining_tokens), kcpp_data->n_predict);
         }
-        if (debugmode==1 && top_picks_history.size()>0)
+        if (debugmode==1 && !quiet && top_picks_history.size()>0)
         {
             printf(" [");
             bool firstloop = true;
@@ -3904,7 +3904,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         delayed_generated_tokens.pop_front();
     }
 
-    if (debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
+    if (debugmode==1 && !quiet && file_format == FileFormat::GGUF_GENERIC)
     {
         printf("\n");
         llama_perf_context_print(llama_ctx_v4);
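Taken together, the change is mechanical but consistent: `gpttype_generate` caches the per-request flag into a file-scope global (`quiet = inputs.quiet;`), every `debugmode==1` guard around a diagnostic `printf` gains `&& !quiet`, and the old expression `(debugmode!=-1 && !inputs.quiet) || debugmode >= 1`, which let debug mode 1 force regular prints even for quiet requests, is simplified to `(!quiet && debugmode!=-1)`, so a quiet request now silences those prints as well. Below is a minimal standalone sketch of the resulting gating; the helper names `debug_log` and `progress_log` are illustrative only and do not exist in koboldcpp.

```cpp
// Sketch of the print gating after this patch (helper names are hypothetical).
#include <cstdarg>
#include <cstdio>

static int debugmode = 0;   // -1 = hide all, 0 = normal, 1 = showall
static bool quiet = false;  // per-request, copied from inputs.quiet

// Debug detail: printed only in showall mode, and never for a quiet request.
static void debug_log(const char *fmt, ...) {
    if (debugmode == 1 && !quiet) {
        va_list args;
        va_start(args, fmt);
        vprintf(fmt, args);
        va_end(args);
    }
}

// Regular progress output, mirroring allow_regular_prints after the patch:
// quiet now suppresses it even when debugmode == 1.
static void progress_log(const char *fmt, ...) {
    if (!quiet && debugmode != -1) {
        va_list args;
        va_start(args, fmt);
        vprintf(fmt, args);
        va_end(args);
    }
}

int main() {
    debugmode = 1;
    quiet = true;  // as if a request arrived with inputs.quiet set
    debug_log("\nDrafted %d Tokens\n", 8);                 // suppressed
    progress_log("\rGenerating (%d / %d tokens)", 1, 64);  // also suppressed
    quiet = false;
    progress_log("\rGenerating (%d / %d tokens)", 2, 64);  // printed
    return 0;
}
```

Keeping `quiet` in a file-scope global mirrors how the patch threads the flag into helpers like `sample_xtc` and `sample_dry` without changing their signatures.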