@@ -108,6 +108,7 @@ static kcpp_params * kcpp_data = nullptr;
108108static int max_context_limit_at_load = 0 ;
109109static int n_past = 0 ;
110110static int debugmode = 0 ; // -1 = hide all, 0 = normal, 1 = showall
111+ static bool quiet = false ;
111112static std::vector<gpt_vocab::id> last_n_tokens;
112113static std::vector<gpt_vocab::id> current_context_tokens;
113114static size_t mem_per_token = 0 ;
@@ -936,12 +937,12 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
936937
937938 if (last_idx>1 ) // if there are 2 or more viable candidates
938939 {
939- if (debugmode==1 ) {
940+ if (debugmode==1 && !quiet ) {
940941 printf (" XTC penalties [" );
941942 }
942943 // then remove all other tokens above threshold EXCEPT the least likely one
943944 for (size_t i = 0 ; i < last_idx - 1 ; ++i) {
944- if (debugmode==1 )
945+ if (debugmode==1 && !quiet )
945946 {
946947 gpt_vocab::id token = candidates->data [i].id ;
947948 std::string tokenizedstr = FileFormatTokenizeID (token, file_format);
@@ -950,7 +951,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
950951 }
951952 candidates->data [i].logit -= 999 .0f ; // infinity gets wonky results downstream, this hack works well enough
952953 }
953- if (debugmode==1 ) {
954+ if (debugmode==1 && !quiet ) {
954955 printf (" ]\n " );
955956 }
956957 candidates->sorted = false ;
@@ -1139,7 +1140,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
11391140 max_exponent = FLOAT_MAX_LOG / std::log (penalty_base);
11401141 }
11411142
1142- if (debugmode==1 && !dry_max_token_repeat.empty ()) {
1143+ if (debugmode==1 && !quiet && ! dry_max_token_repeat.empty ()) {
11431144 printf (" DRY penalties [" );
11441145 }
11451146 size_t count = 0 ;
@@ -1150,7 +1151,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
11501151 repeat_exp = max_exponent;
11511152 }
11521153 float penalty = penalty_multiplier * pow (penalty_base, repeat_exp);
1153- if (debugmode==1 )
1154+ if (debugmode==1 && !quiet )
11541155 {
11551156 std::string tokenizedstr = FileFormatTokenizeID (token, file_format);
11561157 ::utreplace (tokenizedstr, " \n " , " \\ n" );
@@ -1163,7 +1164,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
11631164 {
11641165 candidates->sorted = false ;
11651166 }
1166- if (debugmode==1 && !dry_max_token_repeat.empty ()) {
1167+ if (debugmode==1 && !quiet && ! dry_max_token_repeat.empty ()) {
11671168 printf (" ]\n " );
11681169 }
11691170}
@@ -1694,7 +1695,7 @@ static void load_grammar(const std::string & gammarstr)
16941695 printf (" \n Ignored invalid grammar sampler." );
16951696 return ;
16961697 }
1697- if (debugmode==1 )
1698+ if (debugmode==1 && !quiet )
16981699 {
16991700 parsed_grammar.print (stderr);
17001701 }
@@ -1864,7 +1865,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
18641865 printf (" Trained max context length (value:%.d).\n " , n_ctx_train);
18651866 printf (" Desired context length (value:%.d).\n " , n_ctx_desired);
18661867
1867- if (debugmode==1 )
1868+ if (debugmode==1 && !quiet )
18681869 {
18691870 printf (" Solar context multiplier (value:%.3f).\n " , ctx_multiplier);
18701871 printf (" Chi context train (value:%.3f).\n " , chi_ctx_train_value);
@@ -1880,7 +1881,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
18801881 {
18811882 float extended_rope_positive_offset_value = 1 + ((log10f (chi_ctx_value) - log10f (chi_ctx_train_value)) / ((log10f (chi_ctx_value) * log10f (chi_ctx_train_value)) - (log10f (chi_ctx_value) + log10f (chi_ctx_train_value))));
18821883 float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
1883- // if(debugmode==1)
1884+ // if(debugmode==1 && !quiet )
18841885 // {
18851886 printf (" Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n " , extended_rope_positive_offset_value);
18861887 printf (" RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n " , rope_freq_base_with_positive_offset);
@@ -3034,13 +3035,13 @@ std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
30343035 printf (" \n Warning: KCPP text generation not initialized!\n " );
30353036 return toks;
30363037 }
3037- if (debugmode==1 )
3038+ if (debugmode==1 && !quiet )
30383039 {
30393040 printf (" \n FileFormat: %d, Tokenizing: %s" ,file_format ,input.c_str ());
30403041 }
30413042 TokenizeString (input, toks, file_format,addbos);
30423043 int tokcount = toks.size ();
3043- if (debugmode==1 )
3044+ if (debugmode==1 && !quiet )
30443045 {
30453046 printf (" \n Tokens Counted: %d\n " ,tokcount);
30463047 }
@@ -3125,6 +3126,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
31253126 llama_perf_context_reset (llama_ctx_v4);
31263127 }
31273128
3129+ quiet = inputs.quiet ;
31283130 generation_finished = false ; // Set current generation status
31293131 generated_tokens.clear (); // New Generation, new tokens
31303132 delayed_generated_tokens.clear ();
@@ -3203,7 +3205,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
32033205 banned_token_ids.clear ();
32043206 if (banned_tokens.size ()>0 )
32053207 {
3206- // if(debugmode==1)
3208+ // if(debugmode==1 && !quiet )
32073209 // {
32083210 printf (" \n Banning %zu single character sequences..." ,banned_tokens.size ());
32093211 // }
@@ -3220,16 +3222,18 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
32203222 }
32213223 }
32223224 }
3223- // if(debugmode==1)
3225+ // if(debugmode==1 && !quiet )
32243226 // {
32253227 printf (" \n Banned a total of %zu individual tokens.\n " ,banned_token_ids.size ());
32263228 // }
32273229 }
32283230
3229- // if(debugmode==1 && banned_phrases.size()>0)
32303231 if (banned_phrases.size ()>0 )
32313232 {
3233+ // if(debugmode==1 && !quiet)
3234+ // {
32323235 printf (" \n Banned a total of %zu phrases, with max token count of %d.\n " ,banned_phrases.size (),delayed_generated_tokens_limit);
3236+ // }
32333237 }
32343238
32353239 logit_biases.clear ();
@@ -3272,7 +3276,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
32723276 // images have changed. swap identifiers to force reprocessing
32733277 current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);
32743278 llava_composite_image_signature = new_llava_composite;
3275- if (debugmode==1 )
3279+ if (debugmode==1 && !quiet )
32763280 {
32773281 printf (" \n LLAVA images changed, existing cache invalidated" );
32783282 }
@@ -3328,10 +3332,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
33283332 const int MAX_CHAR_LEN = 40 ;
33293333 const int MAX_SEQ_LEN = 20 ;
33303334
3331- // if (debugmode == 1)
3332-
3335+ // if (debugmode == 1 && !quiet )
3336+ {
33333337 printf (" \n Processing %zu dry break strings..." , kcpp_data->dry_sequence_breakers .size ());
3334-
3338+ }
33353339 for (auto sequence_break : kcpp_data->dry_sequence_breakers )
33363340 {
33373341 if (sequence_break.size () > MAX_CHAR_LEN)
@@ -3340,7 +3344,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
33403344 }
33413345 GetOverlappingTokenSequences (sequence_break, dry_sequence_breakers, MAX_SEQ_LEN);
33423346 }
3343- if (debugmode == 1 )
3347+ if (debugmode == 1 && !quiet )
33443348 {
33453349 int trivial = 0 , non_trivial = 0 ;
33463350 for (const auto &seq : dry_sequence_breakers)
@@ -3360,9 +3364,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
33603364 }
33613365
33623366 bool stream_sse = inputs.stream_sse ;
3363-
3364- bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet ) || debugmode >= 1 ;
3365-
3367+ bool allow_regular_prints = (!quiet && debugmode!=-1 );
33663368
33673369 std::string grammarstr = inputs.grammar ;
33683370 bool grammar_retain_state = inputs.grammar_retain_state ;
@@ -3395,7 +3397,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
33953397 if (kcpp_data->seed <= 0 || kcpp_data->seed ==0xFFFFFFFF )
33963398 {
33973399 kcpp_data->seed = (((uint32_t )time (NULL )) % 1000000u );
3398- if (debugmode==1 )
3400+ if (debugmode==1 && !quiet )
33993401 {
34003402 printf (" \n Using Seed: %d" ,kcpp_data->seed );
34013403 }
@@ -3427,15 +3429,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
34273429 }
34283430 else
34293431 {
3430- if (debugmode==1 )
3432+ if (debugmode==1 && !quiet )
34313433 {
34323434 printf (" \n Creating clip image embed..." );
34333435 }
34343436 llava_images[i].clp_image_tokens = 0 ;
34353437 if (!llava_image_embed_make_with_clip_img (clp_ctx, kcpp_data->n_threads , clp_img_data, &llava_images[i].clp_img_embd , &llava_images[i].clp_image_tokens )) {
34363438 printf (" \n Error: Clip image %d failed to create embd!" ,i);
34373439 }
3438- if (debugmode==1 )
3440+ if (debugmode==1 && !quiet )
34393441 {
34403442 printf (" \n LLAVA Clip Embed %i used Tokens: %d" ,i,llava_images[i].clp_image_tokens );
34413443 }
@@ -3558,7 +3560,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
35583560 std::fill (last_n_tokens.begin (), last_n_tokens.end (), 0 );
35593561 n_past = 0 ;
35603562
3561- if (debugmode==1 )
3563+ if (debugmode==1 && !quiet )
35623564 {
35633565 std::string outstr = " " ;
35643566 printf (" \n\n [Debug: Dump Raw Input Tokens, format: %d]\n " , file_format);
@@ -3703,7 +3705,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
37033705 printf (" \n " );
37043706 }
37053707
3706- if (debugmode==1 )
3708+ if (debugmode==1 && !quiet )
37073709 {
37083710 std::string outstr = " " ;
37093711 printf (" \n [Debug: Dump Forwarded Input Tokens, format: %d]\n " , file_format);
@@ -3757,7 +3759,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
37573759 draft_used = true ;
37583760 draft_results = speculative_decoding_eval_chunk (draft_ctx, llama_ctx_v4, embd, n_vocab, n_past);
37593761 evalres = draft_results.draft_success ;
3760- if (debugmode==1 )
3762+ if (debugmode==1 && !quiet )
37613763 {
37623764 std::string draftedtoks = get_tok_vec_str (draft_results.draftids );
37633765 printf (" \n Drafted %d Tokens: [%s]\n " ,speculative_chunk_amt,draftedtoks.c_str ());
@@ -4052,7 +4054,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
40524054 if (draft_used)
40534055 {
40544056 int32_t draftedid = draft_results.draftids [logits_sampled];
4055- if (debugmode==1 )
4057+ if (debugmode==1 && !quiet )
40564058 {
40574059 std::string drafttok = FileFormatTokenizeID (draftedid, file_format, true );
40584060 std::string realtok = FileFormatTokenizeID (id, file_format, true );
@@ -4105,7 +4107,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
41054107 {
41064108 printf (" \r Generating (%d / %d tokens)" , (kcpp_data->n_predict - remaining_tokens), kcpp_data->n_predict );
41074109 }
4108- if (debugmode==1 && top_picks_history.size ()>0 )
4110+ if (debugmode==1 && !quiet && top_picks_history.size ()>0 )
41094111 {
41104112 printf (" [" );
41114113 bool firstloop = true ;
@@ -4370,7 +4372,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
43704372 delayed_generated_tokens.pop_front ();
43714373 }
43724374
4373- if (debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
4375+ if (debugmode==1 && !quiet && file_format == FileFormat::GGUF_GENERIC)
43744376 {
43754377 printf (" \n " );
43764378 llama_perf_context_print (llama_ctx_v4);
0 commit comments