@@ -159,7 +159,7 @@ llama_tokens common_speculative_gen_draft(
     const float p_decay = floorf((params.p_min - p_min) * 10000) / 100;              // Next 2 decimal places
     const int   n_min   = roundf((params.p_min - p_min - (p_decay / 100)) * 100000); // Last digit
 
-    printf("p_min=%f, p_decay=%f, n_min=%d\n", p_min, p_decay, n_min);
+    LOG_DBG("%s: p_min = %f, p_decay = %f, n_min = %d\n", __func__, p_min, p_decay, n_min);
 
     // reuse as much as possible from the old draft context
     // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
@@ -277,7 +277,7 @@ llama_tokens common_speculative_gen_draft(
 
     const float threshold_p = p_min * pow(std::max((int) result.size() - std::max(n_min, 1) + 1, 1), -p_decay);
 
-    printf("sequence_p=%f, threshold_p=%f\n", sequence_p, threshold_p);
+    LOG_DBG("%s: sequence_p = %f, threshold_p = %f\n", __func__, sequence_p, threshold_p);
 
     // only collect very high-confidence draft tokens
     if (sequence_p < threshold_p) {
@@ -292,7 +292,7 @@ llama_tokens common_speculative_gen_draft(
         prompt.push_back(id);
     }
 
-    printf("result.size()=%d, sequence_p=%f\n", result.size(), sequence_p);
+    LOG_DBG("%s: n_result = %d, sequence_p = %f\n", __func__, (int) result.size(), sequence_p);
 
     return result;
 }
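For context, the three values logged in the first hunk are all unpacked from the decimal digits of a single float parameter: `params.p_min` carries the base probability in its first two decimal places, the decay exponent in the next two, and the minimum length in the fifth. Below is a minimal, self-contained sketch of that decoding and of the length-dependent threshold it feeds. The extraction of `p_min` itself happens before the hunk shown above, so that first step and the packed example value are assumptions, not code from the patch.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    // Hypothetical packed value: p_min = 0.75, p_decay = 0.50, n_min = 3,
    // encoded as 0.75 + 0.50/100 + 3/100000 = 0.75503.
    const float packed = 0.75503f;

    const float p_min   = floorf(packed * 100) / 100;                                // First 2 decimal places (assumed; precedes the hunk)
    const float p_decay = floorf((packed - p_min) * 10000) / 100;                    // Next 2 decimal places
    const int   n_min   = (int) roundf((packed - p_min - (p_decay / 100)) * 100000); // Last digit

    printf("p_min = %.2f, p_decay = %.2f, n_min = %d\n", p_min, p_decay, n_min);

    // The acceptance threshold relaxes as the draft grows: the n-th drafted
    // token is kept only while the running sequence probability stays above
    // p_min * (n - max(n_min, 1) + 1)^(-p_decay).
    for (int n = 1; n <= 8; ++n) {
        const float threshold_p = p_min * pow(std::max(n - std::max(n_min, 1) + 1, 1), -p_decay);
        printf("n = %2d, threshold_p = %f\n", n, threshold_p);
    }
    return 0;
}

With these example values the threshold holds at 0.75 for the first three tokens and then decays as 0.75 * k^(-0.5), so longer drafts are allowed progressively lower cumulative confidence before common_speculative_gen_draft stops collecting tokens.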