From 17855ff1c29e89bda17013318e76d53561f58975 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Tue, 7 Oct 2025 14:25:14 +0200
Subject: [PATCH 1/2] llama : add normalized field to llama_token_data_array
 struct

This commit adds a 'normalized' field to the llama_token_data_array
struct to indicate whether the probabilities have been computed and
normalized from the logits.

The motivation for this change is to avoid redundant normalization
calls in the sampling code, as the softmax calculation can be expensive
depending on the size of the llama_token_data array.

Samplers that modify logits or filter tokens (change the size) must set
normalized to false to invalidate cached probabilities. Samplers that
compute probabilities set it to true after normalization.
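As an illustration, this is the pattern each sampler follows (a minimal
sketch, not code from this patch; my_sampler_apply is a hypothetical
name):

    static void my_sampler_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
        (void) smpl; // unused in this sketch

        // compute the probabilities only if no earlier sampler in the
        // chain has already normalized this array
        if (!cur_p->normalized) {
            llama_sampler_softmax_impl(cur_p, true);
        }

        // ... consume cur_p->data[i].p ...

        // any change to the logits or to cur_p->size makes the cached
        // probabilities stale, so invalidate them
        cur_p->normalized = false;
    }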
---
 common/sampling.cpp                  |  4 +-
 examples/diffusion/diffusion-cli.cpp |  3 ++
 examples/speculative/speculative.cpp |  2 +-
 include/llama.h                      |  1 +
 src/llama-grammar.cpp                |  1 +
 src/llama-sampling.cpp               | 64 +++++++++++++++++++++++++---
 tests/test-sampling.cpp              |  8 ++--
 7 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index c69d525b5b358..148af567ec477 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -126,7 +126,7 @@ struct common_sampler {
             cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
         }
 
-        cur_p = { cur.data(), cur.size(), -1, false };
+        cur_p = { cur.data(), cur.size(), false, -1, false };
     }
 };
 
@@ -360,7 +360,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 
     // check if it the sampled token fits the grammar
     {
        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-       llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+       llama_token_data_array single_token_data_array = { &single_token_data, 1, false, -1, false };
 
        llama_sampler_apply(grmr, &single_token_data_array);
 
diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp
index 273942a165ed0..3ed79df95e6fd 100644
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -404,6 +404,7 @@ static void diffusion_generate(llama_context * ctx,
         llama_token_data_array cur_p = {
             candidates.data(),
             (size_t) n_vocab,
+            false, // normalized
             -1,
             false,
         };
@@ -429,6 +430,7 @@ static void diffusion_generate(llama_context * ctx,
             llama_token_data_array cur_p = {
                 candidates.data(),
                 candidates.size(),
+                false, // normalized
                 -1,
                 false,
             };
@@ -472,6 +474,7 @@ static void diffusion_generate(llama_context * ctx,
         llama_token_data_array conf_array = {
             conf_candidates.data(),
             conf_candidates.size(),
+            false, // normalized
             -1,
             false,
         };
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 5f5ac5eb64d38..75337968e2c9b 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -269,7 +269,7 @@ int main(int argc, char ** argv) {
                     LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
 
                     float r = u_dist(rng);
-                    llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
+                    llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), false, LLAMA_TOKEN_NULL, true };
 
                     //GGML_ASSERT(dist_tgt.size <= dist_dft.size);
 
diff --git a/include/llama.h b/include/llama.h
index a0a660bff88da..55c9fbc2d928b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -205,6 +205,7 @@ extern "C" {
         // NOTE: this pointer can be modified by the samplers
         llama_token_data * data;
         size_t size;
+        bool normalized;  // true if the probabilities (llama_token_data.p) have been computed
         int64_t selected; // this is the index in the data array (i.e. not the token id)
         bool sorted;      // note: do not assume the data is sorted - always check this flag
     } llama_token_data_array;
 
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index bed706bb248d1..f11a22c15f857 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -1156,6 +1156,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
     for (const auto & reject : rejects) {
         cur_p->data[reject.index].logit = -INFINITY;
     }
+    cur_p->normalized = false;
 }
 
 void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 2186f827bf543..93b52a7b83b58 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -260,6 +260,7 @@ static void llama_log_softmax(float * array, size_t size) {
 */
 
 static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
+    cur_p->normalized = false;
     if (temp <= 0.0f) {
         // find the token with the highest logit and set the rest to -inf
         size_t max_i = 0;
@@ -309,6 +310,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_s
     for (size_t i = 0; i < cur_p->size; ++i) {
         cur_p->data[i].p /= cum_sum;
     }
+    cur_p->normalized = true;
 }
 
 static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
@@ -328,6 +330,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     }
 
     cur_p->size = k;
+    cur_p->normalized = false;
 }
 
 static uint32_t get_rng_seed(uint32_t seed) {
@@ -422,6 +425,7 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
     llama_token_data_array cur_p = {
         /* .data       = */ cur.data(),
         /* .size       = */ cur.size(),
+        /* .normalized = */ false,
         /* .selected   = */ -1,
         /* .sorted     = */ false,
     };
@@ -614,6 +618,23 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
 
     if (cur_p->size == 1) {
         cur_p->data[0].p = 1.0f;
+        cur_p->normalized = true;
+        return;
+    }
+
+    if (cur_p->normalized) {
+        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+        const double rnd = dist(ctx->rng);
+        double sum_run = 0.0;
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            sum_run += cur_p->data[i].p;
+            if (sum_run >= rnd) {
+                cur_p->selected = i;
+                return;
+            }
+        }
+        cur_p->selected = cur_p->size - 1;
         return;
     }
 
@@ -663,6 +684,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
     if (!found) {
         cur_p->selected = cur_p->size - 1;
     }
+    cur_p->normalized = true;
 #else
     // for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling
     for (size_t i = 0; i < cur_p->size; ++i) {
@@ -670,6 +692,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
     }
 
     cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
+    cur_p->normalized = true;
 #endif
 }
 
@@ -780,7 +803,9 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
         return;
     }
 
-    llama_sampler_softmax_impl(cur_p, false);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, false);
+    }
 
     size_t k = cur_p->size;
     auto * pdata = cur_p->data;
@@ -826,6 +851,7 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
     }
 
     cur_p->size = last_idx;
+    cur_p->normalized = false;
 }
 
 static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) {
@@ -897,6 +923,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
         if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
             std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
             cur_p->size = filtered_tokens.size();
+            cur_p->normalized = false;
             min_p_applied = true;
         }
     }
@@ -919,6 +946,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
 
         // Resize the output vector to keep only the matching tokens
         cur_p->size = i;
+        cur_p->normalized = false;
     }
 }
 
@@ -971,7 +999,9 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
     }
 
     // Compute the softmax of logits and calculate entropy
-    llama_sampler_softmax_impl(cur_p, true);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, true);
+    }
 
     float entropy = 0.0f;
     for (size_t i = 0; i < cur_p->size; ++i) {
@@ -1019,6 +1049,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
     std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
     cur_p->size = cur_p_new.size();
     cur_p->sorted = false;
+    cur_p->normalized = false;
 }
 
 static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) {
@@ -1120,7 +1151,9 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
         // Calculate maximum possible entropy
         float max_entropy = -logf(1.0f / cur_p->size);
 
-        llama_sampler_softmax_impl(cur_p, true);
+        if (!cur_p->normalized) {
+            llama_sampler_softmax_impl(cur_p, true);
+        }
 
         // Calculate entropy of the softmax probabilities
         float entropy = 0.0f;
@@ -1162,6 +1195,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
         for (size_t i = 0; i < cur_p->size; ++i) {
             cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
         }
+        cur_p->normalized = true;
 
 #ifdef DEBUG
         // Print the updated top 25 probabilities after temperature scaling
@@ -1236,7 +1270,9 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
         return;
     }
 
-    llama_sampler_softmax_impl(cur_p, true);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, true);
+    }
 
     int pos_last = 0;
 
@@ -1251,6 +1287,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
     if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
         cur_p->data += pos_last;
         cur_p->size -= pos_last;
+        cur_p->normalized = false;
     }
 }
 
@@ -1327,7 +1364,9 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*s
 static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p, true);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, true);
+    }
 
     // Estimate s_hat using the most probable m tokens
     float s_hat = 0.0;
@@ -1433,7 +1472,9 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler *
 static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p, true);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, true);
+    }
 
     // Truncate the words with surprise values greater than mu
     cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
@@ -1775,6 +1816,7 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
     }
 
     cur_p->sorted = false;
+    cur_p->normalized = false;
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
@@ -2193,6 +2235,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
     }
 
     cur_p->sorted = false;
+    cur_p->normalized = false;
 }
 
 static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
@@ -2344,6 +2387,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
     }
 
     if (ctx->to_search.empty()) {
+        cur_p->normalized = false;
         return;
     }
 
@@ -2356,6 +2400,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
             }
         }
     }
+    cur_p->normalized = false;
 }
 
 static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
@@ -2408,7 +2453,9 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smp
 static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_infill *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p, true);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, true);
+    }
 
 #if defined(GGML_DEBUG_SAMPLER_INFILL)
 #define LOG_DBG_CUR LLAMA_LOG_DEBUG
@@ -2457,6 +2504,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         for (size_t i = 0; i < cur_p->size; ++i) {
             cur_p->data[i].p /= p_sum;
         }
+        cur_p->normalized = true;
 
         return;
     }
@@ -2542,6 +2590,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         cur_p->size = 1;
         cur_p->data[0].id = ctx->vocab->token_eot();
         cur_p->data[0].logit = 1.0f;
+        cur_p->normalized = true;
 
         return;
     }
@@ -2579,6 +2628,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     for (size_t i = 0; i < cur_p->size; ++i) {
         LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
+    cur_p->normalized = true;
 
 #undef LOG_DBG_CUR
 }
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 7cd96c5cd351c..44a8cd56421fc 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -28,7 +28,7 @@ struct sampler_tester {
             cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
         }
 
-        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+        cur_p = llama_token_data_array { cur.data(), cur.size(), false, -1, false };
     }
 
     sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
@@ -38,7 +38,7 @@ struct sampler_tester {
             cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
         }
 
-        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+        cur_p = llama_token_data_array { cur.data(), cur.size(), false, -1, false };
     }
 
     void apply(llama_sampler * sampler) {
@@ -270,13 +270,13 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
 static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
     std::vector<llama_token_data> cur(data.size());
     std::copy(data.begin(), data.end(), cur.begin());
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false, -1, false };
     llama_sampler_apply(cnstr, &cur_p);
     llama_sampler_reset(cnstr);
     const int64_t t_start = ggml_time_us();
     for (int i = 0; i < n_iter; i++) {
         std::copy(data.begin(), data.end(), cur.begin());
-        llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+        llama_token_data_array cur_p = { cur.data(), cur.size(), false, -1, false };
         llama_sampler_apply(cnstr, &cur_p);
         llama_sampler_reset(cnstr);
     }

From 21d44e7f1b63aff4e4ff2ac01724ec8b872813a9 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Tue, 7 Oct 2025 17:43:46 +0200
Subject: [PATCH 2/2] fix normalized init/update in test-grammar-llguidance.cpp

---
 tests/test-grammar-llguidance.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test-grammar-llguidance.cpp b/tests/test-grammar-llguidance.cpp
index 566b039a07038..f368810f7af01 100644
--- a/tests/test-grammar-llguidance.cpp
+++ b/tests/test-grammar-llguidance.cpp
@@ -21,7 +21,7 @@ static bool match_string(const std::string & input, llama_sampler * grammar) {
     for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
         cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f });
     }
-    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false };
+    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), false, -1, false };
 
     for (const auto token : tokens) {
         for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
@@ -1096,6 +1096,7 @@ static void one_hot(llama_token_data_array & tok_arr, llama_token selected) {
     }
 
     tok_arr.data[selected].logit = 100.0f;
+    tok_arr.normalized = false;
 }
 
 static void test_sampler_chain(void) {
@@ -1119,7 +1120,7 @@ start: /[A-Z ]*/)";
     for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
         cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f });
     }
-    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false };
+    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), false, -1, false };
 
     for (const auto token : tokens) {
         one_hot(tok_arr, token);