From 17855ff1c29e89bda17013318e76d53561f58975 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Tue, 7 Oct 2025 14:25:14 +0200
Subject: [PATCH 1/2] llama : add normalized field to llama_token_data_array
 struct

This commit adds a 'normalized' field to the llama_token_data_array
struct to indicate whether the probabilities have been computed and
normalized from the logits.

The motivation for this change is to avoid redundant normalization
calls in the sampling code, as the softmax calculation can be expensive
depending on the size of the llama_token_data array.

Samplers that modify logits or filter tokens (change the size) must set
normalized to false to invalidate cached probabilities. Samplers that
compute probabilities set it to true after normalization.
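As an illustration, this is the pattern each sampler follows (a minimal
sketch, not code from this patch; my_sampler_apply is a hypothetical
name):

    static void my_sampler_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
        (void) smpl; // unused in this sketch

        // compute the probabilities only if no earlier sampler in the
        // chain has already normalized this array
        if (!cur_p->normalized) {
            llama_sampler_softmax_impl(cur_p, true);
        }

        // ... consume cur_p->data[i].p ...

        // any change to the logits or to cur_p->size makes the cached
        // probabilities stale, so invalidate them
        cur_p->normalized = false;
    }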
---
 common/sampling.cpp                  |  4 +-
 examples/diffusion/diffusion-cli.cpp |  3 ++
 examples/speculative/speculative.cpp |  2 +-
 include/llama.h                      |  1 +
 src/llama-grammar.cpp                |  1 +
 src/llama-sampling.cpp               | 64 +++++++++++++++++++++++++---
 tests/test-sampling.cpp              |  8 ++--
 7 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index c69d525b5b358..148af567ec477 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -126,7 +126,7 @@ struct common_sampler {
             cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
         }
 
-        cur_p = { cur.data(), cur.size(), -1, false };
+        cur_p = { cur.data(), cur.size(), false, -1, false };
     }
 };
 
@@ -360,7 +360,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 
     // check if it the sampled token fits the grammar
     {
        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-       llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+       llama_token_data_array single_token_data_array = { &single_token_data, 1, false, -1, false };
 
        llama_sampler_apply(grmr, &single_token_data_array);
 
diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp
index 273942a165ed0..3ed79df95e6fd 100644
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -404,6 +404,7 @@ static void diffusion_generate(llama_context * ctx,
         llama_token_data_array cur_p = {
             candidates.data(),
             (size_t) n_vocab,
+            false, // normalized
             -1,
             false,
         };
@@ -429,6 +430,7 @@ static void diffusion_generate(llama_context * ctx,
             llama_token_data_array cur_p = {
                 candidates.data(),
                 candidates.size(),
+                false, // normalized
                 -1,
                 false,
             };
@@ -472,6 +474,7 @@ static void diffusion_generate(llama_context * ctx,
         llama_token_data_array conf_array = {
             conf_candidates.data(),
             conf_candidates.size(),
+            false, // normalized
             -1,
             false,
         };
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 5f5ac5eb64d38..75337968e2c9b 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -269,7 +269,7 @@ int main(int argc, char ** argv) {
                     LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
 
                     float r = u_dist(rng);
-                    llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
+                    llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), false, LLAMA_TOKEN_NULL, true };
 
                     //GGML_ASSERT(dist_tgt.size <= dist_dft.size);
 
diff --git a/include/llama.h b/include/llama.h
index a0a660bff88da..55c9fbc2d928b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -205,6 +205,7 @@ extern "C" {
         // NOTE: this pointer can be modified by the samplers
         llama_token_data * data;
         size_t size;
+        bool normalized;  // true if the probabilities (llama_token_data.p) have been computed
         int64_t selected; // this is the index in the data array (i.e. not the token id)
         bool sorted;      // note: do not assume the data is sorted - always check this flag
     } llama_token_data_array;
 
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index bed706bb248d1..f11a22c15f857 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -1156,6 +1156,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
     for (const auto & reject : rejects) {
         cur_p->data[reject.index].logit = -INFINITY;
     }
+    cur_p->normalized = false;
 }
 
 void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 2186f827bf543..93b52a7b83b58 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -260,6 +260,7 @@ static void llama_log_softmax(float * array, size_t size) {
 */
 
 static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
+    cur_p->normalized = false;
     if (temp <= 0.0f) {
         // find the token with the highest logit and set the rest to -inf
         size_t max_i = 0;
@@ -309,6 +310,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_s
     for (size_t i = 0; i < cur_p->size; ++i) {
         cur_p->data[i].p /= cum_sum;
     }
+    cur_p->normalized = true;
 }
 
 static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
@@ -328,6 +330,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     }
 
     cur_p->size = k;
+    cur_p->normalized = false;
 }
 
 static uint32_t get_rng_seed(uint32_t seed) {
@@ -422,6 +425,7 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
     llama_token_data_array cur_p = {
         /* .data       = */ cur.data(),
         /* .size       = */ cur.size(),
+        /* .normalized = */ false,
         /* .selected   = */ -1,
         /* .sorted     = */ false,
     };
@@ -614,6 +618,23 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
 
     if (cur_p->size == 1) {
         cur_p->data[0].p = 1.0f;
+        cur_p->normalized = true;
+        return;
+    }
+
+    if (cur_p->normalized) {
+        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+        const double rnd = dist(ctx->rng);
+        double sum_run = 0.0;
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            sum_run += cur_p->data[i].p;
+            if (sum_run >= rnd) {
+                cur_p->selected = i;
+                return;
+            }
+        }
+        cur_p->selected = cur_p->size - 1;
         return;
     }
 
@@ -663,6 +684,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
     if (!found) {
         cur_p->selected = cur_p->size - 1;
     }
+    cur_p->normalized = true;
 #else
     // for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling
     for (size_t i = 0; i < cur_p->size; ++i) {
@@ -670,6 +692,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
     }
 
     cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
+    cur_p->normalized = true;
 #endif
 }
 
@@ -780,7 +803,9 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
         return;
     }
 
-    llama_sampler_softmax_impl(cur_p, false);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, false);
+    }
 
     size_t k = cur_p->size;
     auto * pdata = cur_p->data;
@@ -826,6 +851,7 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
     }
 
     cur_p->size = last_idx;
+    cur_p->normalized = false;
 }
 
 static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) {
@@ -897,6 +923,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
         if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
             std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
             cur_p->size = filtered_tokens.size();
+            cur_p->normalized = false;
             min_p_applied = true;
         }
     }
@@ -919,6 +946,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
 
         // Resize the output vector to keep only the matching tokens
         cur_p->size = i;
+        cur_p->normalized = false;
     }
 }
 
@@ -971,7 +999,9 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
     }
 
     // Compute the softmax of logits and calculate entropy
-    llama_sampler_softmax_impl(cur_p, true);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, true);
+    }
 
     float entropy = 0.0f;
     for (size_t i = 0; i < cur_p->size; ++i) {
@@ -1019,6 +1049,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
     std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
     cur_p->size = cur_p_new.size();
     cur_p->sorted = false;
+    cur_p->normalized = false;
 }
 
 static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) {
@@ -1120,7 +1151,9 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
         // Calculate maximum possible entropy
         float max_entropy = -logf(1.0f / cur_p->size);
 
-        llama_sampler_softmax_impl(cur_p, true);
+        if (!cur_p->normalized) {
+            llama_sampler_softmax_impl(cur_p, true);
+        }
 
         // Calculate entropy of the softmax probabilities
         float entropy = 0.0f;
@@ -1162,6 +1195,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
         for (size_t i = 0; i < cur_p->size; ++i) {
             cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
         }
+        cur_p->normalized = true;
 
 #ifdef DEBUG
         // Print the updated top 25 probabilities after temperature scaling
@@ -1236,7 +1270,9 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
         return;
     }
 
-    llama_sampler_softmax_impl(cur_p, true);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, true);
+    }
 
     int pos_last = 0;
 
@@ -1251,6 +1287,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
     if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
         cur_p->data += pos_last;
         cur_p->size -= pos_last;
+        cur_p->normalized = false;
     }
 }
 
@@ -1327,7 +1364,9 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*s
 static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p, true);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, true);
+    }
 
     // Estimate s_hat using the most probable m tokens
     float s_hat = 0.0;
@@ -1433,7 +1472,9 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler *
 static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p, true);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, true);
+    }
 
     // Truncate the words with surprise values greater than mu
     cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
@@ -1775,6 +1816,7 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
     }
 
     cur_p->sorted = false;
+    cur_p->normalized = false;
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
@@ -2193,6 +2235,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
     }
 
     cur_p->sorted = false;
+    cur_p->normalized = false;
 }
 
 static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
@@ -2344,6 +2387,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
     }
 
     if (ctx->to_search.empty()) {
+        cur_p->normalized = false;
         return;
     }
 
@@ -2356,6 +2400,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
             }
         }
     }
+    cur_p->normalized = false;
 }
 
 static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
@@ -2408,7 +2453,9 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smp
 static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_infill *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p, true);
+    if (!cur_p->normalized) {
+        llama_sampler_softmax_impl(cur_p, true);
+    }
 
 #if defined(GGML_DEBUG_SAMPLER_INFILL)
 #define LOG_DBG_CUR LLAMA_LOG_DEBUG
@@ -2457,6 +2504,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         for (size_t i = 0; i < cur_p->size; ++i) {
             cur_p->data[i].p /= p_sum;
         }
+        cur_p->normalized = true;
 
         return;
     }
@@ -2542,6 +2590,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         cur_p->size = 1;
         cur_p->data[0].id = ctx->vocab->token_eot();
         cur_p->data[0].logit = 1.0f;
+        cur_p->normalized = true;
 
         return;
     }
@@ -2579,6 +2628,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     for (size_t i = 0; i < cur_p->size; ++i) {
         LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
+    cur_p->normalized = true;
 
 #undef LOG_DBG_CUR
 }
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 7cd96c5cd351c..44a8cd56421fc 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -28,7 +28,7 @@ struct sampler_tester {
             cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
         }
 
-        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+        cur_p = llama_token_data_array { cur.data(), cur.size(), false, -1, false };
     }
 
     sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
@@ -38,7 +38,7 @@ struct sampler_tester {
             cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
         }
 
-        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+        cur_p = llama_token_data_array { cur.data(), cur.size(), false, -1, false };
     }
 
     void apply(llama_sampler * sampler) {
@@ -270,13 +270,13 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
 static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
     std::vector<llama_token_data> cur(data.size());
     std::copy(data.begin(), data.end(), cur.begin());
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false, -1, false };
     llama_sampler_apply(cnstr, &cur_p);
     llama_sampler_reset(cnstr);
     const int64_t t_start = ggml_time_us();
     for (int i = 0; i < n_iter; i++) {
         std::copy(data.begin(), data.end(), cur.begin());
-        llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+        llama_token_data_array cur_p = { cur.data(), cur.size(), false, -1, false };
         llama_sampler_apply(cnstr, &cur_p);
         llama_sampler_reset(cnstr);
     }

From 21d44e7f1b63aff4e4ff2ac01724ec8b872813a9 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Tue, 7 Oct 2025 17:43:46 +0200
Subject: [PATCH 2/2] fix normalized init/update in test-grammar-llguidance.cpp

---
 tests/test-grammar-llguidance.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test-grammar-llguidance.cpp b/tests/test-grammar-llguidance.cpp
index 566b039a07038..f368810f7af01 100644
--- a/tests/test-grammar-llguidance.cpp
+++ b/tests/test-grammar-llguidance.cpp
@@ -21,7 +21,7 @@ static bool match_string(const std::string & input, llama_sampler * grammar) {
     for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
         cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f });
     }
-    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false };
+    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), false, -1, false };
 
     for (const auto token : tokens) {
         for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
@@ -1096,6 +1096,7 @@ static void one_hot(llama_token_data_array & tok_arr, llama_token selected) {
     }
 
     tok_arr.data[selected].logit = 100.0f;
+    tok_arr.normalized = false;
 }
 
 static void test_sampler_chain(void) {
@@ -1119,7 +1120,7 @@ start: /[A-Z ]*/)";
     for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
         cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f });
     }
-    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false };
+    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), false, -1, false };
 
     for (const auto token : tokens) {
         one_hot(tok_arr, token);