Skip to content

Commit 17855ff

Browse files
committed
llama : add normalized field to llama_token_data_array struct
This commit adds a 'normalized' field to the llama_token_data_array struct to indicate whether the probabilities have been computed and normalized from the logits. The motivation for this change is to avoid redundant normalization calls in the sampling code, as the softmax calculation can be expensive depending on the size of the llama_token_data array. Samplers that modify logits or filter tokens (change the size) must set normalized to false to invalidate cached probabilities. Samplers that compute probabilities set it to true after normalization.
1 parent df1b612 commit 17855ff

File tree

7 files changed

+69
-14
lines changed

7 files changed

+69
-14
lines changed

common/sampling.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ struct common_sampler {
126126
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
127127
}
128128

129-
cur_p = { cur.data(), cur.size(), -1, false };
129+
cur_p = { cur.data(), cur.size(), false, -1, false };
130130
}
131131
};
132132

@@ -360,7 +360,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
360360
// check if it the sampled token fits the grammar
361361
{
362362
llama_token_data single_token_data = { id, 1.0f, 0.0f };
363-
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
363+
llama_token_data_array single_token_data_array = { &single_token_data, 1, false, -1, false };
364364

365365
llama_sampler_apply(grmr, &single_token_data_array);
366366

examples/diffusion/diffusion-cli.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,7 @@ static void diffusion_generate(llama_context * ctx,
404404
llama_token_data_array cur_p = {
405405
candidates.data(),
406406
(size_t) n_vocab,
407+
false, // normalized
407408
-1,
408409
false,
409410
};
@@ -429,6 +430,7 @@ static void diffusion_generate(llama_context * ctx,
429430
llama_token_data_array cur_p = {
430431
candidates.data(),
431432
candidates.size(),
433+
false, // normalized
432434
-1,
433435
false,
434436
};
@@ -472,6 +474,7 @@ static void diffusion_generate(llama_context * ctx,
472474
llama_token_data_array conf_array = {
473475
conf_candidates.data(),
474476
conf_candidates.size(),
477+
false,
475478
-1,
476479
false,
477480
};

examples/speculative/speculative.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ int main(int argc, char ** argv) {
269269

270270
LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
271271
float r = u_dist(rng);
272-
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
272+
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), false, LLAMA_TOKEN_NULL, true };
273273

274274
//GGML_ASSERT(dist_tgt.size <= dist_dft.size);
275275

include/llama.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ extern "C" {
205205
// NOTE: this pointer can be modified by the samplers
206206
llama_token_data * data;
207207
size_t size;
208+
bool normalized; // true if the probabilities (llama_token_data.p) have been computed
208209
int64_t selected; // this is the index in the data array (i.e. not the token id)
209210
bool sorted; // note: do not assume the data is sorted - always check this flag
210211
} llama_token_data_array;

src/llama-grammar.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,6 +1156,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
11561156
for (const auto & reject : rejects) {
11571157
cur_p->data[reject.index].logit = -INFINITY;
11581158
}
1159+
cur_p->normalized = false;
11591160
}
11601161

11611162
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {

src/llama-sampling.cpp

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ static void llama_log_softmax(float * array, size_t size) {
260260
*/
261261

262262
static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
263+
cur_p->normalized = false;
263264
if (temp <= 0.0f) {
264265
// find the token with the highest logit and set the rest to -inf
265266
size_t max_i = 0;
@@ -309,6 +310,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_s
309310
for (size_t i = 0; i < cur_p->size; ++i) {
310311
cur_p->data[i].p /= cum_sum;
311312
}
313+
cur_p->normalized = true;
312314
}
313315

314316
static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
@@ -328,6 +330,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
328330
}
329331

330332
cur_p->size = k;
333+
cur_p->normalized = false;
331334
}
332335

333336
static uint32_t get_rng_seed(uint32_t seed) {
@@ -422,6 +425,7 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
422425
llama_token_data_array cur_p = {
423426
/* .data = */ cur.data(),
424427
/* .size = */ cur.size(),
428+
/* .normalized = */ false,
425429
/* .selected = */ -1,
426430
/* .sorted = */ false,
427431
};
@@ -614,6 +618,23 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
614618

615619
if (cur_p->size == 1) {
616620
cur_p->data[0].p = 1.0f;
621+
cur_p->normalized = true;
622+
return;
623+
}
624+
625+
if (cur_p->normalized) {
626+
std::uniform_real_distribution<double> dist(0.0f, 1.0f);
627+
const double rnd = dist(ctx->rng);
628+
double sum_run = 0.0f;
629+
630+
for (size_t i = 0; i < cur_p->size; ++i) {
631+
sum_run += cur_p->data[i].p;
632+
if (sum_run >= rnd) {
633+
cur_p->selected = i;
634+
return;
635+
}
636+
}
637+
cur_p->selected = cur_p->size - 1;
617638
return;
618639
}
619640

@@ -663,13 +684,15 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
663684
if (!found) {
664685
cur_p->selected = cur_p->size - 1;
665686
}
687+
cur_p->normalized = true;
666688
#else
667689
// for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling
668690
for (size_t i = 0; i < cur_p->size; ++i) {
669691
cur_p->data[i].p /= sum_cum;
670692
}
671693
672694
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
695+
cur_p->normalized = true;
673696
#endif
674697
}
675698

@@ -780,7 +803,9 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
780803
return;
781804
}
782805

783-
llama_sampler_softmax_impl(cur_p, false);
806+
if (!cur_p->normalized) {
807+
llama_sampler_softmax_impl(cur_p, false);
808+
}
784809

785810
size_t k = cur_p->size;
786811
auto * pdata = cur_p->data;
@@ -826,6 +851,7 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
826851
}
827852

828853
cur_p->size = last_idx;
854+
cur_p->normalized = false;
829855
}
830856

831857
static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) {
@@ -897,6 +923,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
897923
if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
898924
std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
899925
cur_p->size = filtered_tokens.size();
926+
cur_p->normalized = false;
900927
min_p_applied = true;
901928
}
902929
}
@@ -919,6 +946,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
919946

920947
// Resize the output vector to keep only the matching tokens
921948
cur_p->size = i;
949+
cur_p->normalized = false;
922950
}
923951
}
924952

@@ -971,7 +999,9 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
971999
}
9721000

9731001
// Compute the softmax of logits and calculate entropy
974-
llama_sampler_softmax_impl(cur_p, true);
1002+
if (!cur_p->normalized) {
1003+
llama_sampler_softmax_impl(cur_p, true);
1004+
}
9751005

9761006
float entropy = 0.0f;
9771007
for (size_t i = 0; i < cur_p->size; ++i) {
@@ -1019,6 +1049,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
10191049
std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
10201050
cur_p->size = cur_p_new.size();
10211051
cur_p->sorted = false;
1052+
cur_p->normalized = false;
10221053
}
10231054

10241055
static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) {
@@ -1120,7 +1151,9 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
11201151
// Calculate maximum possible entropy
11211152
float max_entropy = -logf(1.0f / cur_p->size);
11221153

1123-
llama_sampler_softmax_impl(cur_p, true);
1154+
if (!cur_p->normalized) {
1155+
llama_sampler_softmax_impl(cur_p, true);
1156+
}
11241157

11251158
// Calculate entropy of the softmax probabilities
11261159
float entropy = 0.0f;
@@ -1162,6 +1195,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
11621195
for (size_t i = 0; i < cur_p->size; ++i) {
11631196
cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
11641197
}
1198+
cur_p->normalized = true;
11651199

11661200
#ifdef DEBUG
11671201
// Print the updated top 25 probabilities after temperature scaling
@@ -1236,7 +1270,9 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
12361270
return;
12371271
}
12381272

1239-
llama_sampler_softmax_impl(cur_p, true);
1273+
if (!cur_p->normalized) {
1274+
llama_sampler_softmax_impl(cur_p, true);
1275+
}
12401276

12411277
int pos_last = 0;
12421278

@@ -1251,6 +1287,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
12511287
if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
12521288
cur_p->data += pos_last;
12531289
cur_p->size -= pos_last;
1290+
cur_p->normalized = false;
12541291
}
12551292
}
12561293

@@ -1327,7 +1364,9 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*s
13271364
static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
13281365
auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
13291366

1330-
llama_sampler_softmax_impl(cur_p, true);
1367+
if (!cur_p->normalized) {
1368+
llama_sampler_softmax_impl(cur_p, true);
1369+
}
13311370

13321371
// Estimate s_hat using the most probable m tokens
13331372
float s_hat = 0.0;
@@ -1433,7 +1472,9 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler *
14331472
static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
14341473
auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
14351474

1436-
llama_sampler_softmax_impl(cur_p, true);
1475+
if (!cur_p->normalized) {
1476+
llama_sampler_softmax_impl(cur_p, true);
1477+
}
14371478

14381479
// Truncate the words with surprise values greater than mu
14391480
cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
@@ -1775,6 +1816,7 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
17751816
}
17761817

17771818
cur_p->sorted = false;
1819+
cur_p->normalized = false;
17781820
}
17791821

17801822
static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
@@ -2193,6 +2235,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
21932235
}
21942236

21952237
cur_p->sorted = false;
2238+
cur_p->normalized = false;
21962239
}
21972240

21982241
static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
@@ -2344,6 +2387,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
23442387
}
23452388

23462389
if (ctx->to_search.empty()) {
2390+
cur_p->normalized = false;
23472391
return;
23482392
}
23492393

@@ -2356,6 +2400,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
23562400
}
23572401
}
23582402
}
2403+
cur_p->normalized = false;
23592404
}
23602405

23612406
static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
@@ -2408,7 +2453,9 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smp
24082453
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
24092454
auto * ctx = (llama_sampler_infill *) smpl->ctx;
24102455

2411-
llama_sampler_softmax_impl(cur_p, true);
2456+
if (!cur_p->normalized) {
2457+
llama_sampler_softmax_impl(cur_p, true);
2458+
}
24122459

24132460
#if defined(GGML_DEBUG_SAMPLER_INFILL)
24142461
#define LOG_DBG_CUR LLAMA_LOG_DEBUG
@@ -2457,6 +2504,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
24572504
for (size_t i = 0; i < cur_p->size; ++i) {
24582505
cur_p->data[i].p /= p_sum;
24592506
}
2507+
cur_p->normalized = true;
24602508

24612509
return;
24622510
}
@@ -2542,6 +2590,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
25422590
cur_p->size = 1;
25432591
cur_p->data[0].id = ctx->vocab->token_eot();
25442592
cur_p->data[0].logit = 1.0f;
2593+
cur_p->normalized = true;
25452594

25462595
return;
25472596
}
@@ -2579,6 +2628,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
25792628

25802629
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
25812630
}
2631+
cur_p->normalized = true;
25822632

25832633
#undef LOG_DBG_CUR
25842634
}

tests/test-sampling.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ struct sampler_tester {
2828
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
2929
}
3030

31-
cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
31+
cur_p = llama_token_data_array { cur.data(), cur.size(), false, -1, false };
3232
}
3333

3434
sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
@@ -38,7 +38,7 @@ struct sampler_tester {
3838
cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
3939
}
4040

41-
cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
41+
cur_p = llama_token_data_array { cur.data(), cur.size(), false, -1, false };
4242
}
4343

4444
void apply(llama_sampler * sampler) {
@@ -270,13 +270,13 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
270270
static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
271271
std::vector<llama_token_data> cur(data.size());
272272
std::copy(data.begin(), data.end(), cur.begin());
273-
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
273+
llama_token_data_array cur_p = { cur.data(), cur.size(), false, -1, false };
274274
llama_sampler_apply(cnstr, &cur_p);
275275
llama_sampler_reset(cnstr);
276276
const int64_t t_start = ggml_time_us();
277277
for (int i = 0; i < n_iter; i++) {
278278
std::copy(data.begin(), data.end(), cur.begin());
279-
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
279+
llama_token_data_array cur_p = { cur.data(), cur.size(), false, -1, false };
280280
llama_sampler_apply(cnstr, &cur_p);
281281
llama_sampler_reset(cnstr);
282282
}

0 commit comments

Comments (0)