From 7d34a4bf70adbaa1970d6e39363398bdabd65cbf Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 29 Aug 2025 17:01:37 +0300
Subject: [PATCH 1/9] sampling : optimize sorting using bucket sort in more
 places

ggml-ci
---
 include/llama.h         |   5 -
 src/llama-sampling.cpp  | 216 ++++++++++++++++++++++------------------
 tests/test-sampling.cpp |   2 +-
 3 files changed, 120 insertions(+), 103 deletions(-)
diff --git a/include/llama.h b/include/llama.h
index 346135c71e2e6..904fe03560bcf 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1156,11 +1156,6 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
     LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 
-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void),
-        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index bfbf5fa230112..fe8d9ba45e676 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -128,6 +128,77 @@ struct ring_buffer {
     std::vector<T> data;
 };
 
+static void llama_token_data_array_sort(const llama_token_data_array * cur_p, int k, std::vector<llama_token_data> & res) {
+    static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit > b.logit;
+    };
+
+    constexpr int   nbuckets     = 128;
+    constexpr float bucket_low   = -10.0f;
+    constexpr float bucket_high  =  10.0f;
+    constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
+    constexpr float bucket_inter = -bucket_low * bucket_scale;
+
+    std::vector<int> bucket_idx(cur_p->size);
+    std::vector<int> histo(nbuckets, 0);
+
+    for (int i = 0; i < (int)cur_p->size; ++i) {
+        const float val = cur_p->data[i].logit;
+        int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+        ib = std::max(0, std::min(nbuckets - 1, ib));
+        bucket_idx[i] = ib;
+        ++histo[ib];
+    }
+    int nhave = 0;
+    int ib = nbuckets - 1;
+    for ( ; ib >= 0; --ib) {
+        nhave += histo[ib];
+        if (nhave >= k) {
+            break;
+        }
+    }
+    res.resize(nhave);
+    auto * ptr = res.data();
+    std::vector<llama_token_data*> bucket_ptrs;
+    bucket_ptrs.reserve(nbuckets - ib);
+    for (int j = nbuckets - 1; j >= ib; --j) {
+        bucket_ptrs.push_back(ptr);
+        ptr += histo[j];
+    }
+    for (int i = 0; i < (int)cur_p->size; ++i) {
+        int j = bucket_idx[i];
+        if (j >= ib) {
+            *bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
+        }
+    }
+
+    ptr = res.data();
+    int ndone = 0;
+    for (int j = nbuckets - 1; j > ib; --j) {
+        std::sort(ptr, ptr + histo[j], comp);
+        ptr += histo[j];
+        ndone += histo[j];
+    }
+    std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
+}
+
+static void llama_token_data_array_sort(llama_token_data_array * cur_p, int k) {
+    static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit > b.logit;
+    };
+
+    if (k <= 128) {
+        std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
+        return;
+    }
+
+    std::vector<llama_token_data> tmp_tokens;
+
+    llama_token_data_array_sort(cur_p, k, tmp_tokens);
+
+    std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
+}
+
 static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
     // iterator for the probabilities
 #ifdef __GNUC__
@@ -200,18 +271,22 @@ static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp)
     }
 }
 
-static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
+static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort = true) {
     GGML_ASSERT(cur_p->size > 0);
 
-    // Sort the logits in descending order
-    if (!cur_p->sorted) {
-        std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
-            return a.logit > b.logit;
-        });
+    // Sort the logits in descending order if requested
+    if (do_sort && !cur_p->sorted) {
+        llama_token_data_array_sort(cur_p, cur_p->size);
         cur_p->sorted = true;
     }
 
     float max_l = cur_p->data[0].logit;
+    if (!cur_p->sorted) {
+        for (size_t i = 1; i < cur_p->size; ++i) {
+            max_l = std::max(max_l, cur_p->data[i].logit);
+        }
+    }
+
     float cum_sum = 0.0f;
 
     for (size_t i = 0; i < cur_p->size; ++i) {
@@ -226,7 +301,6 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
 }
 
 static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
-    // TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
     // if (k >= (int32_t)cur_p->size) {
     //     return;
     // }
@@ -239,63 +313,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
 
     // Sort scores in descending order
     if (!cur_p->sorted) {
-        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
-            return a.logit > b.logit;
-        };
-        if (k <= 128) {
-            std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
-        } else {
-            constexpr int   nbuckets     = 128;
-            constexpr float bucket_low   = -10.0f;
-            constexpr float bucket_high  =  10.0f;
-            constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
-            constexpr float bucket_inter = -bucket_low * bucket_scale;
-
-            std::vector<int> bucket_idx(cur_p->size);
-            std::vector<int> histo(nbuckets, 0);
-
-            for (int i = 0; i < (int)cur_p->size; ++i) {
-                const float val = cur_p->data[i].logit;
-                int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
-                ib = std::max(0, std::min(nbuckets - 1, ib));
-                bucket_idx[i] = ib;
-                ++histo[ib];
-            }
-            int nhave = 0;
-            int ib = nbuckets - 1;
-            for ( ; ib >= 0; --ib) {
-                nhave += histo[ib];
-                if (nhave >= k) {
-                    break;
-                }
-            }
-            std::vector<llama_token_data> tmp_tokens(nhave);
-            auto * ptr = tmp_tokens.data();
-            std::vector<llama_token_data*> bucket_ptrs;
-            bucket_ptrs.reserve(nbuckets - ib);
-            for (int j = nbuckets - 1; j >= ib; --j) {
-                bucket_ptrs.push_back(ptr);
-                ptr += histo[j];
-            }
-            for (int i = 0; i < (int)cur_p->size; ++i) {
-                int j = bucket_idx[i];
-                if (j >= ib) {
-                    *bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
-                }
-            }
-
-            ptr = tmp_tokens.data();
-            int ndone = 0;
-            for (int j = nbuckets - 1; j > ib; --j) {
-                std::sort(ptr, ptr + histo[j], comp);
-                ptr += histo[j];
-                ndone += histo[j];
-            }
-            std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
-
-            std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
-
-        }
+        llama_token_data_array_sort(cur_p, k);
         cur_p->sorted = true;
     }
 
@@ -576,7 +594,8 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
 static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_dist *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p);
+    // sorting is not necessary here, but for now we are doing it
+    llama_sampler_softmax_impl(cur_p, true);
 
     cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
 }
@@ -626,32 +645,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
     );
 }
 
-// softmax
-
-static const char * llama_sampler_softmax_name(const struct llama_sampler * /*smpl*/) {
-    return "softmax";
-}
-
-static void llama_sampler_softmax_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
-    llama_sampler_softmax_impl(cur_p);
-}
-
-static struct llama_sampler_i llama_sampler_softmax_i = {
-    /* .name   = */ llama_sampler_softmax_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_softmax_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ nullptr,
-    /* .free   = */ nullptr,
-};
-
-struct llama_sampler * llama_sampler_init_softmax() {
-    return llama_sampler_init(
-        /* .iface = */ &llama_sampler_softmax_i,
-        /* .ctx   = */ nullptr
-    );
-}
-
 // top-k
 
 struct llama_sampler_top_k {
@@ -699,6 +692,8 @@ struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
 struct llama_sampler_top_p {
     const float  p;
     const size_t min_keep;
+
+    std::vector<llama_token_data> buf_sort;
 };
 
 static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
@@ -706,20 +701,36 @@ static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl
 }
 
 static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_top_p *) smpl->ctx;
+    auto * ctx = (llama_sampler_top_p *) smpl->ctx;
 
     if (ctx->p >= 1.0f) {
         return;
     }
 
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, false);
+
+    size_t k = cur_p->size;
+    auto * pdata = cur_p->data;
+
+    auto & buf_sort = ctx->buf_sort;
+
+    // if not sorted, try adaptive top-k sorting
+    if (!cur_p->sorted && cur_p->size > 1024) {
+        k = std::min<size_t>(256, cur_p->size);
+        llama_token_data_array_sort(cur_p, k, buf_sort);
+        pdata = buf_sort.data();
+    } else if (!cur_p->sorted) {
+        // small candidates -> sort inplace
+        llama_token_data_array_sort(cur_p, k);
+        cur_p->sorted = true;
+    }
 
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = cur_p->size;
 
     for (size_t i = 0; i < cur_p->size; ++i) {
-        cum_sum += cur_p->data[i].p;
+        cum_sum += pdata[i].p;
 
         // Check if the running sum is at least p or if we have kept at least min_keep tokens
         // we set the last index to i+1 to indicate that the current iterate should be included in the set
@@ -727,9 +738,21 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
             last_idx = i + 1;
             break;
         }
+
+        // we exceeded the current top-k heuristic -> increase k and continue
+        if (!cur_p->sorted && i == k - 1) {
+            k = cur_p->size;
+            llama_token_data_array_sort(cur_p, k, buf_sort);
+            pdata = buf_sort.data();
+        }
     }
 
     // Resize the output vector to keep only the top-p tokens
+    if (!cur_p->sorted) {
+        std::memcpy(cur_p->data, buf_sort.data(), last_idx*sizeof(llama_token_data));
+        cur_p->sorted = true;
+    }
+
     cur_p->size = last_idx;
 }
 
@@ -757,6 +780,7 @@ struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
         /* .ctx   = */ new llama_sampler_top_p {
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
+            /* .buf_sort = */ {},
         }
     );
 }
@@ -809,9 +833,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
     if (!min_p_applied) {
         // Sort the logits in descending order
         if (!cur_p->sorted) {
-            std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
-                return a.logit > b.logit;
-            });
+            llama_token_data_array_sort(cur_p, cur_p->size);
             cur_p->sorted = true;
         }
 
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 6300f25caebe3..97c223ef61841 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -372,7 +372,7 @@ int main(void) {
     test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12);
 
     test_sampler_queue(10000, "k",   100, 1.0000f, 1.0f);
-    test_sampler_queue(10000, "p", 10000, 0.0002f, 1.0f);
+    test_sampler_queue(10000, "p", 10000, 0.0003f, 1.0f);
     test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f);
     test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f);
     test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f);

From 97167e61da6d1ec73229fcb26537c805959dd50f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 29 Aug 2025 17:02:45 +0300
Subject: [PATCH 2/9] sampling : do not sort in dist sampler

ggml-ci
---
 src/llama-sampling.cpp  |  4 +--
 tests/test-sampling.cpp | 60 ++++++++++++++++++++---------------------
 2 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index fe8d9ba45e676..fc28fb7c2a233 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -594,8 +594,8 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
 static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_dist *) smpl->ctx;
 
-    // sorting is not necessary here, but for now we are doing it
-    llama_sampler_softmax_impl(cur_p, true);
+    // sorting is not necessary here
+    llama_sampler_softmax_impl(cur_p, false);
 
     cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
 }
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 97c223ef61841..7cd96c5cd351c 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -197,10 +197,10 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
     sampler_tester tester(n_vocab);
 
           llama_token min_token_id = 0;
-    const llama_token max_token_id = n_vocab-1;
+    const llama_token max_token_id = n_vocab - 1;
 
     for (auto s : samplers_sequence) {
-        switch (s){
+        switch (s) {
             case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
             case 'y': GGML_ABORT("typical test not implemented");
             case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
@@ -243,10 +243,10 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
             }
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(cur_p.data[0].id == max_token_id);
-            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(!cur_p.sorted || cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(!cur_p.sorted || cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'm') {
-            int expected_size = ceilf((1.0f-min_p) * n_vocab);
+            int expected_size = ceilf((1.0f - min_p) * n_vocab);
             expected_size = std::max(expected_size, 1);
             expected_size = std::min(expected_size, size);
 
@@ -256,14 +256,14 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
             min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(cur_p.data[0].id == max_token_id);
-            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(!cur_p.sorted || cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(!cur_p.sorted || cur_p.data[expected_size-1].id == min_token_id);
         } else {
             GGML_ABORT("fatal error");
         }
     }
 
-    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
+    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%5d top_p=%f min_p=%f\n",
            samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
 }
 
@@ -308,28 +308,28 @@ static void test_perf() {
 int main(void) {
     ggml_time_init();
 
-    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
-    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.0f, 0.0f, 0.0f, 1.0f}, 0.0f);
 
-    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f);
-    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f, 0.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.0f, 0.0f, 0.0f, 1.0f}, 0.0f, 0.0f, 1.0f);
 
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 0);
 
     test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0);
     test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f);
     test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
-
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f},            0.26f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f},            0.49f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f},                       0.51f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f},                       0.74f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f);
+
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f/1.0f, 0.2f/1.0f, 0.3f/1.0f, 0.4f/1.0f}, 0.00f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f/1.0f, 0.2f/1.0f, 0.3f/1.0f, 0.4f/1.0f}, 0.24f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.2f/0.9f, 0.3f/0.9f, 0.4f/0.9f},            0.26f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.2f/0.9f, 0.3f/0.9f, 0.4f/0.9f},            0.49f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.3f/0.7f, 0.4f/0.7f},                       0.51f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.3f/0.7f, 0.4f/0.7f},                       0.74f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  0.76f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  1.00f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  1.05f);
@@ -345,23 +345,23 @@ int main(void) {
     test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f},            0.5f);
     test_typical({0.4f, 0.2f, 0.2f, 0.2f},     {0.2f, 0.2f, 0.2f}, 0.5f);
 
-    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0},   50.0f, 0.0f, 0.0f);
-    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0},       50.0f, 0.0f, 0.0f);
-    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0, 0.25f, 0.25f, 0.25f, 0.25f},   50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0, 0, 0, 0.5f, 0.5f},       50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0, 0, 0, 0.5f, 0.5f}, 50.0f, 0.0f, 0.0f);
 
-    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
-    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
-    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.000011f, 0.249997f, 0.249997f, 0.249997f, 0.249997f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.000023f, 0.000023f, 0.000023f, 0.499966f, 0.499966f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.000000f, 0.000023f, 0.000023f, 0.499977f, 0.499977f}, 1.0f, 5.0f, 5.0f);
 
 
     test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1}, {0.25f, 0.25f, 0.25f, 0.25f}, 1.0f, 1.1f, 2, 4, {});
-    test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.296923f, 0.109232f}, 1.0f, 1.1f, 2, 5, {});
+    test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.109232f, 0.296923f}, 1.0f, 1.1f, 2, 5, {});
     test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 2, 6, {{3}});
-    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.241818f, 0.241818f, 0.032727f}, 2.0f, 1.1f, 2, 5, {});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.032727f, 0.241818f, 0.241818f}, 2.0f, 1.1f, 2, 5, {});
     test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {});
 
     test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f, 0.0f, 0.0f}, 1.00f);
-    test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.00f); // top_n_sigma == 0 now represents a no-op rather than greedy decoding as of PR#13345
+    test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 0.00f); // top_n_sigma == 0 now represents a no-op rather than greedy decoding as of PR#13345
     test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 3.00f);
 
     test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);

From 15557b8ff26e7cca3633ab7373d34d7996a9ec0c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 29 Aug 2025 18:13:34 +0300
Subject: [PATCH 3/9] sampling : avoid heap allocations for sort buffers

ggml-ci
---
 src/llama-sampling.cpp | 180 ++++++++++++++++++++++++++---------------
 1 file changed, 117 insertions(+), 63 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index fc28fb7c2a233..f0200c9e0b6b6 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -128,7 +128,19 @@ struct ring_buffer {
     std::vector<T> data;
 };
 
-static void llama_token_data_array_sort(const llama_token_data_array * cur_p, int k, std::vector<llama_token_data> & res) {
+// helper buffers used for bucket sort
+// samplers can keep a struct of these to avoid allocating buffers on the heap each time
+struct llama_sort_data {
+    std::vector<llama_token_data> data;
+
+    std::vector<int> bucket_idx;
+    std::vector<int> histo;
+
+    std::vector<llama_token_data*> bucket_ptrs;
+};
+
+// writes result in res, does not mutate cur
+static void llama_token_data_array_sort(const llama_token_data_array & cur, int k, llama_sort_data & res) {
     static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit > b.logit;
     };
@@ -139,11 +151,18 @@ static void llama_token_data_array_sort(const llama_token_data_array * cur_p, in
     constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
     constexpr float bucket_inter = -bucket_low * bucket_scale;
 
-    std::vector<int> bucket_idx(cur_p->size);
-    std::vector<int> histo(nbuckets, 0);
+    auto & data = res.data;
+    auto & bucket_idx = res.bucket_idx;
+    auto & histo = res.histo;
+    auto & bucket_ptrs = res.bucket_ptrs;
 
-    for (int i = 0; i < (int)cur_p->size; ++i) {
-        const float val = cur_p->data[i].logit;
+    bucket_idx.resize(cur.size);
+
+    histo.clear();
+    histo.resize(nbuckets, 0);
+
+    for (int i = 0; i < (int)cur.size; ++i) {
+        const float val = cur.data[i].logit;
         int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
         ib = std::max(0, std::min(nbuckets - 1, ib));
         bucket_idx[i] = ib;
@@ -157,22 +176,22 @@ static void llama_token_data_array_sort(const llama_token_data_array * cur_p, in
             break;
         }
     }
-    res.resize(nhave);
-    auto * ptr = res.data();
-    std::vector<llama_token_data*> bucket_ptrs;
+    data.resize(nhave);
+    auto * ptr = data.data();
+    bucket_ptrs.clear();
     bucket_ptrs.reserve(nbuckets - ib);
     for (int j = nbuckets - 1; j >= ib; --j) {
         bucket_ptrs.push_back(ptr);
         ptr += histo[j];
     }
-    for (int i = 0; i < (int)cur_p->size; ++i) {
+    for (int i = 0; i < (int)cur.size; ++i) {
         int j = bucket_idx[i];
         if (j >= ib) {
-            *bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
+            *bucket_ptrs[nbuckets - 1 - j]++ = cur.data[i];
         }
     }
 
-    ptr = res.data();
+    ptr = data.data();
     int ndone = 0;
     for (int j = nbuckets - 1; j > ib; --j) {
         std::sort(ptr, ptr + histo[j], comp);
@@ -182,7 +201,8 @@ static void llama_token_data_array_sort(const llama_token_data_array * cur_p, in
     std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
 }
 
-static void llama_token_data_array_sort(llama_token_data_array * cur_p, int k) {
+// buf is a helper buffer that can optionally be utilized
+static void llama_token_data_array_sort_inplace(llama_token_data_array * cur_p, int k, llama_sort_data & buf) {
     static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit > b.logit;
     };
@@ -192,11 +212,9 @@ static void llama_token_data_array_sort(llama_token_data_array * cur_p, int k) {
         return;
     }
 
-    std::vector<llama_token_data> tmp_tokens;
-
-    llama_token_data_array_sort(cur_p, k, tmp_tokens);
+    llama_token_data_array_sort(*cur_p, k, buf);
 
-    std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
+    std::memcpy(cur_p->data, buf.data.data(), k*sizeof(llama_token_data));
 }
 
 static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
@@ -271,12 +289,13 @@ static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp)
     }
 }
 
-static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort = true) {
+// if buf_sort == nullptr : do not sort cur_p
+static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, llama_sort_data * buf_sort) {
     GGML_ASSERT(cur_p->size > 0);
 
     // Sort the logits in descending order if requested
-    if (do_sort && !cur_p->sorted) {
-        llama_token_data_array_sort(cur_p, cur_p->size);
+    if (buf_sort && !cur_p->sorted) {
+        llama_token_data_array_sort_inplace(cur_p, cur_p->size, *buf_sort);
         cur_p->sorted = true;
     }
 
@@ -300,7 +319,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_s
     }
 }
 
-static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
+static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k, llama_sort_data & buf_sort) {
     // if (k >= (int32_t)cur_p->size) {
     //     return;
     // }
@@ -313,7 +332,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
 
     // Sort scores in descending order
     if (!cur_p->sorted) {
-        llama_token_data_array_sort(cur_p, k);
+        llama_token_data_array_sort_inplace(cur_p, k, buf_sort);
         cur_p->sorted = true;
     }
 
@@ -585,6 +604,8 @@ struct llama_sampler_dist {
           uint32_t seed_cur;
 
     std::mt19937 rng;
+
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) {
@@ -595,7 +616,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
     auto * ctx = (llama_sampler_dist *) smpl->ctx;
 
     // sorting is not necessary here
-    llama_sampler_softmax_impl(cur_p, false);
+    llama_sampler_softmax_impl(cur_p, nullptr);
 
     cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
 }
@@ -641,6 +662,7 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
             /* .seed     = */ seed,
             /* .seed_cur = */ seed_cur,
             /* .rng      = */ std::mt19937(seed_cur),
+            /* .buf_sort = */ {},
         }
     );
 }
@@ -649,6 +671,8 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
 
 struct llama_sampler_top_k {
     const int32_t k;
+
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl*/) {
@@ -656,8 +680,8 @@ static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl
 }
 
 static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_top_k *) smpl->ctx;
-    llama_sampler_top_k_impl(cur_p, ctx->k);
+    auto * ctx = (llama_sampler_top_k *) smpl->ctx;
+    llama_sampler_top_k_impl(cur_p, ctx->k, ctx->buf_sort);
 }
 
 static struct llama_sampler * llama_sampler_top_k_clone(const struct llama_sampler * smpl) {
@@ -682,7 +706,8 @@ struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_k_i,
         /* .ctx   = */ new llama_sampler_top_k {
-            /* .k = */ k,
+            /* .k        = */ k,
+            /* .buf_sort = */ {},
         }
     );
 }
@@ -693,7 +718,7 @@ struct llama_sampler_top_p {
     const float  p;
     const size_t min_keep;
 
-    std::vector<llama_token_data> buf_sort;
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
@@ -707,7 +732,7 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
         return;
     }
 
-    llama_sampler_softmax_impl(cur_p, false);
+    llama_sampler_softmax_impl(cur_p, nullptr);
 
     size_t k = cur_p->size;
     auto * pdata = cur_p->data;
@@ -717,11 +742,11 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
     // if not sorted, try adaptive top-k sorting
     if (!cur_p->sorted && cur_p->size > 1024) {
         k = std::min<size_t>(256, cur_p->size);
-        llama_token_data_array_sort(cur_p, k, buf_sort);
-        pdata = buf_sort.data();
+        llama_token_data_array_sort(*cur_p, k, buf_sort);
+        pdata = buf_sort.data.data();
     } else if (!cur_p->sorted) {
         // small candidates -> sort inplace
-        llama_token_data_array_sort(cur_p, k);
+        llama_token_data_array_sort_inplace(cur_p, k, buf_sort);
         cur_p->sorted = true;
     }
 
@@ -742,14 +767,14 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
         // we exceeded the current top-k heuristic -> increase k and continue
         if (!cur_p->sorted && i == k - 1) {
             k = cur_p->size;
-            llama_token_data_array_sort(cur_p, k, buf_sort);
-            pdata = buf_sort.data();
+            llama_token_data_array_sort(*cur_p, k, buf_sort);
+            pdata = buf_sort.data.data();
         }
     }
 
     // Resize the output vector to keep only the top-p tokens
     if (!cur_p->sorted) {
-        std::memcpy(cur_p->data, buf_sort.data(), last_idx*sizeof(llama_token_data));
+        std::memcpy(cur_p->data, buf_sort.data.data(), last_idx*sizeof(llama_token_data));
         cur_p->sorted = true;
     }
 
@@ -790,6 +815,8 @@ struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
 struct llama_sampler_min_p {
     const float  p;
     const size_t min_keep;
+
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl*/) {
@@ -797,7 +824,7 @@ static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl
 }
 
 static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_min_p *) smpl->ctx;
+    auto * ctx = (llama_sampler_min_p *) smpl->ctx;
 
     if (ctx->p <= 0.0f || !cur_p->size) {
         return;
@@ -833,7 +860,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
     if (!min_p_applied) {
         // Sort the logits in descending order
         if (!cur_p->sorted) {
-            llama_token_data_array_sort(cur_p, cur_p->size);
+            llama_token_data_array_sort_inplace(cur_p, cur_p->size, ctx->buf_sort);
             cur_p->sorted = true;
         }
 
@@ -875,6 +902,7 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
         /* .ctx   = */ new llama_sampler_min_p {
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
+            /* .buf_sort = */ {},
         }
     );
 }
@@ -884,6 +912,8 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
 struct llama_sampler_typical {
     const float  p;
     const size_t min_keep;
+
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_typical_name(const struct llama_sampler * /*smpl*/) {
@@ -891,7 +921,7 @@ static const char * llama_sampler_typical_name(const struct llama_sampler * /*sm
 }
 
 static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_typical *) smpl->ctx;
+    auto * ctx = (llama_sampler_typical *) smpl->ctx;
 
     // Reference implementation:
     // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
@@ -900,7 +930,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
     }
 
     // Compute the softmax of logits and calculate entropy
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
 
     float entropy = 0.0f;
     for (size_t i = 0; i < cur_p->size; ++i) {
@@ -974,6 +1004,7 @@ struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
         /* .ctx   = */ new llama_sampler_typical {
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
+            /* .buf_sort = */ {},
         }
     );
 }
@@ -1027,6 +1058,8 @@ struct llama_sampler_temp_ext {
     const float temp;
     const float delta;
     const float exponent;
+
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*smpl*/) {
@@ -1034,7 +1067,7 @@ static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*s
 }
 
 static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
+    auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
     if (ctx->delta > 0) {
         const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
         const float max_temp = ctx->temp + ctx->delta;
@@ -1049,7 +1082,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
         // Calculate maximum possible entropy
         float max_entropy = -logf(1.0f / cur_p->size);
 
-        llama_sampler_softmax_impl(cur_p);
+        llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
 
         // Calculate entropy of the softmax probabilities
         float entropy = 0.0f;
@@ -1129,6 +1162,7 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
             /* .temp     = */ temp,
             /* .delta    = */ delta,
             /* .exponent = */ exponent,
+            /* .buf_sort = */ {},
         }
     );
 }
@@ -1143,7 +1177,8 @@ struct llama_sampler_xtc {
     const uint32_t seed;
     uint32_t       seed_cur;
 
-    std::mt19937   rng;
+    std::mt19937    rng;
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
@@ -1161,17 +1196,21 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
 
     std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
     float chance = distribution(ctx->rng);
-    if (chance > ctx->probability) return;
+    if (chance > ctx->probability) {
+        return;
+    }
 
     // in case it's not sorted/recalculated yet
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
 
     int pos_last = 0;
 
     for (size_t i = 0; i < cur_p->size; ++i) {
         if (cur_p->data[i].p >= ctx->threshold) {
             pos_last = i;
-        } else break;
+        } else {
+            break;
+        }
     }
 
     if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
@@ -1224,6 +1263,7 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
             /* .seed          = */ seed,
             /* .seed_cur      = */ seed_cur,
             /* .rng           = */ std::mt19937(seed_cur),
+            /* .buf_sort      = */ {},
         }
     );
 }
@@ -1243,7 +1283,8 @@ struct llama_sampler_mirostat {
 
     float mu;
 
-    std::mt19937 rng;
+    std::mt19937    rng;
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
@@ -1253,7 +1294,7 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*s
 static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
 
     // Estimate s_hat using the most probable m tokens
     float s_hat = 0.0;
@@ -1271,8 +1312,9 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke
     float epsilon_hat = s_hat - 1;
     float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);
 
-    llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_top_k_impl(cur_p, std::max(int(k), 1), ctx->buf_sort);
+
+    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
 
     const int idx = llama_sample_dist(cur_p, ctx->rng);
 
@@ -1333,6 +1375,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
             /* .m        = */ m,
             /* .mu       = */ 2.0f*tau,
             /* .rng      = */ std::mt19937(seed_cur),
+            /* .buf_sort = */ {},
         }
     );
 }
@@ -1348,7 +1391,8 @@ struct llama_sampler_mirostat_v2 {
 
     float mu;
 
-    std::mt19937 rng;
+    std::mt19937    rng;
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) {
@@ -1358,7 +1402,7 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler *
 static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
 
     // Truncate the words with surprise values greater than mu
     cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
@@ -1370,7 +1414,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
     }
 
     // Normalize the probabilities of the remaining words
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
 
     const int idx = llama_sample_dist(cur_p, ctx->rng);
 
@@ -1430,6 +1474,7 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
             /* .eta      = */ eta,
             /* .mu       = */ 2.0f*tau,
             /* .rng      = */ std::mt19937(seed_cur),
+            /* .buf_sort = */ {},
         }
     );
 }
@@ -1562,7 +1607,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
                 trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
             }
             trigger_pattern += ")[\\s\\S]*";
-            auto trigger_pattern_c = trigger_pattern.c_str();
+            const auto * trigger_pattern_c = trigger_pattern.c_str();
             trigger_patterns = &trigger_pattern_c;
             num_trigger_patterns = 1;
         }
@@ -1763,6 +1808,8 @@ struct llama_sampler * llama_sampler_init_penalties(
 
 struct llama_sampler_top_n_sigma {
     const float n;
+
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) {
@@ -1770,7 +1817,7 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 }
 
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
+    auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
 
     if (ctx->n <= 0.0f || cur_p->size <= 1) {
         return;
@@ -1802,13 +1849,14 @@ static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_t
     }
     float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
 
-    //apply mask
+    // apply mask
     for (size_t i = 0; i < cur_p->size; ++i) {
         if (cur_p->data[i].logit < max - (ctx->n * std)) {
             cur_p->data[i].logit = -INFINITY;
         }
     }
-    llama_sampler_softmax_impl(cur_p);
+
+    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
 }
 
 static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
@@ -1833,7 +1881,8 @@ struct llama_sampler * llama_sampler_init_top_n_sigma(float n) {
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_n_sigma_i,
         /* .ctx   = */ new llama_sampler_top_n_sigma {
-            /* .n = */ n,
+            /* .n        = */ n,
+            /* .buf_sort = */ {},
         }
     );
 }
@@ -2013,7 +2062,9 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
 
     {
         const int last = last_n_repeat - 1;
-        int rt = 0, lt = 0;
+
+        int rt = 0;
+        int lt = 0;
 
         for (int k = 1; k < last_n_repeat; ++k) {
             if (k > rt) {
@@ -2157,8 +2208,8 @@ static struct llama_sampler_i llama_sampler_dry_i = {
     /* .free   = */ llama_sampler_dry_free,
 };
 
-struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
-    int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
+struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+    int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? n_ctx_train : std::max(dry_penalty_last_n, 0);
     std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
     const int MAX_CHAR_LEN = 40;
     const int MAX_SEQ_LEN = 20;
@@ -2191,7 +2242,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_dry_i,
         /* .ctx   = */ new llama_sampler_dry {
-            /* .total_context_size     = */ context_size,
+            /* .total_context_size     = */ n_ctx_train,
             /* .dry_multiplier         = */ dry_multiplier,
             /* .dry_base               = */ dry_base,
             /* .dry_allowed_length     = */ dry_allowed_length,
@@ -2321,6 +2372,8 @@ struct llama_sampler_infill {
 
     std::vector<char> buf0;
     std::vector<char> buf1;
+
+    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
@@ -2330,7 +2383,7 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smp
 static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_infill *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
 
 #if defined(GGML_DEBUG_SAMPLER_INFILL)
 #define LOG_DBG_CUR LLAMA_LOG_DEBUG
@@ -2527,9 +2580,10 @@ struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * voca
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_infill_i,
         /* .ctx   = */ new llama_sampler_infill {
-            /* .vocab = */ vocab,
-            /* .buf0  = */ std::vector<char>(512),
-            /* .buf1  = */ std::vector<char>(512),
+            /* .vocab    = */ vocab,
+            /* .buf0     = */ std::vector<char>(512),
+            /* .buf1     = */ std::vector<char>(512),
+            /* .buf_sort = */ {},
         }
     );
 }

From 70bce496ad4194f6d9bbe4f76a6c9bfba339d95a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 30 Aug 2025 11:30:27 +0300
Subject: [PATCH 4/9] common : add option to sort sampling candidates by
 probability

ggml-ci
---
 common/sampling.cpp                  | 25 +++++++++++++++++++++++--
 common/sampling.h                    |  4 +++-
 common/speculative.cpp               |  2 +-
 examples/speculative/speculative.cpp |  4 ++--
 include/llama.h                      |  2 +-
 src/llama-sampling.cpp               |  1 -
 tools/server/server.cpp              |  5 +++--
 tools/tts/tts.cpp                    |  2 +-
 8 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index 9c04d35fd00a2..c710ee173c0ed 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 
 // helpers
 
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-    return &gsmpl->cur_p;
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    auto * res = &gsmpl->cur_p;
+
+    if (do_sort && !res->sorted) {
+        // remember the selected token before sorting; .selected is a signed index, so only use it when it is in range
+        const bool has_sel = res->selected >= 0 && (size_t) res->selected < res->size;
+        const llama_token id = has_sel ? res->data[res->selected].id : -1;
+
+        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.p > b.p;
+        });
+
+        // restore the selected token after sorting (skipped when there was no valid selection to track)
+        for (size_t i = 0; has_sel && i < res->size; ++i) {
+            if (res->data[i].id == id) {
+                res->selected = i;
+                break;
+            }
+        }
+        res->sorted = true;
+    }
+
+    return res;
 }
 
 llama_token common_sampler_last(const struct common_sampler * gsmpl) {
diff --git a/common/sampling.h b/common/sampling.h
index 2064421db4e80..c7f3278b8fe48 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers
 
 // access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// if do_sort == true, the candidates will be sorted (in descending order of probability) in case they are not already sorted
+// if do_sort == false, the candidates *might* not be sorted. use the .sorted flag of the result to determine that
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
 
 // get the last accepted token
 llama_token common_sampler_last(const struct common_sampler * gsmpl);
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 262b2c23e720f..3e83b0964c855 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
 
         common_sampler_sample(smpl, ctx_dft, 0, true);
 
-        const auto * cur_p = common_sampler_get_candidates(smpl);
+        const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
         for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
             LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 8449406a6d27a..5f5ac5eb64d38 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -244,7 +244,7 @@ int main(int argc, char ** argv) {
                     // stochastic verification
                     common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
 
-                    auto & dist_tgt = *common_sampler_get_candidates(smpl);
+                    auto & dist_tgt = *common_sampler_get_candidates(smpl, true);
 
                     float p_tgt = 0.0f;
                     float p_dft = 0.0f;
@@ -493,7 +493,7 @@ int main(int argc, char ** argv) {
 
                 common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
 
-                const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);
+                const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true);
 
                 for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
                     LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
diff --git a/include/llama.h b/include/llama.h
index 904fe03560bcf..11f8a363a5733 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -206,7 +206,7 @@ extern "C" {
         llama_token_data * data;
         size_t size;
         int64_t selected; // this is the index in the data array (i.e. not the token id)
-        bool sorted;
+        bool sorted;      // note: do not assume the data is sorted - always check this flag
     } llama_token_data_array;
 
     typedef bool (*llama_progress_callback)(float progress, void * user_data);
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index f0200c9e0b6b6..8a7a817ef8c97 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1200,7 +1200,6 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
         return;
     }
 
-    // in case it's not sorted/recalculated yet
     llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
 
     int pos_last = 0;
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 6aa319d2f1121..2217385f9bda9 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2485,11 +2485,12 @@ struct server_context {
         return slot.has_next_token; // continue
     }
 
-    void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
+    void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const {
         size_t n_probs = slot.params.sampling.n_probs;
         size_t n_vocab = llama_vocab_n_tokens(vocab);
+
         if (post_sampling) {
-            const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+            const auto * cur_p = common_sampler_get_candidates(slot.smpl, true);
             const size_t max_probs = cur_p->size;
 
             // set probability for sampled token
diff --git a/tools/tts/tts.cpp b/tools/tts/tts.cpp
index 18f01a9946350..eaf56591d9d11 100644
--- a/tools/tts/tts.cpp
+++ b/tools/tts/tts.cpp
@@ -895,7 +895,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
 
                 codes.push_back(new_token_id);
 
-                const auto * cands = common_sampler_get_candidates(smpl[i]);
+                const auto * cands = common_sampler_get_candidates(smpl[i], false);
 
                 // is it an end of generation? -> mark the stream as finished
                 if (llama_vocab_is_eog(vocab, new_token_id) || n_decode == n_predict) {

From c8a2adaf416b833825073f32ddb12ce27045086a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 31 Aug 2025 19:56:55 +0300
Subject: [PATCH 5/9] sampling : revert the change for preserving sort buffers

---
 src/llama-sampling.cpp | 123 ++++++++++++++---------------------------
 1 file changed, 41 insertions(+), 82 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 8a7a817ef8c97..1d0baa7c03683 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -128,19 +128,8 @@ struct ring_buffer {
     std::vector<T> data;
 };
 
-// helper buffers used for bucket sort
-// samplers can keep a struct of these to avoid allocating buffers on the heap each time
-struct llama_sort_data {
-    std::vector<llama_token_data> data;
-
-    std::vector<int> bucket_idx;
-    std::vector<int> histo;
-
-    std::vector<llama_token_data*> bucket_ptrs;
-};
-
 // writes result in res, does not mutate cur
-static void llama_token_data_array_sort(const llama_token_data_array & cur, int k, llama_sort_data & res) {
+static void llama_token_data_array_sort(const llama_token_data_array & cur, int k, std::vector<llama_token_data> & data) {
     static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit > b.logit;
     };
@@ -151,21 +140,18 @@ static void llama_token_data_array_sort(const llama_token_data_array & cur, int
     constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
     constexpr float bucket_inter = -bucket_low * bucket_scale;
 
-    auto & data = res.data;
-    auto & bucket_idx = res.bucket_idx;
-    auto & histo = res.histo;
-    auto & bucket_ptrs = res.bucket_ptrs;
+    std::vector<int> bucket_idx;
+    std::vector<int> histo(nbuckets, 0);
 
-    bucket_idx.resize(cur.size);
+    std::vector<llama_token_data*> bucket_ptrs;
 
-    histo.clear();
-    histo.resize(nbuckets, 0);
+    bucket_idx.reserve(cur.size);
 
     for (int i = 0; i < (int)cur.size; ++i) {
         const float val = cur.data[i].logit;
         int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
         ib = std::max(0, std::min(nbuckets - 1, ib));
-        bucket_idx[i] = ib;
+        bucket_idx.push_back(ib);
         ++histo[ib];
     }
     int nhave = 0;
@@ -178,7 +164,6 @@ static void llama_token_data_array_sort(const llama_token_data_array & cur, int
     }
     data.resize(nhave);
     auto * ptr = data.data();
-    bucket_ptrs.clear();
     bucket_ptrs.reserve(nbuckets - ib);
     for (int j = nbuckets - 1; j >= ib; --j) {
         bucket_ptrs.push_back(ptr);
@@ -202,7 +187,7 @@ static void llama_token_data_array_sort(const llama_token_data_array & cur, int
 }
 
 // buf is a helper buffer that can optionally be utilized
-static void llama_token_data_array_sort_inplace(llama_token_data_array * cur_p, int k, llama_sort_data & buf) {
+static void llama_token_data_array_sort_inplace(llama_token_data_array * cur_p, int k) {
     static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit > b.logit;
     };
@@ -212,9 +197,11 @@ static void llama_token_data_array_sort_inplace(llama_token_data_array * cur_p,
         return;
     }
 
-    llama_token_data_array_sort(*cur_p, k, buf);
+    std::vector<llama_token_data> tmp;
+
+    llama_token_data_array_sort(*cur_p, k, tmp);
 
-    std::memcpy(cur_p->data, buf.data.data(), k*sizeof(llama_token_data));
+    std::memcpy(cur_p->data, tmp.data(), k*sizeof(llama_token_data));
 }
 
 static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
@@ -289,13 +276,12 @@ static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp)
     }
 }
 
-// if buf_sort == nullptr : do not sort cur_p
-static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, llama_sort_data * buf_sort) {
+static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort) {
     GGML_ASSERT(cur_p->size > 0);
 
     // Sort the logits in descending order if requested
-    if (buf_sort && !cur_p->sorted) {
-        llama_token_data_array_sort_inplace(cur_p, cur_p->size, *buf_sort);
+    if (do_sort && !cur_p->sorted) {
+        llama_token_data_array_sort_inplace(cur_p, cur_p->size);
         cur_p->sorted = true;
     }
 
@@ -319,7 +305,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, llama_sor
     }
 }
 
-static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k, llama_sort_data & buf_sort) {
+static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
     // if (k >= (int32_t)cur_p->size) {
     //     return;
     // }
@@ -332,7 +318,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k,
 
     // Sort scores in descending order
     if (!cur_p->sorted) {
-        llama_token_data_array_sort_inplace(cur_p, k, buf_sort);
+        llama_token_data_array_sort_inplace(cur_p, k);
         cur_p->sorted = true;
     }
 
@@ -604,8 +590,6 @@ struct llama_sampler_dist {
           uint32_t seed_cur;
 
     std::mt19937 rng;
-
-    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) {
@@ -616,7 +600,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
     auto * ctx = (llama_sampler_dist *) smpl->ctx;
 
     // sorting is not necessary here
-    llama_sampler_softmax_impl(cur_p, nullptr);
+    llama_sampler_softmax_impl(cur_p, false);
 
     cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
 }
@@ -662,7 +646,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
             /* .seed     = */ seed,
             /* .seed_cur = */ seed_cur,
             /* .rng      = */ std::mt19937(seed_cur),
-            /* .buf_sort = */ {},
         }
     );
 }
@@ -671,8 +654,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
 
 struct llama_sampler_top_k {
     const int32_t k;
-
-    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl*/) {
@@ -681,7 +662,7 @@ static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl
 
 static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_top_k *) smpl->ctx;
-    llama_sampler_top_k_impl(cur_p, ctx->k, ctx->buf_sort);
+    llama_sampler_top_k_impl(cur_p, ctx->k);
 }
 
 static struct llama_sampler * llama_sampler_top_k_clone(const struct llama_sampler * smpl) {
@@ -706,8 +687,7 @@ struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_k_i,
         /* .ctx   = */ new llama_sampler_top_k {
-            /* .k        = */ k,
-            /* .buf_sort = */ {},
+            /* .k = */ k,
         }
     );
 }
@@ -718,7 +698,7 @@ struct llama_sampler_top_p {
     const float  p;
     const size_t min_keep;
 
-    llama_sort_data buf_sort;
+    std::vector<llama_token_data> buf_sort;
 };
 
 static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
@@ -732,7 +712,7 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
         return;
     }
 
-    llama_sampler_softmax_impl(cur_p, nullptr);
+    llama_sampler_softmax_impl(cur_p, false);
 
     size_t k = cur_p->size;
     auto * pdata = cur_p->data;
@@ -743,10 +723,10 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
     if (!cur_p->sorted && cur_p->size > 1024) {
         k = std::min<size_t>(256, cur_p->size);
         llama_token_data_array_sort(*cur_p, k, buf_sort);
-        pdata = buf_sort.data.data();
+        pdata = buf_sort.data();
     } else if (!cur_p->sorted) {
         // small candidates -> sort inplace
-        llama_token_data_array_sort_inplace(cur_p, k, buf_sort);
+        llama_token_data_array_sort_inplace(cur_p, k);
         cur_p->sorted = true;
     }
 
@@ -768,13 +748,13 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
         if (!cur_p->sorted && i == k - 1) {
             k = cur_p->size;
             llama_token_data_array_sort(*cur_p, k, buf_sort);
-            pdata = buf_sort.data.data();
+            pdata = buf_sort.data();
         }
     }
 
     // Resize the output vector to keep only the top-p tokens
     if (!cur_p->sorted) {
-        std::memcpy(cur_p->data, buf_sort.data.data(), last_idx*sizeof(llama_token_data));
+        std::memcpy(cur_p->data, buf_sort.data(), last_idx*sizeof(llama_token_data));
         cur_p->sorted = true;
     }
 
@@ -815,8 +795,6 @@ struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
 struct llama_sampler_min_p {
     const float  p;
     const size_t min_keep;
-
-    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl*/) {
@@ -860,7 +838,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
     if (!min_p_applied) {
         // Sort the logits in descending order
         if (!cur_p->sorted) {
-            llama_token_data_array_sort_inplace(cur_p, cur_p->size, ctx->buf_sort);
+            llama_token_data_array_sort_inplace(cur_p, cur_p->size);
             cur_p->sorted = true;
         }
 
@@ -902,7 +880,6 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
         /* .ctx   = */ new llama_sampler_min_p {
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
-            /* .buf_sort = */ {},
         }
     );
 }
@@ -912,8 +889,6 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
 struct llama_sampler_typical {
     const float  p;
     const size_t min_keep;
-
-    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_typical_name(const struct llama_sampler * /*smpl*/) {
@@ -930,7 +905,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
     }
 
     // Compute the softmax of logits and calculate entropy
-    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
+    llama_sampler_softmax_impl(cur_p, true);
 
     float entropy = 0.0f;
     for (size_t i = 0; i < cur_p->size; ++i) {
@@ -1004,7 +979,6 @@ struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
         /* .ctx   = */ new llama_sampler_typical {
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
-            /* .buf_sort = */ {},
         }
     );
 }
@@ -1058,8 +1032,6 @@ struct llama_sampler_temp_ext {
     const float temp;
     const float delta;
     const float exponent;
-
-    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*smpl*/) {
@@ -1082,7 +1054,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
         // Calculate maximum possible entropy
         float max_entropy = -logf(1.0f / cur_p->size);
 
-        llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
+        llama_sampler_softmax_impl(cur_p, true);
 
         // Calculate entropy of the softmax probabilities
         float entropy = 0.0f;
@@ -1162,7 +1134,6 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
             /* .temp     = */ temp,
             /* .delta    = */ delta,
             /* .exponent = */ exponent,
-            /* .buf_sort = */ {},
         }
     );
 }
@@ -1178,7 +1149,6 @@ struct llama_sampler_xtc {
     uint32_t       seed_cur;
 
     std::mt19937    rng;
-    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
@@ -1200,7 +1170,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
         return;
     }
 
-    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
+    llama_sampler_softmax_impl(cur_p, true);
 
     int pos_last = 0;
 
@@ -1262,7 +1232,6 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
             /* .seed          = */ seed,
             /* .seed_cur      = */ seed_cur,
             /* .rng           = */ std::mt19937(seed_cur),
-            /* .buf_sort      = */ {},
         }
     );
 }
@@ -1283,7 +1252,6 @@ struct llama_sampler_mirostat {
     float mu;
 
     std::mt19937    rng;
-    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
@@ -1293,7 +1261,7 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*s
 static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
+    llama_sampler_softmax_impl(cur_p, true);
 
     // Estimate s_hat using the most probable m tokens
     float s_hat = 0.0;
@@ -1311,9 +1279,9 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke
     float epsilon_hat = s_hat - 1;
     float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);
 
-    llama_sampler_top_k_impl(cur_p, std::max(int(k), 1), ctx->buf_sort);
+    llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
 
-    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
+    llama_sampler_softmax_impl(cur_p, true);
 
     const int idx = llama_sample_dist(cur_p, ctx->rng);
 
@@ -1374,7 +1342,6 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
             /* .m        = */ m,
             /* .mu       = */ 2.0f*tau,
             /* .rng      = */ std::mt19937(seed_cur),
-            /* .buf_sort = */ {},
         }
     );
 }
@@ -1390,8 +1357,7 @@ struct llama_sampler_mirostat_v2 {
 
     float mu;
 
-    std::mt19937    rng;
-    llama_sort_data buf_sort;
+    std::mt19937 rng;
 };
 
 static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) {
@@ -1401,7 +1367,7 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler *
 static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
+    llama_sampler_softmax_impl(cur_p, true);
 
     // Truncate the words with surprise values greater than mu
     cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
@@ -1413,7 +1379,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
     }
 
     // Normalize the probabilities of the remaining words
-    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
+    llama_sampler_softmax_impl(cur_p, true);
 
     const int idx = llama_sample_dist(cur_p, ctx->rng);
 
@@ -1473,7 +1439,6 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
             /* .eta      = */ eta,
             /* .mu       = */ 2.0f*tau,
             /* .rng      = */ std::mt19937(seed_cur),
-            /* .buf_sort = */ {},
         }
     );
 }
@@ -1807,8 +1772,6 @@ struct llama_sampler * llama_sampler_init_penalties(
 
 struct llama_sampler_top_n_sigma {
     const float n;
-
-    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) {
@@ -1855,7 +1818,7 @@ static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_t
         }
     }
 
-    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
+    llama_sampler_softmax_impl(cur_p, true);
 }
 
 static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
@@ -1880,8 +1843,7 @@ struct llama_sampler * llama_sampler_init_top_n_sigma(float n) {
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_n_sigma_i,
         /* .ctx   = */ new llama_sampler_top_n_sigma {
-            /* .n        = */ n,
-            /* .buf_sort = */ {},
+            /* .n = */ n,
         }
     );
 }
@@ -2371,8 +2333,6 @@ struct llama_sampler_infill {
 
     std::vector<char> buf0;
     std::vector<char> buf1;
-
-    llama_sort_data buf_sort;
 };
 
 static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
@@ -2382,7 +2342,7 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smp
 static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_infill *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p, &ctx->buf_sort);
+    llama_sampler_softmax_impl(cur_p, true);
 
 #if defined(GGML_DEBUG_SAMPLER_INFILL)
 #define LOG_DBG_CUR LLAMA_LOG_DEBUG
@@ -2579,10 +2539,9 @@ struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * voca
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_infill_i,
         /* .ctx   = */ new llama_sampler_infill {
-            /* .vocab    = */ vocab,
-            /* .buf0     = */ std::vector<char>(512),
-            /* .buf1     = */ std::vector<char>(512),
-            /* .buf_sort = */ {},
+            /* .vocab = */ vocab,
+            /* .buf0  = */ std::vector<char>(512),
+            /* .buf1  = */ std::vector<char>(512),
         }
     );
 }

From de2902dc766cd66919d10d776e75344fd822dbb4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 31 Aug 2025 19:59:19 +0300
Subject: [PATCH 6/9] sampling : use std::copy instead of memcpy

---
 src/llama-sampling.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 1d0baa7c03683..ed8632c687026 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -201,7 +201,7 @@ static void llama_token_data_array_sort_inplace(llama_token_data_array * cur_p,
 
     llama_token_data_array_sort(*cur_p, k, tmp);
 
-    std::memcpy(cur_p->data, tmp.data(), k*sizeof(llama_token_data));
+    std::copy(tmp.data(), tmp.data() + k, cur_p->data);
 }
 
 static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
@@ -754,7 +754,7 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
 
     // Resize the output vector to keep only the top-p tokens
     if (!cur_p->sorted) {
-        std::memcpy(cur_p->data, buf_sort.data(), last_idx*sizeof(llama_token_data));
+        std::copy(buf_sort.data(), buf_sort.data() + last_idx, cur_p->data);
         cur_p->sorted = true;
     }
 
@@ -828,7 +828,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
 
         // if we have enough values the operation was a success
         if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
-            memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
             cur_p->size = filtered_tokens.size();
             min_p_applied = true;
         }

From 6d2a38c79c521946b0b9da508d4d01eaa3d0f402 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 31 Aug 2025 20:03:47 +0300
Subject: [PATCH 7/9] sampling : clarify purpose of partial sort helpers

ggml-ci
---
 src/llama-sampling.cpp | 48 +++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index ed8632c687026..c32f4c9af7367 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -129,7 +129,8 @@ struct ring_buffer {
 };
 
 // writes result in res, does not mutate cur
-static void llama_token_data_array_sort(const llama_token_data_array & cur, int k, std::vector<llama_token_data> & data) {
+// reduces the size of cur_p to npartial, keeping only the top npartial elements
+static void llama_token_data_array_partial_sort(const llama_token_data_array & cur, int npartial, std::vector<llama_token_data> & res) {
     static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit > b.logit;
     };
@@ -158,12 +159,12 @@ static void llama_token_data_array_sort(const llama_token_data_array & cur, int
     int ib = nbuckets - 1;
     for ( ; ib >= 0; --ib) {
         nhave += histo[ib];
-        if (nhave >= k) {
+        if (nhave >= npartial) {
             break;
         }
     }
-    data.resize(nhave);
-    auto * ptr = data.data();
+    res.resize(nhave);
+    auto * ptr = res.data();
     bucket_ptrs.reserve(nbuckets - ib);
     for (int j = nbuckets - 1; j >= ib; --j) {
         bucket_ptrs.push_back(ptr);
@@ -176,32 +177,39 @@ static void llama_token_data_array_sort(const llama_token_data_array & cur, int
         }
     }
 
-    ptr = data.data();
+    ptr = res.data();
     int ndone = 0;
     for (int j = nbuckets - 1; j > ib; --j) {
         std::sort(ptr, ptr + histo[j], comp);
         ptr += histo[j];
         ndone += histo[j];
     }
-    std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
+    std::partial_sort(ptr, ptr + npartial - ndone, ptr + histo[ib], comp);
 }
 
-// buf is a helper buffer that can optionally be utilized
-static void llama_token_data_array_sort_inplace(llama_token_data_array * cur_p, int k) {
+// partially sorts cur_p in place by descending logit, then shrinks it to the top npartial elements and marks it sorted
+static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p, int npartial) {
     static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit > b.logit;
     };
 
-    if (k <= 128) {
-        std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
+    if (npartial <= 128) {
+        std::partial_sort(cur_p->data, cur_p->data + npartial, cur_p->data + cur_p->size, comp);
+
+        cur_p->size = npartial;
+        cur_p->sorted = true;
+
         return;
     }
 
     std::vector<llama_token_data> tmp;
 
-    llama_token_data_array_sort(*cur_p, k, tmp);
+    llama_token_data_array_partial_sort(*cur_p, npartial, tmp);
+
+    std::copy(tmp.data(), tmp.data() + npartial, cur_p->data);
 
-    std::copy(tmp.data(), tmp.data() + k, cur_p->data);
+    cur_p->size = npartial;
+    cur_p->sorted = true;
 }
 
 static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
@@ -281,8 +289,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_s
 
     // Sort the logits in descending order if requested
     if (do_sort && !cur_p->sorted) {
-        llama_token_data_array_sort_inplace(cur_p, cur_p->size);
-        cur_p->sorted = true;
+        llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
     }
 
     float max_l = cur_p->data[0].logit;
@@ -318,8 +325,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
 
     // Sort scores in descending order
     if (!cur_p->sorted) {
-        llama_token_data_array_sort_inplace(cur_p, k);
-        cur_p->sorted = true;
+        llama_token_data_array_partial_sort_inplace(cur_p, k);
     }
 
     cur_p->size = k;
@@ -722,12 +728,11 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
     // if not sorted, try adaptive top-k sorting
     if (!cur_p->sorted && cur_p->size > 1024) {
         k = std::min<size_t>(256, cur_p->size);
-        llama_token_data_array_sort(*cur_p, k, buf_sort);
+        llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
         pdata = buf_sort.data();
     } else if (!cur_p->sorted) {
         // small candidates -> sort inplace
-        llama_token_data_array_sort_inplace(cur_p, k);
-        cur_p->sorted = true;
+        llama_token_data_array_partial_sort_inplace(cur_p, k);
     }
 
     // Compute the cumulative probabilities
@@ -747,7 +752,7 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
         // we exceeded the current top-k heuristic -> increase k and continue
         if (!cur_p->sorted && i == k - 1) {
             k = cur_p->size;
-            llama_token_data_array_sort(*cur_p, k, buf_sort);
+            llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
             pdata = buf_sort.data();
         }
     }
@@ -838,8 +843,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
     if (!min_p_applied) {
         // Sort the logits in descending order
         if (!cur_p->sorted) {
-            llama_token_data_array_sort_inplace(cur_p, cur_p->size);
-            cur_p->sorted = true;
+            llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
         }
 
         const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max

From 08d5ff4f7a4b098aef0f0b02ada297f88f72af76 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 31 Aug 2025 20:05:49 +0300
Subject: [PATCH 8/9] cont : remove wrong comment [no ci]

---
 src/llama-sampling.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index c32f4c9af7367..e8c0fc3418bf3 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -129,7 +129,6 @@ struct ring_buffer {
 };
 
 // writes result in res, does not mutate cur
-// reduces the size of cur_p to npartial, keeping only the top npartial elements
 static void llama_token_data_array_partial_sort(const llama_token_data_array & cur, int npartial, std::vector<llama_token_data> & res) {
     static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit > b.logit;

From 1136efb2ff12679534ecea4f5957526935bcee7b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 30 Aug 2025 13:54:24 +0300
Subject: [PATCH 9/9] common : update comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 common/sampling.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/sampling.h b/common/sampling.h
index c7f3278b8fe48..e198eecda3810 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -86,8 +86,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers
 
 // access the internal list of current candidate tokens
-// if do_sort == true, the candidates will be sorted (in descending order of probability) in case they are not already sorted
-// if do_sort == false, the candidates *might* not be sorted. use the .sorted flag of the result to determine that
+// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
+// the .sorted flag of the result indicates whether the returned candidates are sorted
 llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
 
 // get the last accepted token