Commit 16bc60a (parent 77160c6)

Sync sampling: optimize samplers by reusing bucket sort
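Context: llama.cpp deprecated and dropped its standalone softmax sampler while optimizing the remaining samplers to reuse an internal bucket sort, and this commit syncs the Python bindings by removing the corresponding wrappers. A minimal before/after sketch of the temp < 0 path, where `sampler` stands in for the internal sampler-chain object used by `_init_sampler` in llama.py (illustrative only, not the library's public API):

# Before: an explicit softmax stage preceded the dist sampler.
def init_dist_chain_old(sampler, temp: float, seed: int) -> None:
    if temp < 0.0:
        sampler.add_softmax()   # explicit softmax over all candidates (removed)
        sampler.add_dist(seed)  # then draw a token from the distribution

# After: the dist sampler alone suffices, since it normalizes
# probabilities itself before drawing.
def init_dist_chain_new(sampler, temp: float, seed: int) -> None:
    if temp < 0.0:
        sampler.add_dist(seed)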

File tree: 3 files changed, +2 -23 lines

  llama_cpp/_internals.py  (+1 -12)
  llama_cpp/llama.py       (+0 -1)
  llama_cpp/llama_cpp.py   (+1 -10)

llama_cpp/_internals.py (1 addition, 12 deletions)
@@ -405,13 +405,6 @@ def sample_repetition_penalties(
         # )
         raise NotImplementedError("sample_repetition_penalties is not implemented in llama.cpp")

-    def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
-        # llama_cpp.llama_sample_softmax(
-        #     self.ctx,
-        #     llama_cpp.byref(candidates.candidates),
-        # )
-        raise NotImplementedError("sample_softmax is not implemented in llama.cpp")
-
     def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
         # llama_cpp.llama_sample_top_k(
         #     self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep
@@ -592,6 +585,7 @@ def __init__(self, *, n_vocab: int):
         self.candidates = llama_cpp.llama_token_data_array(
             data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
             size=self.n_vocab,
+            selected=-1,
             sorted=False,
         )
         self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc)  # type: ignore
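The new `selected=-1` matches the `int64_t selected` field added to the C struct (an index into `data`, not a token id; -1 means no candidate chosen yet). A hedged standalone sketch of the same construction, where `n_vocab` is a stand-in value and the recarray dtype mirrors the one `_LlamaTokenDataArray` uses:

import numpy as np
import llama_cpp

n_vocab = 32000  # stand-in vocabulary size
candidates_data = np.recarray(
    (n_vocab,),
    dtype=np.dtype(
        [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
    ),
)
candidates = llama_cpp.llama_token_data_array(
    data=candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
    size=n_vocab,
    selected=-1,   # index into `data` of the chosen candidate; -1 = none yet
    sorted=False,  # entries are in vocabulary order, not sorted by logit
)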
@@ -729,7 +723,6 @@ def sample(
             ctx_main.sample_grammar(token_data_array, self.grammar)

         if self.params.temp < 0:
-            ctx_main.sample_softmax(token_data_array)
             id = token_data_array.candidates_data.id[0]
         elif self.params.temp == 0:
             id = ctx_main.sample_token_greedy(token_data_array)
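For reference, `id[0]` was meaningful right after the removed call because the softmax sampler sorted candidates by logit in descending order before computing probabilities (per the doc comment removed from llama_cpp.py below), so the first entry was the most likely token. A NumPy sketch of that semantics, illustrative only:

import numpy as np

def softmax_candidates(ids: np.ndarray, logits: np.ndarray):
    order = np.argsort(-logits)           # sort descending by logit
    ids, logits = ids[order], logits[order]
    exps = np.exp(logits - logits.max())  # shift by max for numerical stability
    return ids, exps / exps.sum()         # probabilities sum to 1; ids[0] is the argmax

ids, probs = softmax_candidates(np.array([7, 11, 3]), np.array([0.5, 2.0, -1.0]))
assert ids[0] == 11 and abs(probs.sum() - 1.0) < 1e-6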
@@ -827,10 +820,6 @@ def add_dist(self, seed: int):
         sampler = llama_cpp.llama_sampler_init_dist(seed)
         self._add_sampler(sampler)

-    def add_softmax(self):
-        sampler = llama_cpp.llama_sampler_init_softmax()
-        self._add_sampler(sampler)
-
     def add_top_k(self, k: int):
         sampler = llama_cpp.llama_sampler_init_top_k(k)
         self._add_sampler(sampler)

llama_cpp/llama.py (0 additions, 1 deletion)
@@ -735,7 +735,6 @@ def _init_sampler(
         sampler.add_grammar(self._model, grammar)

         if temp < 0.0:
-            sampler.add_softmax()
             sampler.add_dist(self._seed)
         elif temp == 0.0:
             sampler.add_greedy()

llama_cpp/llama_cpp.py (1 addition, 10 deletions)
@@ -552,7 +552,7 @@ class llama_token_data(ctypes.Structure):
 # llama_token_data * data;
 # size_t size;
 # int64_t selected; // this is the index in the data array (i.e. not the token id)
-# bool sorted;
+# bool sorted; // note: do not assume the data is sorted - always check this flag
 # } llama_token_data_array;
 class llama_token_data_array(ctypes.Structure):
     """Used to sample tokens given logits
@@ -3742,15 +3742,6 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
     ...


-# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
-#            "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
-@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
-def llama_sampler_init_softmax() -> llama_sampler_p:
-    ...
-
-
 # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
 # /// Setting k <= 0 makes this a noop
 # LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
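Migration note, hedged: per the deprecation message above, callers that built a standalone softmax sampler can simply drop it, since samplers such as the dist sampler compute probabilities internally. A sketch using the low-level chain bindings (assuming the llama_sampler_chain_* functions defined elsewhere in llama_cpp.py; parameter values are placeholders):

import llama_cpp

# Build a sampler chain without the removed softmax stage.
chain = llama_cpp.llama_sampler_chain_init(
    llama_cpp.llama_sampler_chain_default_params()
)
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(40))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))
# previously: llama_sampler_init_softmax() would have been added before dist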
