Commit 1a1d920

Sync context: remove logits_all flag and update API
1 parent b510bb0 commit 1a1d920

6 files changed: +58 / -80 lines

llama_cpp/_internals.py

Lines changed: 25 additions & 17 deletions
@@ -279,23 +279,35 @@ def n_ctx(self) -> int:
     def pooling_type(self) -> int:
         return llama_cpp.llama_pooling_type(self.ctx)
 
-    def kv_cache_clear(self):
-        llama_cpp.llama_kv_cache_clear(self.ctx)
+    def kv_self_clear(self):
+        llama_cpp.llama_kv_self_clear(self.ctx)
 
-    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
-        llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1)
+    def kv_self_seq_rm(self, seq_id: int, p0: int, p1: int):
+        llama_cpp.llama_kv_self_seq_rm(self.ctx, seq_id, p0, p1)
 
-    def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
-        llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
+    def kv_self_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
+        llama_cpp.llama_kv_self_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
 
-    def kv_cache_seq_keep(self, seq_id: int):
-        llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id)
+    def kv_self_seq_keep(self, seq_id: int):
+        llama_cpp.llama_kv_self_seq_keep(self.ctx, seq_id)
 
-    def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
-        llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift)
+    def kv_self_seq_add(self, seq_id: int, p0: int, p1: int, delta: int):
+        llama_cpp.llama_kv_self_seq_add(self.ctx, seq_id, p0, p1, delta)
+
+    def kv_self_seq_div(self, seq_id: int, p0: int, p1: int, d: int):
+        llama_cpp.llama_kv_self_seq_div(self.ctx, seq_id, p0, p1, d)
+
+    def kv_self_seq_pos_max(self, seq_id: int):
+        llama_cpp.llama_kv_self_seq_pos_max(self.ctx, seq_id)
+
+    def kv_self_defrag(self):
+        llama_cpp.llama_kv_self_defrag(self.ctx)
+
+    def kv_self_can_shift(self) -> bool:
+        llama_cpp.llama_kv_self_can_shift(self.ctx)
 
     def get_state_size(self) -> int:
-        return llama_cpp.llama_get_state_size(self.ctx)
+        return llama_cpp.llama_state_get_size(self.ctx)
 
     # TODO: copy_state_data

@@ -502,18 +514,16 @@ def n_tokens(self) -> int:
     def reset(self):
         self.batch.n_tokens = 0
 
-    def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
+    def set_batch(self, batch: Sequence[int], n_past: int):
         n_tokens = len(batch)
         self.batch.n_tokens = n_tokens
         for i in range(n_tokens):
             self.batch.token[i] = batch[i]
             self.batch.pos[i] = n_past + i
             self.batch.seq_id[i][0] = 0
             self.batch.n_seq_id[i] = 1
-            self.batch.logits[i] = logits_all
-        self.batch.logits[n_tokens - 1] = True
 
-    def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
+    def add_sequence(self, batch: Sequence[int], seq_id: int):
         n_tokens = len(batch)
         n_tokens0 = self.batch.n_tokens
         self.batch.n_tokens += n_tokens

@@ -523,8 +533,6 @@ def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
             self.batch.pos[j] = i
             self.batch.seq_id[j][0] = seq_id
             self.batch.n_seq_id[j] = 1
-            self.batch.logits[j] = logits_all
-        self.batch.logits[n_tokens - 1] = True
 
 
 class LlamaTokenDataArray:
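
For orientation, a minimal usage sketch of the renamed KV-cache wrappers, assuming the keyword-only LlamaModel/LlamaContext constructors from _internals; the model path, context size, and positions are placeholders for illustration, not part of this commit:

import llama_cpp
from llama_cpp._internals import LlamaContext, LlamaModel

# Placeholder GGUF path; any local model file would do for this sketch.
model = LlamaModel(
    path_model="./model.gguf", params=llama_cpp.llama_model_default_params()
)
cparams = llama_cpp.llama_context_default_params()
cparams.n_ctx = 512
ctx = LlamaContext(model=model, params=cparams)

ctx.kv_self_clear()                # was kv_cache_clear(): drop all cached tokens
ctx.kv_self_seq_rm(-1, 16, -1)     # was kv_cache_seq_rm(): forget positions >= 16 in every sequence
ctx.kv_self_seq_cp(0, 1, 0, 16)    # was kv_cache_seq_cp(): share the first 16 positions of seq 0 with seq 1
ctx.kv_self_seq_add(1, 16, -1, 4)  # replaces kv_cache_seq_shift(): shift positions >= 16 of seq 1 by +4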

llama_cpp/llama.py

Lines changed: 9 additions & 34 deletions
@@ -89,7 +89,6 @@ def __init__(
         yarn_beta_fast: float = 32.0,
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
-        logits_all: bool = False,
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,

@@ -170,7 +169,6 @@ def __init__(
             yarn_beta_fast: YaRN low correction dim
             yarn_beta_slow: YaRN high correction dim
             yarn_orig_ctx: YaRN original context size
-            logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.

@@ -341,9 +339,6 @@ def __init__(
             yarn_beta_slow if yarn_beta_slow != 0.0 else 0
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
-        self.context_params.logits_all = (
-            logits_all if draft_model is None else True
-        )  # Must be set to True for speculative decoding
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn

@@ -457,9 +452,7 @@ def free_lora_adapter():
 
         self.n_tokens = 0
         self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
-        self.scores: npt.NDArray[np.single] = np.ndarray(
-            (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single
-        )
+        self.scores: npt.NDArray[np.single] = np.ndarray((n_batch, self._n_vocab), dtype=np.single)
 
         self._mirostat_mu = ctypes.c_float(
             2.0 * 5.0

@@ -568,7 +561,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx if self.context_params.logits_all else 1,
+            maxlen=self._n_ctx
         )
 
     def tokenize(

@@ -635,34 +628,18 @@ def eval(self, tokens: Sequence[int]):
         Args:
             tokens: The list of tokens to evaluate.
         """
-        self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+        self._ctx.kv_self_seq_rm(-1, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
             n_past = self.n_tokens
             n_tokens = len(batch)
             self._batch.set_batch(
-                batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
+                batch=batch, n_past=n_past
             )
             self._ctx.decode(self._batch)
             # Save tokens
             self.input_ids[n_past : n_past + n_tokens] = batch
-            # Save logits
-            if self.context_params.logits_all:
-                rows = n_tokens
-                cols = self._n_vocab
-                logits = np.ctypeslib.as_array(
-                    self._ctx.get_logits(), shape=(rows * cols,)
-                )
-                self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits
-            else:
-                # rows = 1
-                # cols = self._n_vocab
-                # logits = np.ctypeslib.as_array(
-                #     self._ctx.get_logits(), shape=(rows * cols,)
-                # )
-                # self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits
-                # NOTE: Now that sampling is done inside the sampler, logits are only needed for logprobs which requires logits_all
-                pass
+
             # Update n_tokens
             self.n_tokens += n_tokens

@@ -988,7 +965,7 @@ def generate(
 
             if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
                 self.n_tokens = sample_idx
-                self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+                self._ctx.kv_self_seq_rm(-1, self.n_tokens, -1)
                 break
 
             if self.draft_model is not None:

@@ -1062,7 +1039,6 @@ def embed(
 
         # get pooling information
         pooling_type = self.pooling_type()
-        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
 
         if self.context_params.embeddings is False:
             raise RuntimeError(

@@ -1140,7 +1116,7 @@ def decode_batch(seq_sizes: List[int]):
                 p_batch = 0
 
             # add to batch
-            self._batch.add_sequence(tokens, p_batch, logits_all)
+            self._batch.add_sequence(tokens, p_batch)
 
             # update batch stats
             s_batch.append(n_tokens)

@@ -1340,9 +1316,9 @@ def logit_bias_processor(
         else:
             stop_sequences = []
 
-        if logprobs is not None and self.context_params.logits_all is False:
+        if logprobs is not None:
             raise ValueError(
-                "logprobs is not supported for models created with logits_all=False"
+                "logprobs is not supported for models"
             )
 
         if self.cache:

@@ -2213,7 +2189,6 @@ def __getstate__(self):
             yarn_beta_fast=self.context_params.yarn_beta_fast,
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
-            logits_all=self.context_params.logits_all,
            embedding=self.context_params.embeddings,
            offload_kqv=self.context_params.offload_kqv,
            flash_attn=self.context_params.flash_attn,
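
On the high-level API, callers that previously passed logits_all=True to the Llama constructor now simply drop the argument, and completion calls that request logprobs raise ValueError after this change. A minimal sketch, with a placeholder model path:

from llama_cpp import Llama

# Before this commit: Llama(model_path="./model.gguf", logits_all=True)
# After: the flag is gone; per-token logits are controlled by llama_batch.logits upstream.
llm = Llama(model_path="./model.gguf", n_ctx=512)

out = llm("Q: Name the planets in the solar system. A:", max_tokens=32)
print(out["choices"][0]["text"])

# llm("...", logprobs=5)  # now raises ValueError("logprobs is not supported for models")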

llama_cpp/llama_cpp.py

Lines changed: 23 additions & 24 deletions
@@ -751,7 +751,7 @@ class llama_model_params(ctypes.Structure):
 
 
 # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
-# // https://github.com/ggerganov/llama.cpp/pull/7544
+# // https://github.com/ggml-org/llama.cpp/pull/7544
 # struct llama_context_params {
 #     uint32_t n_ctx;   // text context, 0 = from model
 #     uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode

@@ -764,7 +764,7 @@ class llama_model_params(ctypes.Structure):
 #     enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
 #     enum llama_attention_type attention_type; // attention type to use for embeddings
 
-#     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+#     // ref: https://github.com/ggml-org/llama.cpp/pull/2054
 #     float rope_freq_base;  // RoPE base frequency, 0 = from model
 #     float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
 #     float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model

@@ -779,20 +779,17 @@ class llama_model_params(ctypes.Structure):
 
 #     enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
 #     enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
-
-#     // Keep the booleans together to avoid misalignment during copy-by-value.
-#     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-#     bool embeddings;  // if true, extract embeddings (together with logits)
-#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-#     bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-#     bool no_perf;     // whether to measure performance timings
-
-
 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted
 #     // currently works only with CPU execution
 #     ggml_abort_callback abort_callback;
 #     void * abort_callback_data;
+
+#     // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+#     bool embeddings;  // if true, extract embeddings (together with logits)
+#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+#     bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+#     bool no_perf;     // whether to measure performance timings
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context

@@ -819,13 +816,12 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
         type_v (int): data type for V cache
-        logits_all (bool): the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
+        abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
         embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
         flash_attn (bool): whether to use flash attention
         no_perf (bool): whether to measure performance timings
-        abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
-        abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
     """
 
     if TYPE_CHECKING:

@@ -850,13 +846,12 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data: ctypes.c_void_p
         type_k: int
         type_v: int
-        logits_all: bool
+        abort_callback: Callable[[ctypes.c_void_p], bool]
+        abort_callback_data: ctypes.c_void_p
         embeddings: bool
         offload_kqv: bool
         flash_attn: bool
         no_perf: bool
-        abort_callback: Callable[[ctypes.c_void_p], bool]
-        abort_callback_data: ctypes.c_void_p
 
     _fields_ = [
         ("n_ctx", ctypes.c_uint32),

@@ -880,13 +875,12 @@ class llama_context_params(ctypes.Structure):
         ("cb_eval_user_data", ctypes.c_void_p),
         ("type_k", ctypes.c_int),
         ("type_v", ctypes.c_int),
-        ("logits_all", ctypes.c_bool),
+        ("abort_callback", ggml_abort_callback),
+        ("abort_callback_data", ctypes.c_void_p),
         ("embeddings", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
         ("flash_attn", ctypes.c_bool),
         ("no_perf", ctypes.c_bool),
-        ("abort_callback", ggml_abort_callback),
-        ("abort_callback_data", ctypes.c_void_p),
     ]
 
 

@@ -2683,10 +2677,12 @@ def llama_batch_free(batch: llama_batch, /):
     ...
 
 
-# // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
-# // Stores the encoder output internally for later use by the decoder cross-attention layers.
+# // Process a batch of tokens.
+# // In contrast to llama_decode() - this call does not use KV cache.
+# // For encode-decoder contexts, processes the batch using the encoder.
+# // Can store the encoder output internally for later use by the decoder's cross-attention layers.
 # // 0 - success
-# // < 0 - error
+# // < 0 - error. the KV cache state is restored to the state before this call
 # LLAMA_API int32_t llama_encode(
 #         struct llama_context * ctx,
 #           struct llama_batch   batch);

@@ -2699,10 +2695,13 @@ def llama_encode(ctx: llama_context_p, batch: llama_batch, /) -> int:
     ...
 
 
+# // Process a batch of tokens.
+# // Requires KV cache.
+# // For encode-decoder contexts, processes the batch using the decoder.
 # // Positive return values does not mean a fatal error, but rather a warning.
 # //   0 - success
 # //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-# // < 0 - error
+# // < 0 - error. the KV cache state is restored to the state before this call
 # LLAMA_API int32_t llama_decode(
 #         struct llama_context * ctx,
 #           struct llama_batch   batch);
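
Because logits_all was removed from llama_context_params and the abort-callback fields now sit ahead of the boolean block, ctypes code should obtain defaults from llama_context_default_params() and set fields by name rather than assuming the old layout; a minimal sketch:

import llama_cpp

cparams = llama_cpp.llama_context_default_params()
cparams.n_ctx = 1024
cparams.embeddings = False
cparams.offload_kqv = True
cparams.flash_attn = False
# cparams.logits_all is gone from _fields_; per-token logits are requested
# through llama_batch.logits when the batch is built instead.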

llama_cpp/server/model.py

Lines changed: 0 additions & 1 deletion
@@ -261,7 +261,6 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         yarn_beta_slow=settings.yarn_beta_slow,
         yarn_orig_ctx=settings.yarn_orig_ctx,
         mul_mat_q=settings.mul_mat_q,
-        logits_all=settings.logits_all,
         embedding=settings.embedding,
         offload_kqv=settings.offload_kqv,
         flash_attn=settings.flash_attn,

llama_cpp/server/settings.py

Lines changed: 0 additions & 1 deletion
@@ -98,7 +98,6 @@ class ModelSettings(BaseSettings):
     mul_mat_q: bool = Field(
         default=True, description="if true, use experimental mul_mat_q kernels"
     )
-    logits_all: bool = Field(default=True, description="Whether to return logits.")
     embedding: bool = Field(default=False, description="Whether to use embeddings.")
     offload_kqv: bool = Field(
         default=True, description="Whether to offload kqv to the GPU."

tests/test_llama.py

Lines changed: 1 addition & 3 deletions
@@ -81,7 +81,6 @@ def test_real_model(llama_cpp_model_path):
     cparams.n_ubatch = 16
     cparams.n_threads = multiprocessing.cpu_count()
     cparams.n_threads_batch = multiprocessing.cpu_count()
-    cparams.logits_all = False
     cparams.flash_attn = True
 
     context = internals.LlamaContext(model=model, params=cparams)

@@ -103,7 +102,7 @@ def test_real_model(llama_cpp_model_path):
     result = tokens
     n_eval = 0
     for _ in range(4):
-        batch.set_batch(tokens, n_past=n_eval, logits_all=False)
+        batch.set_batch(tokens, n_past=n_eval)
         context.decode(batch)
         n_eval += len(tokens)
         token_id = sampler.sample(context, -1)

@@ -122,7 +121,6 @@ def test_real_llama(llama_cpp_model_path):
         n_ubatch=32,
         n_threads=multiprocessing.cpu_count(),
         n_threads_batch=multiprocessing.cpu_count(),
-        logits_all=False,
         flash_attn=True,
     )
