2 files changed, 5 insertions(+), 3 deletions(-)

@@ -1065,6 +1065,7 @@ def embed(
 
         # get pooling information
         pooling_type = self.pooling_type()
+        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
 
         if self.context_params.embeddings is False:
             raise RuntimeError(
@@ -1142,7 +1143,7 @@ def decode_batch(seq_sizes: List[int]):
            p_batch = 0
 
            # add to batch
-           self._batch.add_sequence(tokens, p_batch)
+           self._batch.add_sequence(tokens, p_batch, logits_all)
 
            # update batch stats
            s_batch.append(n_tokens)
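For context on the change above: with pooling type NONE the embed path returns one embedding per token, so the batch must request output at every position rather than only the last one. Below is a minimal runnable sketch of that flow; ToyBatch and its add_sequence are hypothetical stand-ins for the diff's self._batch.add_sequence, and the constant is assumed to mirror llama.cpp's LLAMA_POOLING_TYPE_NONE.

from typing import List

LLAMA_POOLING_TYPE_NONE = 0  # assumed value, mirroring llama.cpp's enum

class ToyBatch:
    """Hypothetical stand-in for the diff's self._batch."""

    def __init__(self) -> None:
        self.tokens: List[int] = []
        self.seq_ids: List[int] = []
        self.output_mask: List[bool] = []

    def add_sequence(self, tokens: List[int], seq_id: int, logits_all: bool) -> None:
        # With logits_all set, every position is marked for output;
        # otherwise only the sequence's final token is.
        for i, tok in enumerate(tokens):
            self.tokens.append(tok)
            self.seq_ids.append(seq_id)
            self.output_mask.append(logits_all or i == len(tokens) - 1)

# Pooling type NONE means each token keeps its own embedding, so the
# embed path must request output at every position.
pooling_type = LLAMA_POOLING_TYPE_NONE
logits_all = pooling_type == LLAMA_POOLING_TYPE_NONE

batch = ToyBatch()
batch.add_sequence([1, 2, 3], seq_id=0, logits_all=logits_all)
assert batch.output_mask == [True, True, True]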
@@ -81,6 +81,7 @@ def test_real_model(llama_cpp_model_path):
     cparams.n_ubatch = 16
     cparams.n_threads = multiprocessing.cpu_count()
     cparams.n_threads_batch = multiprocessing.cpu_count()
+    cparams.logits_all = False
     cparams.flash_attn = True
     cparams.swa_full = True
 
@@ -103,7 +104,7 @@ def test_real_model(llama_cpp_model_path):
     result = tokens
     n_eval = 0
     for _ in range(4):
-        batch.set_batch(tokens, n_past=n_eval)
+        batch.set_batch(tokens, n_past=n_eval, logits_all=False)
         context.decode(batch)
         n_eval += len(tokens)
         token_id = sampler.sample(context, -1)
@@ -122,8 +123,8 @@ def test_real_llama(llama_cpp_model_path):
         n_ubatch=32,
         n_threads=multiprocessing.cpu_count(),
         n_threads_batch=multiprocessing.cpu_count(),
-        flash_attn=True,
         logits_all=False,
+        flash_attn=True,
         swa_full=True
     )
 
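Why the tests pin logits_all=False: during plain generation the sampler only consumes the final position's logits, so requesting per-token output is wasted compute and memory. A standalone sketch under that assumption (output_mask is a hypothetical helper, not part of the library):

from typing import List

def output_mask(n_tokens: int, logits_all: bool) -> List[bool]:
    # Which positions the backend should compute logits for: all of them
    # for embedding with pooling NONE, only the last one for sampling.
    return [logits_all or i == n_tokens - 1 for i in range(n_tokens)]

assert output_mask(4, logits_all=False) == [False, False, False, True]
assert output_mask(4, logits_all=True) == [True, True, True, True]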