Restore more logits_all params (scores buffer sizing, server model argument, and ModelSettings field)

JamePeng · JamePeng · commit cdc7348b1cfc · 2025-07-13T13:32:53.000+08:00
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -462,7 +462,9 @@ def free_lora_adapter():
 
         self.n_tokens = 0
         self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
-        self.scores: npt.NDArray[np.single] = np.ndarray((n_batch, self._n_vocab), dtype=np.single)
+        self.scores: npt.NDArray[np.single] = np.ndarray(
+            (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single
+        )
 
         self._mirostat_mu = ctypes.c_float(
             2.0 * 5.0
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
@@ -276,6 +276,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
             yarn_orig_ctx=settings.yarn_orig_ctx,
             defrag_thold=settings.defrag_thold,
             mul_mat_q=settings.mul_mat_q,
+            logits_all=settings.logits_all,
             embedding=settings.embedding,
             offload_kqv=settings.offload_kqv,
             flash_attn=settings.flash_attn,
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
@@ -99,6 +99,7 @@ class ModelSettings(BaseSettings):
     mul_mat_q: bool = Field(
         default=True, description="if true, use experimental mul_mat_q kernels"
     )
+    logits_all: bool = Field(default=True, description="Whether to return logits.")
     embedding: bool = Field(default=False, description="Whether to use embeddings.")
     offload_kqv: bool = Field(
         default=True, description="Whether to offload kqv to the GPU."

Original file line number	Diff line number	Diff line change
`@@ -99,6 +99,7 @@ class ModelSettings(BaseSettings):`
`99`	`99`	`mul_mat_q: bool = Field(`
`100`	`100`	`default=True, description="if true, use experimental mul_mat_q kernels"`
`101`	`101`	`)`
	`102`	`+ logits_all: bool = Field(default=True, description="Whether to return logits.")`
`102`	`103`	`embedding: bool = Field(default=False, description="Whether to use embeddings.")`
`103`	`104`	`offload_kqv: bool = Field(`
`104`	`105`	`default=True, description="Whether to offload kqv to the GPU."`