File tree Expand file tree Collapse file tree 3 files changed +5
-1
lines changed
Expand file tree Collapse file tree 3 files changed +5
-1
lines changed Original file line number Diff line number Diff line change @@ -462,7 +462,9 @@ def free_lora_adapter():
462462
463463 self .n_tokens = 0
464464 self .input_ids : npt .NDArray [np .intc ] = np .ndarray ((n_ctx ,), dtype = np .intc )
465- self .scores : npt .NDArray [np .single ] = np .ndarray ((n_batch , self ._n_vocab ), dtype = np .single )
465+ self .scores : npt .NDArray [np .single ] = np .ndarray (
466+ (n_ctx if logits_all == True else n_batch , self ._n_vocab ), dtype = np .single
467+ )
466468
467469 self ._mirostat_mu = ctypes .c_float (
468470 2.0 * 5.0
Original file line number Diff line number Diff line change @@ -276,6 +276,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
276276 yarn_orig_ctx = settings .yarn_orig_ctx ,
277277 defrag_thold = settings .defrag_thold ,
278278 mul_mat_q = settings .mul_mat_q ,
279+ logits_all = settings .logits_all ,
279280 embedding = settings .embedding ,
280281 offload_kqv = settings .offload_kqv ,
281282 flash_attn = settings .flash_attn ,
Original file line number Diff line number Diff line change @@ -99,6 +99,7 @@ class ModelSettings(BaseSettings):
9999 mul_mat_q : bool = Field (
100100 default = True , description = "if true, use experimental mul_mat_q kernels"
101101 )
102+ logits_all : bool = Field (default = True , description = "Whether to return logits." )
102103 embedding : bool = Field (default = False , description = "Whether to use embeddings." )
103104 offload_kqv : bool = Field (
104105 default = True , description = "Whether to offload kqv to the GPU."
You can’t perform that action at this time.
0 commit comments