llama_cpp/llama.py: 34 additions & 16 deletions
@@ -66,7 +66,6 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
-        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -90,11 +89,12 @@ def __init__(
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
         defrag_thold: float = -1.0,
+        logits_all: bool = False,
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
-        op_offload: bool = True,
-        swa_full: bool = True,
+        op_offload: Optional[bool] = None,
+        swa_full: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,
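With this hunk, op_offload and swa_full become tri-state: passing None appears to defer to the llama.cpp default rather than forcing True, and logits_all is added to the signature. A minimal constructor sketch under that reading; the model path and n_gpu_layers value are placeholders, not taken from this diff:

from llama_cpp import Llama

# Sketch only: the model path below is a placeholder, not part of this diff.
llm = Llama(
    model_path="./models/example-7b.Q4_K_M.gguf",
    n_gpu_layers=-1,   # -1 offloads all layers (see the 0x7FFFFFFF note below)
    logits_all=True,   # keep logits for every token, not just the last
    op_offload=None,   # None defers to the llama.cpp default
    swa_full=None,     # None defers to the llama.cpp default
)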
@@ -152,7 +152,6 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
-            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
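The docstrings in this hunk describe the GPU-split parameters; a hedged sketch of a layer-wise split across two GPUs follows. The path and split ratios are illustrative, and main_gpu is omitted because LLAMA_SPLIT_MODE_LAYER ignores it per the docstring above:

import llama_cpp
from llama_cpp import Llama

# Illustrative only: split layers across two GPUs in roughly a 60/40 ratio.
llm = Llama(
    model_path="./models/example-7b.Q4_K_M.gguf",  # placeholder path
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
    tensor_split=[0.6, 0.4],
    n_gpu_layers=-1,
)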
@@ -173,6 +172,7 @@ def __init__(
             yarn_beta_slow: YaRN high correction dim
             yarn_orig_ctx: YaRN original context size
             defrag_thold: Defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+            logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
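The logits_all docstring implies the logprobs workflow; a short usage sketch, assuming the llm instance above was built with logits_all=True (the prompt and top-k count are illustrative):

# create_completion can only return logprobs when the model was
# constructed with logits_all=True, per the docstring above.
out = llm.create_completion(
    "The capital of France is",
    max_tokens=8,
    logprobs=5,  # top-5 alternatives per generated token
)
print(out["choices"][0]["logprobs"]["top_logprobs"])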
@@ -230,11 +230,6 @@ def __init__(
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers