Skip to content

Commit ae46275

Browse files
committed
Sync llama : remove KV cache defragmentation logic
1 parent 5022227 commit ae46275

File tree

4 files changed

+2
-8
lines changed

4 files changed

+2
-8
lines changed

llama_cpp/llama.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ def __init__(
8888
yarn_beta_fast: float = 32.0,
8989
yarn_beta_slow: float = 1.0,
9090
yarn_orig_ctx: int = 0,
91-
defrag_thold: float = -1.0,
9291
logits_all: bool = False,
9392
embedding: bool = False,
9493
offload_kqv: bool = True,
@@ -172,7 +171,6 @@ def __init__(
172171
yarn_beta_fast: YaRN low correction dim
173172
yarn_beta_slow: YaRN high correction dim
174173
yarn_orig_ctx: YaRN original context size
175-
defrag_thold: Defragment the KV cache if holes/size > thold, <= 0 disabled (default)
176174
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
177175
embedding: Embedding mode only.
178176
offload_kqv: Offload K, Q, V to GPU.
@@ -342,7 +340,6 @@ def __init__(
342340
yarn_beta_slow if yarn_beta_slow != 0.0 else 0
343341
)
344342
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
345-
self.context_params.defrag_thold = defrag_thold
346343
self._logits_all = logits_all if draft_model is None else True
347344
self.context_params.embeddings = embedding # TODO: Rename to embeddings
348345
self.context_params.offload_kqv = offload_kqv
@@ -2211,7 +2208,6 @@ def __getstate__(self):
22112208
yarn_beta_fast=self.context_params.yarn_beta_fast,
22122209
yarn_beta_slow=self.context_params.yarn_beta_slow,
22132210
yarn_orig_ctx=self.context_params.yarn_orig_ctx,
2214-
defrag_thold=self.context_params.defrag_thold,
22152211
logits_all=self._logits_all,
22162212
embedding=self.context_params.embeddings,
22172213
offload_kqv=self.context_params.offload_kqv,

llama_cpp/llama_cpp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@ class llama_model_params(ctypes.Structure):
802802
# float yarn_beta_fast; // YaRN low correction dim
803803
# float yarn_beta_slow; // YaRN high correction dim
804804
# uint32_t yarn_orig_ctx; // YaRN original context size
805-
# float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
805+
# float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, < 0 disabled (default)
806806

807807
# ggml_backend_sched_eval_callback cb_eval;
808808
# void * cb_eval_user_data;
@@ -848,7 +848,7 @@ class llama_context_params(ctypes.Structure):
848848
yarn_beta_fast (float): YaRN low correction dim
849849
yarn_beta_slow (float): YaRN high correction dim
850850
yarn_orig_ctx (int): YaRN original context size
851-
defrag_thold (float): defragment the KV cache if holes/size > thold, <= 0 disabled (default)
851+
defrag_thold (float): [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
852852
cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval
853853
cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
854854
type_k (int): data type for K cache

llama_cpp/server/model.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,6 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
288288
yarn_beta_fast=settings.yarn_beta_fast,
289289
yarn_beta_slow=settings.yarn_beta_slow,
290290
yarn_orig_ctx=settings.yarn_orig_ctx,
291-
defrag_thold=settings.defrag_thold,
292291
mul_mat_q=settings.mul_mat_q,
293292
logits_all=settings.logits_all,
294293
embedding=settings.embedding,

llama_cpp/server/settings.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ class ModelSettings(BaseSettings):
9595
yarn_beta_fast: float = Field(default=32.0)
9696
yarn_beta_slow: float = Field(default=1.0)
9797
yarn_orig_ctx: int = Field(default=0)
98-
defrag_thold: float = Field(default=-1.0)
9998
mul_mat_q: bool = Field(
10099
default=True, description="if true, use experimental mul_mat_q kernels"
101100
)

0 commit comments

Comments (0)