Skip to content

Commit ae46275

Browse files
committed
Sync llama : remove KV cache defragmentation logic
1 parent 5022227 commit ae46275

File tree

4 files changed

+2
-8
lines changed

4 files changed

+2
-8
lines changed

llama_cpp/llama.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ def __init__(
8888
yarn_beta_fast: float = 32.0,
8989
yarn_beta_slow: float = 1.0,
9090
yarn_orig_ctx: int = 0,
91-
defrag_thold: float = -1.0,
9291
logits_all: bool = False,
9392
embedding: bool = False,
9493
offload_kqv: bool = True,
@@ -172,7 +171,6 @@ def __init__(
172171
yarn_beta_fast: YaRN low correction dim
173172
yarn_beta_slow: YaRN high correction dim
174173
yarn_orig_ctx: YaRN original context size
175-
defrag_thold: Defragment the KV cache if holes/size > thold, <= 0 disabled (default)
176174
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
177175
embedding: Embedding mode only.
178176
offload_kqv: Offload K, Q, V to GPU.
@@ -342,7 +340,6 @@ def __init__(
342340
yarn_beta_slow if yarn_beta_slow != 0.0 else 0
343341
)
344342
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
345-
self.context_params.defrag_thold = defrag_thold
346343
self._logits_all = logits_all if draft_model is None else True
347344
self.context_params.embeddings = embedding # TODO: Rename to embeddings
348345
self.context_params.offload_kqv = offload_kqv
@@ -2211,7 +2208,6 @@ def __getstate__(self):
22112208
yarn_beta_fast=self.context_params.yarn_beta_fast,
22122209
yarn_beta_slow=self.context_params.yarn_beta_slow,
22132210
yarn_orig_ctx=self.context_params.yarn_orig_ctx,
2214-
defrag_thold=self.context_params.defrag_thold,
22152211
logits_all=self._logits_all,
22162212
embedding=self.context_params.embeddings,
22172213
offload_kqv=self.context_params.offload_kqv,

llama_cpp/llama_cpp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@ class llama_model_params(ctypes.Structure):
802802
# float yarn_beta_fast; // YaRN low correction dim
803803
# float yarn_beta_slow; // YaRN high correction dim
804804
# uint32_t yarn_orig_ctx; // YaRN original context size
805-
# float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
805+
# float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, < 0 disabled (default)
806806

807807
# ggml_backend_sched_eval_callback cb_eval;
808808
# void * cb_eval_user_data;
@@ -848,7 +848,7 @@ class llama_context_params(ctypes.Structure):
848848
yarn_beta_fast (float): YaRN low correction dim
849849
yarn_beta_slow (float): YaRN high correction dim
850850
yarn_orig_ctx (int): YaRN original context size
851-
defrag_thold (float): defragment the KV cache if holes/size > thold, <= 0 disabled (default)
851+
defrag_thold (float): [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
852852
cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval
853853
cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
854854
type_k (int): data type for K cache

llama_cpp/server/model.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,6 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
288288
yarn_beta_fast=settings.yarn_beta_fast,
289289
yarn_beta_slow=settings.yarn_beta_slow,
290290
yarn_orig_ctx=settings.yarn_orig_ctx,
291-
defrag_thold=settings.defrag_thold,
292291
mul_mat_q=settings.mul_mat_q,
293292
logits_all=settings.logits_all,
294293
embedding=settings.embedding,

llama_cpp/server/settings.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ class ModelSettings(BaseSettings):
9595
yarn_beta_fast: float = Field(default=32.0)
9696
yarn_beta_slow: float = Field(default=1.0)
9797
yarn_orig_ctx: int = Field(default=0)
98-
defrag_thold: float = Field(default=-1.0)
9998
mul_mat_q: bool = Field(
10099
default=True, description="if true, use experimental mul_mat_q kernels"
101100
)

0 commit comments

Comments (0)