@@ -88,7 +88,6 @@ def __init__(
8888 yarn_beta_fast : float = 32.0 ,
8989 yarn_beta_slow : float = 1.0 ,
9090 yarn_orig_ctx : int = 0 ,
91- defrag_thold : float = - 1.0 ,
9291 logits_all : bool = False ,
9392 embedding : bool = False ,
9493 offload_kqv : bool = True ,
@@ -172,7 +171,6 @@ def __init__(
172171 yarn_beta_fast: YaRN low correction dim
173172 yarn_beta_slow: YaRN high correction dim
174173 yarn_orig_ctx: YaRN original context size
175- defrag_thold: Defragment the KV cache if holes/size > thold, <= 0 disabled (default)
176174 logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
177175 embedding: Embedding mode only.
178176 offload_kqv: Offload K, Q, V to GPU.
@@ -342,7 +340,6 @@ def __init__(
342340 yarn_beta_slow if yarn_beta_slow != 0.0 else 0
343341 )
344342 self .context_params .yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
345- self .context_params .defrag_thold = defrag_thold
346343 self ._logits_all = logits_all if draft_model is None else True
347344 self .context_params .embeddings = embedding # TODO: Rename to embeddings
348345 self .context_params .offload_kqv = offload_kqv
@@ -2211,7 +2208,6 @@ def __getstate__(self):
22112208 yarn_beta_fast = self .context_params .yarn_beta_fast ,
22122209 yarn_beta_slow = self .context_params .yarn_beta_slow ,
22132210 yarn_orig_ctx = self .context_params .yarn_orig_ctx ,
2214- defrag_thold = self .context_params .defrag_thold ,
22152211 logits_all = self ._logits_all ,
22162212 embedding = self .context_params .embeddings ,
22172213 offload_kqv = self .context_params .offload_kqv ,
0 commit comments