Commit 9ebebef

llama : remove KV cache defragmentation logic (ggml-org#15473)
ggml-ci
1 parent ad5c975 commit 9ebebef

File tree

16 files changed: 32 additions, 440 deletions


common/arg.cpp

Lines changed: 4 additions & 2 deletions

@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
-        string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+        string_format("KV cache defragmentation threshold (DEPRECATED)"),
         [](common_params & params, const std::string & value) {
-            params.defrag_thold = std::stof(value);
+            GGML_UNUSED(params);
+            GGML_UNUSED(value);
+            LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(
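
Usage note: -dt/--defrag-thold (and the LLAMA_ARG_DEFRAG_THOLD environment variable) is still accepted so that existing command lines keep working, but the value is now discarded and the only effect is the deprecation warning shown above.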

common/common.cpp

Lines changed: 0 additions & 1 deletion

@@ -1152,7 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
     cparams.pooling_type      = params.pooling_type;
     cparams.attention_type    = params.attention_type;
-    cparams.defrag_thold      = params.defrag_thold;
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;

common/common.h

Lines changed: 0 additions & 1 deletion

@@ -288,7 +288,6 @@ struct common_params {
     float   yarn_beta_fast = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;    // YaRN original context length
-    float   defrag_thold   = 0.1f; // KV cache defragmentation threshold
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
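
Note that common_params drops the field outright, so in-tree code that still sets params.defrag_thold stops compiling, whereas the public C API below keeps the struct member for compatibility.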

examples/llama.vim

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@
 "
 " start the llama.cpp server with a FIM-compatible model. for example:
 "
-"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256
 "
 "   --batch-size [512, model max context]
 "

include/llama.h

Lines changed: 1 addition & 1 deletion

@@ -312,7 +312,7 @@ extern "C" {
         float    yarn_beta_fast;   // YaRN low correction dim
         float    yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx;    // YaRN original context size
-        float    defrag_thold;     // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+        float    defrag_thold;     // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
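
Since only the comment changes here, C API callers stay source- and ABI-compatible: the struct member remains, but nothing reads it anymore. A minimal sketch of caller code that still compiles after this commit (model and context creation elided, as they are unchanged):

#include "llama.h"

int main(void) {
    llama_context_params cparams = llama_context_default_params();
    // [DEPRECATED] assigning the field still compiles, but no code path
    // reads the value anymore, so no defragmentation pass is scheduled
    cparams.defrag_thold = 0.1f;
    // ... load a model and create the context as usual ...
    return 0;
}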

scripts/compare-llama-bench.py

Lines changed: 0 additions & 1 deletion

@@ -28,7 +28,6 @@
     "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
     "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
     "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
-    "defrag_thold",
     "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth",
     "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
 ]

src/llama-context.cpp

Lines changed: 1 addition & 2 deletions

@@ -39,7 +39,6 @@ llama_context::llama_context(
     cparams.yarn_attn_factor = params.yarn_attn_factor;
     cparams.yarn_beta_fast   = params.yarn_beta_fast;
     cparams.yarn_beta_slow   = params.yarn_beta_slow;
-    cparams.defrag_thold     = params.defrag_thold;
     cparams.embeddings       = params.embeddings;
     cparams.offload_kqv      = params.offload_kqv;
     cparams.flash_attn       = params.flash_attn;
@@ -978,7 +977,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     bool did_optimize = false;
 
-    // handle any pending defrags/shifts
+    // handle any pending shifts/copies
     memory_update(false);
 
     llama_memory_context_ptr mctx;
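
With the defragmentation pass gone, the memory_update(false) call at the top of decode() only has pending shifts and copies left to apply, which is what the updated comment reflects.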

src/llama-cparams.h

Lines changed: 0 additions & 1 deletion

@@ -24,7 +24,6 @@ struct llama_cparams {
     float yarn_attn_factor;
     float yarn_beta_fast;
     float yarn_beta_slow;
-    float defrag_thold;
 
     bool embeddings;
     bool causal_attn;
