
Commit 0522270

Adds server parameters for draft model cache type. Fixes ggml-org/llama.cpp/#11200
1 parent 2f099b5

File tree: 4 files changed, +33 -4 lines

common/arg.cpp

Lines changed: 26 additions & 0 deletions
@@ -3181,6 +3181,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for K for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_k = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+    add_opt(common_arg(
+        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for V for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_v = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
 
     add_opt(common_arg(
         {"-mv", "--model-vocoder"}, "FNAME",

common/common.h

Lines changed: 3 additions & 0 deletions
@@ -199,6 +199,9 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)
 
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;

tools/server/README.md

Lines changed: 2 additions & 0 deletions
@@ -186,6 +186,8 @@ The project is under active development, and we are [looking for feedback and co
 | `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
 | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
 | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
+| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for speculative decoding model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
+| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for speculative decoding model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
 | `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
 | `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall |
 | `--embd-bge-small-en-default` | use default bge-small-en-v1.5 model (note: can download weights from the internet) |
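As a hypothetical invocation (model file names are placeholders), starting the server with `llama-server -m model.gguf -md draft-model.gguf -ctkd q8_0 -ctvd q8_0` would quantize the draft model's K and V caches to q8_0 independently of the main model, and the same choice can be made through the `LLAMA_ARG_CACHE_TYPE_K_DRAFT` / `LLAMA_ARG_CACHE_TYPE_V_DRAFT` environment variables documented above.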

tools/server/server.cpp

Lines changed: 2 additions & 4 deletions
@@ -1939,10 +1939,8 @@ struct server_context {
             params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;
-
-            // force F16 KV cache for the draft model for extra performance
-            params_dft.cache_type_k = GGML_TYPE_F16;
-            params_dft.cache_type_v = GGML_TYPE_F16;
+            params_dft.cache_type_k = params_base.speculative.cache_type_k;
+            params_dft.cache_type_v = params_base.speculative.cache_type_v;
 
             llama_init_dft = common_init_from_params(params_dft);
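The net effect is that the draft model's KV cache is no longer hard-coded to F16: the server copies whatever `common_params_speculative` carries into `params_dft` before initializing the draft model. A minimal sketch of driving this from code rather than the CLI, assuming the fields added in the common/common.h hunk above and using q8_0 purely as an example:

// Sketch only: programmatic equivalent of passing -ctkd q8_0 -ctvd q8_0,
// assuming the common_params_speculative fields shown in this commit.
#include "common.h"

static void use_quantized_draft_cache(common_params & params) {
    params.speculative.cache_type_k = GGML_TYPE_Q8_0; // K cache of the draft model
    params.speculative.cache_type_v = GGML_TYPE_Q8_0; // V cache of the draft model
    // server_context then propagates these into params_dft before calling
    // common_init_from_params(params_dft), as shown in the hunk above.
}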
