1 file changed: +4 -4 lines changed
Original file line number | Diff line number | Diff line change
@@ -1872,6 +1872,10 @@ struct server_context {
18721872 params_dft.n_gpu_layers = params_base.speculative .n_gpu_layers ;
18731873 params_dft.n_parallel = 1 ;
18741874
1875+ // force F16 KV cache for the draft model for extra performance
1876+ params_dft.cache_type_k = GGML_TYPE_F16;
1877+ params_dft.cache_type_v = GGML_TYPE_F16;
1878+
18751879 llama_init_dft = common_init_from_params (params_dft);
18761880
18771881 model_dft = llama_init_dft.model .get ();
@@ -1892,10 +1896,6 @@ struct server_context {
18921896 cparams_dft = common_context_params_to_llama (params_dft);
18931897 cparams_dft.n_batch = n_ctx_dft;
18941898
1895- // force F16 KV cache for the draft model for extra performance
1896- cparams_dft.type_k = GGML_TYPE_F16;
1897- cparams_dft.type_v = GGML_TYPE_F16;
1898-
18991899 // the context is not needed - we will create one for each slot
19001900 llama_init_dft.context .reset ();
19011901 }
You can’t perform that action at this time.
0 commit comments