server : fix draft context not being released (ggml-org#11354)

slaren · web-flow · commit 12c2bdf2de34 · 2025-01-22T17:44:40.000+01:00
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -1772,6 +1772,9 @@ struct server_context {
             // force F16 KV cache for the draft model for extra performance
             cparams_dft.type_k = GGML_TYPE_F16;
             cparams_dft.type_v = GGML_TYPE_F16;
+
+            // the context is not needed - we will create one for each slot
+            llama_init_dft.context.reset();
         }
 
         chat_templates = common_chat_templates_from_model(model, params_base.chat_template);

Original file line number	Diff line number	Diff line change
`@@ -1772,6 +1772,9 @@ struct server_context {`
`1772`	`1772`	`// force F16 KV cache for the draft model for extra performance`
`1773`	`1773`	`cparams_dft.type_k = GGML_TYPE_F16;`
`1774`	`1774`	`cparams_dft.type_v = GGML_TYPE_F16;`
	`1775`	`+`
	`1776`	`+ // the context is not needed - we will create one for each slot`
	`1777`	`+ llama_init_dft.context.reset();`
`1775`	`1778`	`}`
`1776`	`1779`
`1777`	`1780`	`chat_templates = common_chat_templates_from_model(model, params_base.chat_template);`