Skip to content

Commit 272df3f

Browse files
committed
server : disable speculative decoding for SWA models
1 parent b4f7dcf commit 272df3f

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

tools/server/server.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1938,7 +1938,6 @@ struct server_context {
19381938
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
19391939
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
19401940
params_dft.n_parallel = 1;
1941-
params_dft.swa_full = true; // TODO: this is not optimal and can be improved
19421941

19431942
// force F16 KV cache for the draft model for extra performance
19441943
params_dft.cache_type_k = GGML_TYPE_F16;
@@ -2017,6 +2016,11 @@ struct server_context {
20172016
params_base.n_cache_reuse = 0;
20182017
SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
20192018
}
2019+
2020+
if (!params_base.speculative.model.path.empty()) {
2021+
SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
2022+
return false;
2023+
}
20202024
}
20212025

20222026
return true;

0 commit comments

Comments
 (0)