
Commit c1703c1

Merge branch 'ggml-org:master' into master
2 parents: 155135d + 13b339b

2 files changed: +14 -4 lines changed

common/common.h

Lines changed: 4 additions & 0 deletions

@@ -507,6 +507,10 @@ struct common_params {
     // return false from callback to abort model loading or true to continue
     llama_progress_callback load_progress_callback = NULL;
     void * load_progress_callback_user_data = NULL;
+
+    bool has_speculative() const {
+        return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
+    }
 };

 // call once at the start of a program if it uses libcommon
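
For context, here is a minimal self-contained sketch of what the new helper expresses: a draft model counts as configured if it was given either as a local path or as a Hugging Face repo. The types below are simplified stand-ins for illustration, not the real llama.cpp structs.

// Hedged sketch; model_params_sketch and common_params_sketch are
// illustrative stand-ins, not the actual common_params types.
#include <cstdio>
#include <string>

struct model_params_sketch {
    std::string path;     // local GGUF path, e.g. "models/draft.gguf"
    std::string hf_repo;  // Hugging Face repo id, e.g. "org/draft-gguf"
};

struct common_params_sketch {
    model_params_sketch speculative_model;

    // mirrors the new helper: a draft model is configured if either
    // a local path or a HF repo was provided
    bool has_speculative() const {
        return !speculative_model.path.empty() || !speculative_model.hf_repo.empty();
    }
};

int main() {
    common_params_sketch p;
    p.speculative_model.hf_repo = "org/draft-gguf"; // hypothetical repo id
    std::printf("speculative configured: %s\n", p.has_speculative() ? "yes" : "no");
    return 0;
}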

tools/server/server.cpp

Lines changed: 10 additions & 4 deletions

@@ -2400,7 +2400,7 @@ struct server_context {

         add_bos_token = llama_vocab_get_add_bos(vocab);

-        if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
+        if (params_base.has_speculative()) {
             SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());

             auto params_dft = params_base;
@@ -2476,7 +2476,7 @@ struct server_context {
             SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
         }

-        if (!params_base.speculative.model.path.empty()) {
+        if (params_base.has_speculative()) {
             SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal");
             return false;
         }
@@ -2520,6 +2520,7 @@ struct server_context {
         if (model_dft) {
             slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);

+            // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK]
             slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
             if (slot.ctx_dft == nullptr) {
                 SRV_ERR("%s", "failed to create draft context\n");
@@ -2825,6 +2826,7 @@ struct server_context {
         }

         // initialize draft batch
+        // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK]
         if (slot.ctx_dft) {
             llama_batch_free(slot.batch_spec);

@@ -4291,6 +4293,8 @@ struct server_context {
         }

         // do speculative decoding
+        // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
+        //       perform the speculative drafting for all sequences at the same time in a single batch
         for (auto & slot : slots) {
             if (!slot.is_processing() || !slot.can_speculate()) {
                 continue;
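
The two TODO comments above describe the planned rework: instead of each slot running its own draft context, a single shared draft llama_context would draft for all eligible sequences in one batch. A hedged, self-contained illustration of that batching idea follows; it uses stand-in types only, not the server's actual slot type or the llama.cpp API.

#include <cstdio>
#include <vector>

// Illustrative stand-in; not the server's real slot struct.
struct slot_sketch {
    int  id;
    bool processing;
    bool speculate; // i.e. the slot has a draft model attached
};

int main() {
    std::vector<slot_sketch> slots = {
        {0, true,  true},
        {1, true,  false},
        {2, false, true},
        {3, true,  true},
    };

    // current code: each eligible slot runs drafting on its own ctx_dft;
    // the rework would instead collect the eligible sequence ids ...
    std::vector<int> seq_ids;
    for (const auto & slot : slots) {
        if (!slot.processing || !slot.speculate) {
            continue;
        }
        seq_ids.push_back(slot.id);
    }

    // ... and issue one drafting pass on a shared draft context, so the
    // draft model is evaluated once per step rather than once per slot
    std::printf("would draft for %zu sequences in a single batch\n", seq_ids.size());
    return 0;
}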
@@ -4445,8 +4449,10 @@ int main(int argc, char ** argv) {

     // TODO: should we have a separate n_parallel parameter for the server?
     //       https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
-    if (params.n_parallel == 1 && params.kv_unified == false) {
-        LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true\n", __func__);
+    // TODO: this is a common configuration that is suitable for most local use cases
+    //       however, overriding the parameters is a bit confusing - figure out something more intuitive
+    if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) {
+        LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__);

         params.n_parallel = 4;
         params.kv_unified = true;
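
The net effect of this last hunk: the server now auto-bumps n_parallel to 4 and enables the unified KV cache only when the user kept both defaults and no draft model is configured. A minimal sketch of that decision, with simplified stand-in names rather than the real server parameters:

#include <cstdio>

// Illustrative stand-in for the relevant server parameters.
struct server_params_sketch {
    int  n_parallel  = 1;
    bool kv_unified  = false;
    bool speculative = false; // stand-in for params.has_speculative()
};

// mirrors the revised condition: skip the override when a draft model is set
void apply_default_overrides(server_params_sketch & p) {
    if (p.n_parallel == 1 && p.kv_unified == false && !p.speculative) {
        std::printf("setting n_parallel = 4 and kv_unified = true\n");
        p.n_parallel = 4;
        p.kv_unified = true;
    }
}

int main() {
    server_params_sketch with_draft;
    with_draft.speculative = true;
    apply_default_overrides(with_draft); // no override: draft model configured

    server_params_sketch defaults;
    apply_default_overrides(defaults);   // override applies
    return 0;
}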
