@@ -2,10 +2,11 @@
 
 #include "arg.h"
 #include "common.h"
-#include "log.h"
-#include "sampling.h"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "log.h"
+#include "sampling.h"
+#include "speculative.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
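The new speculative.h header declares the common_speculative helpers this diff calls. Paraphrasing just the subset exercised below, with signatures inferred from the call sites rather than copied from the header (see common/speculative.h for the authoritative declarations):

    // Inferred from usage in this diff; treat as approximate.
    struct common_speculative;

    struct common_speculative_params {
        int   n_draft; // max tokens to draft per step
        int   n_reuse; // draft-context cache reuse
        float p_min;   // min draft confidence to keep drafting
    };

    common_speculative * common_speculative_init(llama_context * ctx_dft);
    void                 common_speculative_free(common_speculative * spec);
    bool                 common_speculative_are_compatible(llama_context * ctx_tgt, llama_context * ctx_dft);
    llama_tokens         common_speculative_gen_draft(common_speculative * spec,
                                                      common_speculative_params params,
                                                      const llama_tokens & prompt_tgt,
                                                      llama_token id_last);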
@@ -127,6 +128,12 @@ struct server_slot {
     int id;
     int id_task = -1;
 
+    llama_batch batch_spec;
+
+    llama_context * ctx_dft = nullptr;
+
+    common_speculative * spec = nullptr;
+
     // the index relative to completion multi-task request
     size_t index = 0;
 
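Each slot now carries its own speculative state: batch_spec holds the last verified token plus the drafted tokens for one verification pass, ctx_dft is a per-slot context on the shared draft model (so parallel slots keep independent draft KV caches), and spec is the drafting state. A minimal sketch of the resulting invariant (hypothetical helper, not part of the commit):

    // All three members are populated together in init() below, and only
    // when a draft model was configured at startup.
    static bool slot_can_speculate(const server_slot & slot) {
        return slot.ctx_dft != nullptr && slot.spec != nullptr;
    }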
@@ -591,11 +598,14 @@ struct server_response {
 };
 
 struct server_context {
+    common_params params;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
     std::vector<common_lora_adapter_container> loras;
 
-    common_params params;
+    llama_model * model_dft = nullptr;
+    llama_context_params cparams_dft;
 
     llama_batch batch = {};
 
@@ -628,17 +638,33 @@ struct server_context {
             model = nullptr;
         }
 
+        if (model_dft) {
+            llama_free_model(model_dft);
+            model_dft = nullptr;
+        }
+
         // Clear any sampling context
         for (server_slot & slot : slots) {
             if (slot.smpl != nullptr) {
+                llama_free(slot.ctx_dft);
+                slot.ctx_dft = nullptr;
+
+                common_speculative_free(slot.spec);
+                slot.spec = nullptr;
+
                 common_sampler_free(slot.smpl);
+                slot.smpl = nullptr;
+
+                llama_batch_free(slot.batch_spec);
             }
         }
 
         llama_batch_free(batch);
     }
 
     bool load_model(const common_params & params_) {
+        SRV_INF("loading model '%s'\n", params_.model.c_str());
+
         params = params_;
 
         common_init_result llama_init = common_init_from_params(params);
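The destructor releases the speculative state in the same branch that frees the sampler: draft context first, then the speculator, then the sampler (now nulled after freeing), and finally the per-slot batch. A hypothetical consolidation of the slot-level teardown, relying on the same unconditional-call pattern the destructor itself uses (llama_free is a no-op on nullptr):

    // Sketch only; not part of this commit.
    static void slot_free_speculative(server_slot & slot) {
        llama_free(slot.ctx_dft);
        slot.ctx_dft = nullptr;

        common_speculative_free(slot.spec);
        slot.spec = nullptr;

        llama_batch_free(slot.batch_spec);
    }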
@@ -657,6 +683,40 @@ struct server_context {
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = !llama_add_eos_token(model);
 
+        if (!params.model_draft.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_.model_draft.c_str());
+
+            auto params_dft = params;
+
+            params_dft.model        = params.model_draft;
+            params_dft.n_gpu_layers = params.n_gpu_layers_draft;
+
+            if (params.draft_cpuparams.n_threads > 0) {
+                params_dft.cpuparams.n_threads = params.draft_cpuparams.n_threads;
+            }
+
+            params_dft.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
+
+            common_init_result llama_init_dft = common_init_from_params(params_dft);
+
+            model_dft = llama_init_dft.model;
+
+            if (model_dft == nullptr) {
+                SRV_ERR("failed to load draft model, '%s'\n", params.model_draft.c_str());
+                return false;
+            }
+
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params.model_draft.c_str(), params.model.c_str());
+                return false;
+            }
+
+            cparams_dft = common_context_params_to_llama(params);
+
+            // the context is not needed - we will create one for each slot
+            llama_free(llama_init_dft.context);
+        }
+
         return true;
     }
 
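load_model() now clones the server parameters, swaps in the draft model path, GPU offload count, and thread settings, loads the pair, and gates speculation on compatibility between the two contexts. The freshly created draft context is thrown away because every slot builds its own from cparams_dft. Condensed into a standalone sketch (assumes only the common helpers already shown in this diff; error reporting elided):

    static llama_model * try_load_draft(const common_params & base, llama_context * ctx_tgt) {
        common_params params_dft = base;

        params_dft.model        = base.model_draft;
        params_dft.n_gpu_layers = base.n_gpu_layers_draft;

        common_init_result init_dft = common_init_from_params(params_dft);
        if (init_dft.model == nullptr) {
            return nullptr;
        }

        if (!common_speculative_are_compatible(ctx_tgt, init_dft.context)) {
            llama_free(init_dft.context);
            llama_free_model(init_dft.model);
            return nullptr;
        }

        llama_free(init_dft.context); // each slot creates its own draft context
        return init_dft.model;
    }

Note that cparams_dft is derived from params, not params_dft, so the per-slot draft contexts inherit the target model's context parameters.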
@@ -685,6 +745,22 @@ struct server_context {
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params.n_predict;
 
+            if (model_dft) {
+                slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
+                if (slot.ctx_dft == nullptr) {
+                    SRV_ERR("%s", "failed to create draft context\n");
+                    return;
+                }
+
+                slot.spec = common_speculative_init(slot.ctx_dft);
+                if (slot.spec == nullptr) {
+                    SRV_ERR("%s", "failed to create speculator\n");
+                    return;
+                }
+
+                slot.batch_spec = llama_batch_init(params.n_draft + 1, 0, 1);
+            }
+
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 
             slot.sparams = params.sampling;
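Per slot, init() creates a dedicated draft context and speculator and bails out if either cannot be constructed. The batch sizing is worth spelling out: every verification pass decodes the last verified token plus at most n_draft drafted tokens for a single sequence, hence

    // llama_batch_init(n_tokens, embd, n_seq_max) — the sizing, annotated:
    const int n_draft = 16; // example value; the server uses params.n_draft
    llama_batch batch_spec = llama_batch_init(
        /*n_tokens  =*/ n_draft + 1, // last verified token + up to n_draft drafted tokens
        /*embd      =*/ 0,           // the batch carries token ids, not embeddings
        /*n_seq_max =*/ 1);          // each slot decodes a single sequence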
@@ -2168,38 +2244,108 @@ struct server_context {
                 continue; // continue loop of slots
             }
 
-            completion_token_output result;
-            const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+            llama_token id;
 
-            common_sampler_accept(slot.smpl, id, true);
+            {
+                completion_token_output result;
 
-            slot.n_decoded += 1;
-            if (slot.n_decoded == 1) {
-                slot.t_start_generation = ggml_time_us();
-                slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
-                metrics.on_prompt_eval(slot);
-            }
+                id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
 
-            result.tok = id;
+                common_sampler_accept(slot.smpl, id, true);
 
-            const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+                slot.n_decoded += 1;
+                if (slot.n_decoded == 1) {
+                    slot.t_start_generation = ggml_time_us();
+                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
+                    metrics.on_prompt_eval(slot);
+                }
 
-            for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
-                result.probs.push_back({
-                    cur_p->data[i].id,
-                    i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                });
-            }
+                result.tok = id;
 
-            if (!process_token(result, slot)) {
-                // release slot because of stop condition
-                slot.release();
-                slot.print_timings();
-                send_final_response(slot);
-                metrics.on_prediction(slot);
+                const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+
+                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
+                    result.probs.push_back({
+                        cur_p->data[i].id,
+                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                    });
+                }
+
+                if (!process_token(result, slot)) {
+                    // release slot because of stop condition
+                    slot.release();
+                    slot.print_timings();
+                    send_final_response(slot);
+                    metrics.on_prediction(slot);
+                }
             }
 
             slot.i_batch = -1;
+
+            if (slot.ctx_dft) {
+                struct common_speculative_params params_spec;
+                params_spec.n_draft = params.n_draft;
+                params_spec.n_reuse = 256;
+                params_spec.p_min   = 0.9f;
+
+                llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
+
+                if (draft.size() > params.n_draft_min) {
+                    common_batch_clear(slot.batch_spec);
+                    common_batch_add  (slot.batch_spec, id, slot.n_past++, { slot.id }, true);
+
+                    for (size_t i = 0; i < draft.size(); ++i) {
+                        common_batch_add(slot.batch_spec, draft[i], slot.n_past + i, { slot.id }, true);
+                    }
+
+                    llama_decode(ctx, slot.batch_spec);
+
+                    const auto ids = common_sampler_sample_n(slot.smpl, ctx, draft);
+
+                    slot.n_past += ids.size() - 1;
+
+                    slot.cache_tokens.push_back(id);
+
+                    for (size_t i = 0; i < ids.size(); ++i) {
+                        completion_token_output result;
+
+                        id = ids[i];
+
+                        common_sampler_accept(slot.smpl, id, true);
+
+                        slot.n_decoded += 1;
+                        if (slot.n_decoded == 1) {
+                            slot.t_start_generation = ggml_time_us();
+                            slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
+                            metrics.on_prompt_eval(slot);
+                        }
+
+                        result.tok = id;
+
+                        const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+
+                        for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
+                            result.probs.push_back({
+                                cur_p->data[i].id,
+                                i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                            });
+                        }
+
+                        if (!process_token(result, slot)) {
+                            // release slot because of stop condition
+                            slot.release();
+                            slot.print_timings();
+                            send_final_response(slot);
+                            metrics.on_prediction(slot);
+                            break;
+                        }
+                    }
+
+                    llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+
+                    slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
+                }
+            }
         }
     }
 
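The generation loop now has two phases. The first block is the pre-existing sample/accept/process path, wrapped in a scope so the sampled id survives for the speculative phase. The second block runs only for slots with a draft context: the draft model proposes up to n_draft tokens (n_reuse and p_min are hard-coded draft-side knobs here, roughly "how much draft cache to reuse" and "minimum draft confidence to keep drafting"); if the proposal is longer than n_draft_min, the verified token plus the whole draft are decoded on the target in one batch and the target sampler re-samples every drafted position. The bookkeeping afterwards assumes common_sampler_sample_n() returns the accepted draft prefix plus exactly one corrected token, which a std-only sketch makes explicit (hypothetical helper, not in the commit):

    #include <cstdint>
    #include <vector>

    using llama_token  = int32_t;
    using llama_tokens = std::vector<llama_token>;

    // ids[i] is what the target sampled at the position of draft[i]; the
    // accepted run ends at the first disagreement, and ids[n] is the
    // corrected token that replaces draft[n].
    static size_t count_accepted(const llama_tokens & draft, const llama_tokens & ids) {
        size_t n = 0;
        while (n + 1 < ids.size() && n < draft.size() && ids[n] == draft[n]) {
            n++;
        }
        return n;
    }

Under that reading, slot.n_past advances by one for the verified token plus ids.size() - 1 for the accepted drafts, llama_kv_cache_seq_rm() truncates the rejected tail of the speculative batch from the target KV cache, and cache_tokens receives the verified token plus all but the last sampled id; that final id is carried into the next iteration as the current token.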