Merged
30 commits
de5ecab
server : integrate speculative decoding
g2mt Jul 25, 2025
98f6a48
server: Fix field names
g2mt Jul 25, 2025
80a0579
server: fix include, whitespace
g2mt Jul 25, 2025
5c96a7f
fix compile errors in speculative.cpp
g2mt Jul 25, 2025
99c1ef3
add llama_sampling_sample_and_accept_n to sampling
g2mt Jul 25, 2025
642b70a
finish porting speculative decoding in server
g2mt Jul 25, 2025
422af9e
port functions from common/speculative, common/sampling
g2mt Jul 25, 2025
368c464
remove arg
g2mt Jul 25, 2025
8dbe1d6
fix function names
g2mt Jul 25, 2025
1c77102
init params_dft to none
g2mt Jul 25, 2025
d592478
correct value for n_ctx
g2mt Jul 25, 2025
fbd5dfd
prefix kv cache tensors with model name to avoid conflict
g2mt Jul 25, 2025
d85bc15
fix call arguments
g2mt Jul 25, 2025
3888144
fix spec decoding args
g2mt Jul 25, 2025
1959998
correct slot.id
g2mt Jul 25, 2025
6bcd795
use n_max
g2mt Jul 25, 2025
694af02
port the rest of sampling funcs
g2mt Jul 25, 2025
4a41cfd
fix func arguments
g2mt Jul 25, 2025
e938d9f
slot.id starts at 1?
g2mt Jul 25, 2025
07e7cb3
Merge branch 'main' into speculative-port
g2mt Jul 27, 2025
7f5e298
Revert "prefix kv cache tensors with model name to avoid conflict"
g2mt Jul 27, 2025
c81e7b1
Merge remote-tracking branch 'fork/speculative-port' into speculative…
g2mt Jul 27, 2025
946fa65
disable draft logging
g2mt Aug 3, 2025
909a80d
disable logging in speculative.cpp
g2mt Aug 3, 2025
8ff1e03
add more draft model parameters
g2mt Aug 7, 2025
a694d7d
fix
g2mt Aug 7, 2025
8effd8c
pass flash_attn
g2mt Aug 7, 2025
ad8d26f
add speculative params for parity
g2mt Aug 7, 2025
ed2a40a
set speculative params in launch_slot_with_task instead
g2mt Aug 7, 2025
a2810b4
Merge branch 'main' into speculative-port
g2mt Aug 15, 2025
1 change: 1 addition & 0 deletions common/CMakeLists.txt
@@ -74,6 +74,7 @@ add_library(${TARGET} STATIC
train.cpp
ngram-cache.h
ngram-cache.cpp
speculative.cpp
)

if (BUILD_SHARED_LIBS)
22 changes: 19 additions & 3 deletions common/common.cpp
@@ -486,6 +486,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.n_ctx = std::stoi(argv[i]);
return true;
}
if (arg == "-cd" || arg == "--ctx-size-draft") {
CHECK_ARG
params.n_ctx_draft = std::stoi(argv[i]);
return true;
}
if (arg == "--grp-attn-n" || arg == "-gan") {
CHECK_ARG
params.grp_attn_n = std::stoi(argv[i]);
@@ -706,7 +711,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
}
return true;
}
}
if (arg == "--cfg-negative-prompt") {
CHECK_ARG
sparams.cfg_negative_prompt = argv[i];
@@ -915,6 +920,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cache_type_v = argv[++i];
return true;
}
if (arg == "-ctkd" || arg == "--cache-type-k-draft") {
params.cache_type_k_draft = argv[++i];
return true;
}
if (arg == "-ctvd" || arg == "--cache-type-v-draft") {
params.cache_type_v_draft = argv[++i];
return true;
}
if (arg == "-mli" || arg == "--multiline-input") {
params.multiline_input = true;
return true;
@@ -1052,7 +1065,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
size_t pos = 0;
while ((pos = servers.find(",")) != std::string::npos) {
std::string server = servers.substr(0, pos);
ggml_backend_rpc_buffer_type(server.c_str());
ggml_backend_rpc_buffer_type(server.c_str());
servers.erase(0, pos + 1);
}
ggml_backend_rpc_buffer_type(servers.c_str());
@@ -1648,6 +1661,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"path to dynamic lookup cache to use for lookup decoding (updated by generation)" });

options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
options.push_back({ "*", "-cd, --ctx-size-draft N", "size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.n_ctx_draft });
options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
@@ -1758,6 +1772,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
options.push_back({ "*", "-ctkd, --cache-type-k-draft TYPE", "KV cache data type for K for the draft model" });
options.push_back({ "*", "-ctvd, --cache-type-v-draft TYPE", "KV cache data type for V for the draft model" });

options.push_back({ "perplexity" });
options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
@@ -1981,7 +1997,7 @@ std::string string_join(const std::vector<std::string> & strs, const std::string
if (strs.empty()) {
return "";
}

std::ostringstream oss;
for (size_t i = 0; i < strs.size(); ++i) {
if (i > 0) {
3 changes: 3 additions & 0 deletions common/common.h
@@ -83,6 +83,7 @@ struct gpt_params {
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_ctx_draft = 0; // context size for draft model
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -207,6 +208,8 @@ struct gpt_params {

std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V
std::string cache_type_k_draft = ""; // KV cache data type for K for the draft model
std::string cache_type_v_draft = ""; // KV cache data type for V for the draft model

// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
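The draft-specific fields above default to 0 and empty strings, which suggests the server falls back to the main-model settings when they are not given. Below is a minimal sketch of that fallback; the helper name draft_ctx_params_from and its exact placement in the server are assumptions, not code from this PR.

#include <string>

#include "common.h" // gpt_params
#include "llama.h"  // llama_context_params

// Sketch only: derive the draft model's context parameters from gpt_params,
// reusing the target model's values when the draft-specific ones are unset.
static llama_context_params draft_ctx_params_from(const gpt_params & params) {
    llama_context_params cparams = llama_context_default_params();

    // n_ctx_draft == 0 means "not set" -> reuse the target model's context size
    cparams.n_ctx = params.n_ctx_draft > 0 ? params.n_ctx_draft : params.n_ctx;

    // empty cache type means "not set" -> reuse the target model's K/V cache types;
    // the string-to-ggml_type conversion is handled elsewhere in common.cpp
    const std::string ctk = params.cache_type_k_draft.empty() ? params.cache_type_k : params.cache_type_k_draft;
    const std::string ctv = params.cache_type_v_draft.empty() ? params.cache_type_v : params.cache_type_v_draft;
    (void) ctk;
    (void) ctv;

    return cparams;
}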
48 changes: 47 additions & 1 deletion common/sampling.cpp
@@ -442,7 +442,9 @@ static llama_token_data_array llama_sampling_prepare_impl(
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}

llama_token_data_array cur_p = { cur.data(), cur.size(), false };
ctx_sampling->cur_p = { cur.data(), cur.size(), false };

llama_token_data_array & cur_p = ctx_sampling->cur_p;

// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
@@ -506,3 +508,47 @@ void llama_sampling_accept(
llama_sampler_dry_accept(ctx_sampling->smpl, id);
}
}

llama_token_data_array * llama_sampling_get_candidates(struct llama_sampling_context * ctx_sampling) {
return &ctx_sampling->cur_p;
}

std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<llama_token> & draft) {
std::vector<int> idxs(draft.size() + 1);
for (size_t i = 0; i < idxs.size(); ++i) {
idxs[i] = i;
}

return llama_sampling_sample_and_accept_n(gsmpl, ctx, idxs, draft);
}

std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const std::vector<llama_token> & draft) {
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

std::vector<llama_token> result;
result.reserve(idxs.size());

size_t i = 0;
for (; i < draft.size(); i++) {
const llama_token id = llama_sampling_sample(gsmpl, ctx, nullptr, idxs[i]);

llama_sampling_accept(gsmpl, ctx, id, true);

result.push_back(id);

if (draft[i] != id) {
break;
}
}

if (i == draft.size()) {
const llama_token id = llama_sampling_sample(gsmpl, ctx, nullptr, idxs[i]);

llama_sampling_accept(gsmpl, ctx, id, true);

result.push_back(id);
}

return result;
}
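For reference, this helper is what a speculative loop uses to verify a draft: the draft tokens are assumed to have already been decoded into the target context at consecutive batch positions, so index i holds the logits checked against draft[i], and sampling stops at the first mismatch. A rough usage sketch under those assumptions follows; the wrapper function is illustrative, not the PR's server code.

#include <vector>

#include "llama.h"
#include "sampling.h"

// Sketch only: verify a drafted sequence against the target model.
// Assumes the draft tokens were already decoded into ctx so that batch
// index i holds the logits used to check draft[i].
static std::vector<llama_token> verify_draft(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx,
        const std::vector<llama_token> & draft) {
    // returns the accepted prefix of the draft plus one token sampled after
    // the first divergence (or after the full draft if everything matched)
    std::vector<llama_token> ids = llama_sampling_sample_and_accept_n(ctx_sampling, ctx, draft);

    // ids.size() - 1 of the draft tokens were accepted by the target model
    return ids;
}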

10 changes: 10 additions & 0 deletions common/sampling.h
@@ -101,6 +101,8 @@ struct llama_sampling_context {

size_t n_valid; // Number of correct top tokens with correct probabilities.

llama_token_data_array cur_p; // current candidates

std::mt19937 rng;
};

@@ -176,3 +178,11 @@ void llama_sampling_accept(
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar);

// access the internal list of current candidate tokens
llama_token_data_array * llama_sampling_get_candidates(struct llama_sampling_context * ctx_sampling);

// sample and accept tokens along the draft; returns at least 1 token, up to draft.size() + 1
std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<llama_token> & draft);

std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const std::vector<llama_token> & draft);
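llama_sampling_get_candidates exposes the cur_p array that llama_sampling_prepare_impl now stores on the context, which makes it possible to read back token probabilities after a sampling call (for example, to report per-token probabilities from the server). A small sketch, assuming a softmax has been applied during sampling so the p fields are populated:

#include "llama.h"
#include "sampling.h"

// Sketch only: look up the probability the last sampling step assigned to a token.
// Meaningful only after llama_sampling_sample() has run and applied a softmax.
static float token_probability(struct llama_sampling_context * ctx_sampling, llama_token id) {
    const llama_token_data_array * cur_p = llama_sampling_get_candidates(ctx_sampling);

    for (size_t i = 0; i < cur_p->size; ++i) {
        if (cur_p->data[i].id == id) {
            return cur_p->data[i].p; // probability from the last softmax
        }
    }

    return 0.0f; // token not among the current candidates
}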