add --spec-replace flag

g2mt · g2mt · commit b9fdf203f26a · 2025-06-28T18:30:16.000Z
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -3217,6 +3217,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"--spec-replace"}, "TARGET", "DRAFT",
+        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+        [](common_params & params, const std::string & tgt, const std::string & dft) {
+            params.speculative.replacements.push_back({ tgt, dft });
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
diff --git a/common/common.h b/common/common.h
@@ -198,6 +198,7 @@ struct common_params_speculative {
     int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     float   p_split      =  0.1f; // speculative decoding split probability
     float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)
+    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -7,6 +7,7 @@
 
 #include <cstring>
 #include <algorithm>
+#include <map>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -19,6 +20,7 @@ struct common_speculative {
     llama_batch batch;
     llama_tokens prompt_dft;
     bool vocab_dft_compatible = true; // whether retokenization is needed
+    std::map<std::string, std::string> tgt_dft_replacements = {};
 };
 
 struct common_speculative * common_speculative_init(
@@ -144,6 +146,41 @@ bool common_speculative_are_compatible(
     return true;
 }
 
+// Map the target-side string `source` to the draft-side string `dest`; a repeated `source` overwrites the earlier entry.
+void common_speculative_add_replacement_tgt_dft(
+        struct common_speculative * spec, const char *source, const char *dest) {
+    spec->tgt_dft_replacements[std::string(source)] = std::string(dest);
+}
+
+// Rewrite target-model text for the draft model: each occurrence of a registered target string becomes its draft string.
+static std::string replace_to_dft(struct common_speculative * spec, const std::string & input) {
+    std::string result = input;
+    for (const auto & pair : spec->tgt_dft_replacements) {
+        // an empty target string matches at every position and the loop below would never end, so skip the pair
+        size_t pos = pair.first.empty() ? std::string::npos : result.find(pair.first);
+        while (pos != std::string::npos) {
+            result.replace(pos, pair.first.length(), pair.second);
+            pos = result.find(pair.first, pos + pair.second.length()); // continue after the inserted text
+        }
+    }
+    return result;
+}
+
+// Rewrite draft-model text for the target model: each occurrence of a registered draft string becomes its target string.
+static std::string replace_to_tgt(struct common_speculative * spec, const std::string & input) {
+    std::string result = input;
+    for (const auto & pair : spec->tgt_dft_replacements) {
+        // an empty draft string matches at every position and the loop below would never end, so skip the pair
+        size_t pos = pair.second.empty() ? std::string::npos : result.find(pair.second);
+        while (pos != std::string::npos) {
+            result.replace(pos, pair.second.length(), pair.first);
+            pos = result.find(pair.second, pos + pair.first.length()); // continue after the inserted text
+        }
+    }
+    return result;
+}
+
+
 llama_tokens common_speculative_gen_draft(
         struct common_speculative * spec,
         struct common_speculative_params params,
@@ -168,10 +205,11 @@ llama_tokens common_speculative_gen_draft(
 
         std::string text;
         text = common_detokenize(ctx_tgt, prompt_tgt_main_model, false);
+        text = replace_to_dft(spec, text);
         LOG_DBG("main->draft detokenized string: '%s'\n", text.c_str());
         prompt_tgt_draft_model = common_tokenize(ctx_dft, text, false, false);
-
         text.clear();
+
         const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
         int32_t n_chars;
         n_chars = llama_detokenize(vocab_tgt, &id_last, 1, &text[0], text.size(), false, false);
@@ -180,6 +218,7 @@ llama_tokens common_speculative_gen_draft(
             n_chars = llama_detokenize(vocab_tgt, &id_last, 1, &text[0], text.size(), false, false);
         }
         text.resize(n_chars);
+        text = replace_to_dft(spec, text);
         LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
         id_last = common_tokenize(ctx_dft, text, false, false)[0];
     }
@@ -312,6 +351,7 @@ llama_tokens common_speculative_gen_draft(
 
     if (!spec->vocab_dft_compatible) {
         std::string detokenized = common_detokenize(ctx_dft, result, false);
+        detokenized = replace_to_tgt(spec, detokenized);
         LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
         result = common_tokenize(ctx_tgt, detokenized, false, false);
     }
diff --git a/common/speculative.h b/common/speculative.h
@@ -23,6 +23,10 @@ bool common_speculative_are_compatible(
         const struct llama_context * ctx_tgt,
         const struct llama_context * ctx_dft);
 
+// register a string replacement: `source` (target-model text) is mapped to `dest` (draft-model text)
+void common_speculative_add_replacement_tgt_dft(
+        struct common_speculative * spec, const char *source, const char *dest);
+
 // sample up to n_draft tokens and add them to the batch using the draft model
 llama_tokens common_speculative_gen_draft(
                struct common_speculative * spec,
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
@@ -127,6 +127,9 @@ int main(int argc, char ** argv) {
     params_spec.p_min   = p_min;
 
     struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);
+    for (auto &pair : params.speculative.replacements) {
+        common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str());
+    }
 
     llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -2079,6 +2079,9 @@ struct server_context {
                     SRV_ERR("%s", "failed to create speculator\n");
                     return;
                 }
+                for (auto &pair : params_base.speculative.replacements) {
+                    common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
+                }
             }
 
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);

Original file line number	Diff line number	Diff line change
`@@ -2079,6 +2079,9 @@ struct server_context {`
`2079`	`2079`	`SRV_ERR("%s", "failed to create speculator\n");`
`2080`	`2080`	`return;`
`2081`	`2081`	`}`
	`2082`	`+ for (auto &pair : params_base.speculative.replacements) {`
	`2083`	`+ common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());`
	`2084`	`+ }`
`2082`	`2085`	`}`
`2083`	`2086`
`2084`	`2087`	`SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);`