
Commit d3fb498

implement speculative decoding translation in examples/speculative.cpp (wip)
1 parent 829b762 commit d3fb498

1 file changed: examples/speculative/speculative.cpp (80 additions, 10 deletions)
@@ -1,3 +1,4 @@
+#include "speculative.h"
 #include "arg.h"
 #include "common.h"
 #include "sampling.h"
@@ -102,6 +103,35 @@ int main(int argc, char ** argv) {
     auto * mem_tgt = llama_get_memory(ctx_tgt);
     auto * mem_dft = llama_get_memory(ctx_dft);
 
+    // Check if vocabularies are compatible
+    bool vocab_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft);
+
+    // Check vocabulary size difference
+    if (vocab_compatible) {
+        const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
+        const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
+        const int vocab_diff = abs(n_vocab_tgt - n_vocab_dft);
+
+        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
+            vocab_compatible = false;
+            LOG_DBG("vocab size difference too large: %d vs %d\n", n_vocab_tgt, n_vocab_dft);
+        } else {
+            // Check token consistency for a range of tokens
+            for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
+                if (strcmp(llama_vocab_get_text(vocab_tgt, i), llama_vocab_get_text(vocab_dft, i)) != 0) {
+                    vocab_compatible = false;
+                    LOG_DBG("token %d differs between models\n", i);
+                    break;
+                }
+            }
+        }
+    }
+
+    if (!vocab_compatible) {
+        LOG_INF("The draft model '%s' is not compatible with the target model '%s'. Tokens will be translated between the draft and target models.\n",
+                params.speculative.model.path.c_str(), params.model.path.c_str());
+    }
+
     // Tokenize the prompt
     std::vector<llama_token> inp;
     inp = common_tokenize(ctx_tgt, params.prompt, true, true);
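Note: common_speculative_are_compatible(), the SPEC_VOCAB_MAX_SIZE_DIFFERENCE and SPEC_VOCAB_CHECK_START_TOKEN_ID constants, and the vocab_tgt/vocab_dft handles are all defined outside this diff. A minimal sketch of the same check as a standalone helper, with assumed values of 100 and 5 standing in for the two constants:

    #include <algorithm> // std::min
    #include <cstdlib>   // abs
    #include <cstring>   // strcmp

    // Sketch only: mirrors the check in the hunk above with assumed thresholds.
    static bool vocabs_look_compatible(const llama_vocab * vocab_tgt, const llama_vocab * vocab_dft) {
        const int max_size_difference  = 100; // assumed SPEC_VOCAB_MAX_SIZE_DIFFERENCE
        const int check_start_token_id = 5;   // assumed SPEC_VOCAB_CHECK_START_TOKEN_ID

        const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
        const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);

        if (abs(n_vocab_tgt - n_vocab_dft) > max_size_difference) {
            return false; // vocab sizes diverge too much
        }

        // the lowest token ids are typically special/control tokens, so the
        // text comparison starts past them
        for (int i = check_start_token_id; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
            if (strcmp(llama_vocab_get_text(vocab_tgt, i), llama_vocab_get_text(vocab_dft, i)) != 0) {
                return false; // token text differs between the models
            }
        }

        return true;
    }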
@@ -127,7 +157,16 @@ int main(int argc, char ** argv) {
     // eval the prompt with both models
     llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1));
     llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1));
-    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input));
+
+    // Handle prompt tokens for draft model
+    if (vocab_compatible) {
+        llama_decode(ctx_dft, llama_batch_get_one(inp.data(), n_input));
+    } else {
+        // Convert prompt tokens from target to draft model
+        std::string prompt_text = common_detokenize(ctx_tgt, inp, true);
+        std::vector<llama_token> inp_dft = common_tokenize(ctx_dft, prompt_text, true, true);
+        llama_decode(ctx_dft, llama_batch_get_one(inp_dft.data(), inp_dft.size()));
+    }
 
     const auto t_enc_end = ggml_time_us();

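The prompt path above round-trips the whole token sequence through text rather than copying ids, since the two models no longer share a vocabulary. A hypothetical helper (not part of this commit) capturing that round trip, using only calls already present in the diff and the surrounding file's includes:

    // Sketch only: re-tokenize a token sequence from one model's vocabulary
    // into another's by detokenizing to text and tokenizing again. The result
    // may have a different length than the input, which is why the draft
    // context above is fed inp_dft.size() tokens rather than n_input.
    static std::vector<llama_token> translate_tokens(
            llama_context * ctx_src,
            llama_context * ctx_dst,
            const std::vector<llama_token> & tokens) {
        const std::string text = common_detokenize(ctx_src, tokens, true);
        return common_tokenize(ctx_dst, text, true, true);
    }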
@@ -224,19 +263,37 @@ int main(int argc, char ** argv) {
 
             LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
             float r = u_dist(rng);
-            llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
 
             //GGML_ASSERT(dist_tgt.size <= dist_dft.size);
+            llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
 
             // acquire the token probabilities assigned by the draft and target models
+            llama_token token_tgt = drafts[s].tokens[i_dft];
+
+            // If vocabularies are not compatible, we need to convert the token
+            llama_token token_dft = token_tgt;
+            if (!vocab_compatible) {
+                // Convert from target token to draft token by detokenizing and retokenizing
+                std::string token_text = common_token_to_piece(ctx_tgt, token_tgt);
+                std::vector<llama_token> tokens_dft = common_tokenize(ctx_dft, token_text, false, true);
+                if (!tokens_dft.empty()) {
+                    token_dft = tokens_dft[0];
+                } else {
+                    // If conversion fails, skip this token
+                    drafts[s].active = false;
+                    active_seqs.erase(s);
+                    continue;
+                }
+            }
+
             for (size_t i = 0; i < dist_tgt.size; i++) {
-                if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
+                if (dist_tgt.data[i].id == token_tgt) {
                     p_tgt = dist_tgt.data[i].p;
                     break;
                 }
             }
             for (size_t i = 0; i < dist_dft.size; i++) {
-                if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
+                if (dist_dft.data[i].id == token_dft) {
                     p_dft = dist_dft.data[i].p;
                     break;
                 }
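The detokenize/re-tokenize step above recurs per token, and appears again in the drafting loop further down. A hypothetical helper (not part of this commit) that isolates it:

    // Sketch only: map a single token id between vocabularies by rendering it
    // to text with the source context and re-tokenizing with the destination
    // context. Returns LLAMA_TOKEN_NULL when the piece does not survive the
    // round trip; the caller above treats that as a failed translation and
    // drops the sequence.
    static llama_token translate_token(llama_context * ctx_src, llama_context * ctx_dst, llama_token id) {
        const std::string piece = common_token_to_piece(ctx_src, id);
        const std::vector<llama_token> ids = common_tokenize(ctx_dst, piece, false, true);
        return ids.empty() ? LLAMA_TOKEN_NULL : ids[0];
    }

Keeping only the first re-tokenized id is lossy whenever one piece maps to several tokens in the other vocabulary; the "(wip)" in the commit message suggests this approximation is a known limitation.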
@@ -501,25 +558,37 @@ int main(int argc, char ** argv) {
 
                 // add drafted token for each sequence
                 for (int is = 0; is < (int) sa.size(); ++is) {
-                    const llama_token id = cur_p->data[is].id;
-
+                    const llama_token id_dft = cur_p->data[is].id;
                     const int s = sa[is];
 
-                    common_sampler_accept(drafts[s].smpl, id, true);
+                    common_sampler_accept(drafts[s].smpl, id_dft, true);
 
-                    drafts[s].tokens.push_back(id);
+                    // Convert draft token to target token if vocabularies are not compatible
+                    llama_token id_tgt = id_dft;
+                    if (!vocab_compatible) {
+                        std::string token_text = common_token_to_piece(ctx_dft, id_dft);
+                        std::vector<llama_token> tokens_tgt = common_tokenize(ctx_tgt, token_text, false, true);
+                        if (!tokens_tgt.empty()) {
+                            id_tgt = tokens_tgt[0];
+                        } else {
+                            // If conversion fails, skip this token
+                            continue;
+                        }
+                    }
+
+                    drafts[s].tokens.push_back(id_dft);
                     // save cur_p.data into drafts[s].dists
                     drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});
 
                     // add unique drafted tokens to the target batch
                     drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
 
-                    common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
+                    common_batch_add(batch_tgt, id_tgt, n_past_tgt + i + 1, { s }, true);
 
                     // add the token to the batch for batched decoding with the draft model
                     drafts[s].i_batch_dft = batch_dft.n_tokens;
 
-                    common_batch_add(batch_dft, id, n_past_cur, { s }, true);
+                    common_batch_add(batch_dft, id_dft, n_past_cur, { s }, true);
 
                     if (batch_tgt.n_tokens > n_draft) {
                         drafts[s].drafting = false;
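With a helper like translate_token above, the conversion block in this hunk reduces to a single call plus the empty-result check. Note the bookkeeping: drafts[s].tokens and batch_dft keep the draft-vocabulary id (id_dft), while batch_tgt receives the translated target-vocabulary id (id_tgt), so each context is only ever decoded with ids from its own vocabulary.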
@@ -588,6 +657,7 @@ int main(int argc, char ** argv) {
     LOG_INF("target:\n\n");
     common_perf_print(ctx_tgt, smpl);
 
+
     common_sampler_free(smpl);
     for (int s = 0; s < n_seq_dft; ++s) {
         common_sampler_free(drafts[s].smpl);
