implement tokenizer

mmoskal · mmoskal · commit 3fb701d5ed0c · 2024-11-07T11:00:23.000-08:00
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
@@ -2350,14 +2350,16 @@ struct llama_sampler_llg {
     const struct llama_model * model;
     std::string grammar_kind;
     std::string grammar_data;
+    LlgTokenizer *tokenizer;
     LlgConstraint *grammar;
     LlgMaskResult llg_res;
     bool has_llg_res;
 };
 
-static LlgConstraint *llama_sampler_llg_new(const char * grammar_kind, const char * grammar_data) {
+static LlgConstraint *llama_sampler_llg_new(LlgTokenizer *tokenizer,
+        const char * grammar_kind, const char * grammar_data) {
     LlgConstraintInit cinit;
-    llg_constraint_init_set_defaults(&cinit, nullptr);
+    llg_constraint_init_set_defaults(&cinit, tokenizer);
     auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
     if (llg_get_error(c)) {
         LLAMA_LOG_ERROR("llg error: %s\n", llg_get_error(c));
@@ -2418,7 +2420,7 @@ static void llama_sampler_llg_reset(struct llama_sampler * smpl) {
         return;
     }
 
-    auto * grammar_new = llama_sampler_llg_new(ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
+    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
     llg_free_constraint(ctx->grammar);
     ctx->grammar = grammar_new;
     ctx->has_llg_res = false;
@@ -2437,6 +2439,7 @@ static struct llama_sampler * llama_sampler_llg_clone(const struct llama_sampler
             result_ctx->grammar_kind = ctx->grammar_kind;
             result_ctx->grammar_data = ctx->grammar_data;
             result_ctx->grammar = llg_clone_constraint(ctx->grammar);
+            result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
         }
     }
 
@@ -2448,6 +2451,7 @@ static void llama_sampler_llg_free(struct llama_sampler * smpl) {
 
     if (ctx->grammar) {
         llg_free_constraint(ctx->grammar);
+        llg_free_tokenizer(ctx->tokenizer);
     }
 
     delete ctx;
@@ -2462,16 +2466,114 @@ static struct llama_sampler_i llama_sampler_llg_i = {
     /* .free   = */ llama_sampler_llg_free,
 };
 
+
+// Tokenization callback installed as LlgTokenizerInit::tokenize_fn.
+//
+//   user_data         - opaque pointer; treated here as the `const llama_model *`
+//                       that was stored in `tokenize_user_data`
+//   bytes / bytes_len - input bytes to tokenize
+//   output_tokens     - destination buffer for token ids
+//   output_tokens_len - capacity of `output_tokens`, in tokens
+//
+// Returns the value reported by llama_tokenize; a negative value is returned
+// as its magnitude (presumably the number of tokens that would be needed when
+// the buffer is too small - confirm against llama.h).
+static size_t llama_sampler_llg_tokenize_fn(const void *user_data,
+                                const uint8_t *bytes,
+                                size_t bytes_len,
+                                uint32_t *output_tokens,
+                                size_t output_tokens_len)
+{
+    const auto *mdl  = (const struct llama_model *)user_data;
+    const char *text = (const char *) bytes;
+    auto *out_ids    = (int32_t*)output_tokens;
+
+    // NOTE(review): the trailing flags look like add_special=false,
+    // parse_special=true - verify against the llama_tokenize declaration.
+    int n_tokens = llama_tokenize(mdl, text, bytes_len, out_ids, output_tokens_len, false, true);
+
+    return n_tokens < 0 ? -n_tokens : n_tokens;
+}
+
+// Create an llguidance tokenizer for `model`, using a one-entry static cache.
+//
+// The tokenizer is built by detokenizing every token id in the vocabulary into
+// one contiguous byte buffer (plus a parallel array of per-token lengths) and
+// passing those to llg_new_tokenizer. The most recently built tokenizer is
+// kept in function-local statics keyed by the model pointer.
+//
+// Returns a handle that the caller owns and must release with
+// llg_free_tokenizer, or nullptr if llg_new_tokenizer fails (the error is
+// logged). The returned handle is always a clone of the cached one, so the
+// caller freeing it never invalidates the cache.
+//
+// NOTE: the static cache is read and written without any locking here.
+static LlgTokenizer *llama_sampler_llg_new_tokenizer(const struct llama_model * model) {
+    // TODO store the tokenizer in the model somehow
+    static const struct llama_model *model_cache;
+    static LlgTokenizer *tokenizer_cache;
+
+    // only a populated cache can be cloned
+    if (tokenizer_cache != nullptr && model_cache == model) {
+        return llg_clone_tokenizer(tokenizer_cache);
+    }
+
+    // prefer the end-of-turn token; fall back to end-of-sequence if there is none
+    auto tok_eos = llama_token_eot(model);
+    if (tok_eos == LLAMA_TOKEN_NULL)
+        tok_eos = llama_token_eos(model);
+
+    size_t vocab_size = llama_n_vocab(model);
+
+    auto token_lens = new uint32_t[vocab_size];
+    // we typically have ~7 bytes per token; let's go on the safe side here
+    auto token_bytes_size = vocab_size * 16 + 1024 * 1024;
+    auto token_bytes = new uint8_t[token_bytes_size];
+
+    size_t offset = 0;
+    for (size_t i = 0; i < vocab_size; i++) {
+        // every token is given up to max_token bytes of room; abort rather
+        // than write past the end of token_bytes
+        size_t max_token = 1024;
+        if (token_bytes_size - offset < max_token) {
+            GGML_ABORT("token_bytes buffer too small\n");
+        }
+
+        llama_token token = i;
+        auto dp = (char *) token_bytes + offset;
+        auto size = llama_detokenize(model, &token, 1, dp, max_token, false, false);
+        if (size < 0) {
+            GGML_ABORT("llama_detokenize failed\n");
+        }
+        if (size == 0) {
+            // nothing was produced; retry with the last flag set (presumably
+            // "render special tokens" - confirm against llama.h), leaving one
+            // byte free at the front for the marker
+            size = llama_detokenize(model, &token, 1, dp + 1, max_token - 1, false, true);
+            if (size < 0) {
+                GGML_ABORT("llama_detokenize failed\n");
+            }
+            if (size != 0) {
+                *dp = '\xff'; // special token prefix marker
+                size += 1;
+            }
+        }
+
+        token_lens[i] = size;
+        offset += size;
+    }
+
+    LlgTokenizerInit tinit = {
+        /* .vocab_size                         = */ (uint32_t)vocab_size,
+        /* .tok_eos                            = */ (uint32_t)tok_eos,
+        /* .token_lens                         = */ token_lens,
+        /* .token_bytes                        = */ token_bytes,
+        /* .tokenizer_json                     = */ nullptr,
+        /* .tokenize_assumes_string            = */ false,
+        /* .tokenize_fn                        = */ llama_sampler_llg_tokenize_fn,
+        /* .use_approximate_greedy_tokenize_fn = */ false,
+        /* .tokenize_user_data                 = */ model,
+    };
+
+    char error_buffer[1024];
+    LlgTokenizer *tokenizer = llg_new_tokenizer(&tinit, error_buffer, sizeof(error_buffer));
+
+    // the scratch buffers are released right after the llg_new_tokenizer call
+    delete[] token_bytes;
+    delete[] token_lens;
+
+    if (tokenizer == nullptr) {
+        LLAMA_LOG_ERROR("llg tokenizer error: %s\n", error_buffer);
+        return tokenizer;
+    }
+
+    if (tokenizer_cache) {
+        llg_free_tokenizer(tokenizer_cache);
+    }
+    model_cache = model;
+    tokenizer_cache = tokenizer;
+
+    // The cache keeps `tokenizer`; the caller gets its own clone. Returning
+    // the cached handle itself would let the caller's llg_free_tokenizer
+    // leave the cache dangling (use-after-free on the next cache hit, double
+    // free on the next cache replacement).
+    return llg_clone_tokenizer(tokenizer_cache);
+}
+
 struct llama_sampler * llama_sampler_init_llg(const struct llama_model * model, 
         const char * grammar_kind, const char * grammar_data) {
     auto * ctx = new llama_sampler_llg;
 
     if (grammar_kind != nullptr && grammar_kind[0] != '\0') {
+        auto tokenizer = llama_sampler_llg_new_tokenizer(model);
         *ctx = {
             /* .model        = */ model,
             /* .grammar_kind = */ grammar_kind,
             /* .grammar_data = */ grammar_data,
-            /* .grammar      = */ llama_sampler_llg_new(grammar_kind, grammar_data),
+            /* .tokenizer    = */ tokenizer,
+            /* .grammar      = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
             /* .llg_res      = */ {},
             /* .has_llg_res  = */ false,
         };
@@ -2480,6 +2582,7 @@ struct llama_sampler * llama_sampler_init_llg(const struct llama_model * model,
             /* .model        = */ model,
             /* .grammar_kind = */ {},
             /* .grammar_data = */ {},
+            /* .tokenizer    = */ nullptr,
             /* .grammar      = */ nullptr,
             /* .llg_res      = */ {},
             /* .has_llg_res  = */ false,