Commit 304c815

DRY: Using vocab instead of model
1 parent 4dc9fc3 · commit 304c815


7 files changed: +50 −58 lines

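To illustrate the entry point touched by this commit, here is a minimal caller-side sketch. It is an illustrative assumption, not part of the diff: the model handle, sampler parameters, and breaker strings below are placeholders, and only the signature of llama_sampler_init_dry matches the code changed here.

    // Hypothetical caller (model is assumed to be an already-loaded llama_model).
    // The public API still takes the model; internally it forwards model->vocab
    // to llama_sampler_init_dry_impl, as introduced by this commit.
    const char * seq_breakers[] = { "\n", ":" };
    struct llama_sampler * smpl = llama_sampler_init_dry(
        model,
        /* context_size       = */ 4096,
        /* dry_multiplier     = */ 0.8f,
        /* dry_base           = */ 1.75f,
        /* dry_allowed_length = */ 2,
        /* dry_penalty_last_n = */ -1,
        seq_breakers,
        sizeof(seq_breakers) / sizeof(seq_breakers[0]));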

src/llama-impl.h

Lines changed: 0 additions & 26 deletions
@@ -70,32 +70,6 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
         struct llama_context * ctx
 );
 
-// exposing wrapper function that takes "model" instead of "vocab", to be used internally
-std::vector<llama_token> llama_tokenize_internal(
-        const struct llama_model * model,
-        const std::string & raw_text,
-        bool add_special = false,
-        bool parse_special = true);
-
-static std::string llama_detokenize(const struct llama_model * model, const std::vector<llama_token> & tokens, bool special) {
-    if (model == nullptr) { // model is passed as nullptr in test-sampling.cpp
-        return "";
-    }
-    std::string text;
-    text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(model, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
-    if (n_chars < 0) {
-        text.resize(-n_chars);
-        n_chars = llama_detokenize(model, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
-        GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
-    }
-
-    text.resize(n_chars);
-
-    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return text;
-}
-
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 template<typename T>
 struct ring_buffer {

src/llama-sampling.cpp

Lines changed: 7 additions & 12 deletions
@@ -1663,8 +1663,6 @@ struct llama_sampler * llama_sampler_init_penalties(
 // DRY
 
 struct llama_sampler_dry {
-    const llama_model * model;
-
     int32_t total_context_size;
 
     const float dry_multiplier;
@@ -1679,10 +1677,9 @@ struct llama_sampler_dry {
 };
 
 // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
-static void get_overlapping_token_sequences(const struct llama_model * model, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
-    const int n_vocab = llama_n_vocab(model);
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        std::string word = llama_detokenize(model, {token_id}, true);
+static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
+    for (llama_token token_id = 0; token_id < (int)vocab.n_vocab; token_id++) {
+        std::string word = llama_detokenize(vocab, {token_id}, true);
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
@@ -1698,7 +1695,7 @@ static void get_overlapping_token_sequences(const struct llama_model * model, co
             }
         }
         if (match) {
-            std::vector<llama_token> tokenization = llama_tokenize_internal(model, str.substr(i), false, false);
+            std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
             if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
                 tokenization.resize(max_tail_len);
             }
@@ -1951,8 +1948,7 @@ static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (llama_sampler_dry *) smpl->ctx;
 
-    auto * result = llama_sampler_init_dry(ctx->model, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
-
+    auto * result = llama_sampler_init_dry(nullptr, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
     // Copy the state, including the processed breakers
     {
         auto * result_ctx = (llama_sampler_dry *) result->ctx;
@@ -1978,7 +1974,7 @@ static struct llama_sampler_i llama_sampler_dry_i = {
     /* .free = */ llama_sampler_dry_free,
 };
 
-struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
     if (dry_multiplier < 0 || dry_base <= 0 || dry_allowed_length < 0) {
         return nullptr;
     }
@@ -2008,14 +2004,13 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model,
             sequence_break.resize(MAX_CHAR_LEN);
         }
 
-        get_overlapping_token_sequences(model, sequence_break, processed_breakers, MAX_SEQ_LEN);
+        get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
         }
     }
 
     return new llama_sampler {
         /* .iface = */ &llama_sampler_dry_i,
         /* .ctx   = */ new llama_sampler_dry {
-            /* .model = */ model,
             /* .total_context_size = */ context_size,
             /* .dry_multiplier = */ dry_multiplier,
             /* .dry_base = */ dry_base,
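For readers unfamiliar with the DRY sampler, the function refactored above, get_overlapping_token_sequences, scans every vocabulary token and records those whose text contains a sequence-breaker string, or whose text ends with a prefix of one, together with the tail of the breaker still left to match. The following self-contained sketch approximates that scan under simplifying assumptions: a plain string vector stands in for llama_vocab, and the tail is kept as a string instead of being re-tokenized, so it is illustrative only, not a drop-in replacement.

    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    int main() {
        // toy stand-in for the vocabulary: token id -> token text
        const std::vector<std::string> toy_vocab = { "hello", "foo\n", "\n\n", "bar" };
        const std::string breaker = "\n\n";

        // token id -> remaining tail of the breaker that still has to follow
        std::unordered_multimap<int, std::string> sequences;

        for (int id = 0; id < (int) toy_vocab.size(); ++id) {
            const std::string & word = toy_vocab[id];
            if (word.find(breaker) != std::string::npos) {
                sequences.emplace(id, "");   // token text already contains the whole breaker
                continue;
            }
            // does some suffix of the token text match a prefix of the breaker?
            for (size_t i = 1; i < breaker.size(); ++i) {
                if (word.size() >= i && word.compare(word.size() - i, i, breaker, 0, i) == 0) {
                    sequences.emplace(id, breaker.substr(i));   // record the unmatched tail
                    break;
                }
            }
        }

        std::printf("%zu overlapping tokens found\n", sequences.size());
        return 0;
    }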

src/llama-sampling.h

Lines changed: 17 additions & 8 deletions
@@ -3,7 +3,6 @@
 // TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
 
 #include "llama-grammar.h"
-#include "llama-impl.h"
 
 struct llama_vocab;
 struct llama_grammar;
@@ -30,11 +29,21 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
 struct llama_sampler * llama_sampler_init_infill_impl(
         const struct llama_vocab & vocab);
 
+struct llama_sampler * llama_sampler_init_dry_impl(
+        const struct llama_vocab & vocab,
+        int32_t context_size,
+        float dry_multiplier,
+        float dry_base,
+        int32_t dry_allowed_length,
+        int32_t dry_penalty_last_n,
+        const char ** seq_breakers,
+        size_t num_breakers);
+
 struct llama_sampler * llama_sampler_init_dry(
-    const struct llama_model * model,
-    int32_t context_size,
-    float dry_multiplier,
-    float dry_base,
-    int32_t dry_allowed_length,
-    int32_t dry_penalty_last_n,
-    const std::vector<std::vector<llama_token>>& seq_breakers);
+        const struct llama_model * model,
+        int32_t context_size,
+        float dry_multiplier,
+        float dry_base,
+        int32_t dry_allowed_length,
+        int32_t dry_penalty_last_n,
+        const std::vector<std::vector<llama_token>>& seq_breakers);

src/llama-vocab.cpp

Lines changed: 16 additions & 0 deletions
@@ -1966,3 +1966,19 @@ int32_t llama_detokenize_impl(
 
     return total <= text_len_max ? total : -total;
 }
+
+std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
+    }
+
+    text.resize(n_chars);
+
+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+    return text;
+}
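The helper added here relies on the return convention of llama_detokenize_impl visible in the diff: a negative return value means the output buffer was too small and its absolute value is the required size, so the wrapper resizes the string and retries once. A hedged usage sketch follows; the vocab reference and token ids are placeholders, not values from this commit.

    // Assuming `vocab` is a const llama_vocab & obtained from a loaded model
    // (e.g. model->vocab inside llama.cpp internals):
    std::vector<llama_token> tokens = { /* token ids produced by tokenization */ };
    std::string text = llama_detokenize(vocab, tokens, /* special = */ true);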

src/llama-vocab.h

Lines changed: 5 additions & 0 deletions
@@ -163,3 +163,8 @@ int32_t llama_detokenize_impl(
         int32_t text_len_max,
         bool remove_special,
         bool unparse_special);
+
+std::string llama_detokenize(
+        const struct llama_vocab & vocab,
+        const std::vector<llama_token> & tokens,
+        bool special);

src/llama.cpp

Lines changed: 4 additions & 9 deletions
@@ -21452,15 +21452,6 @@ int32_t llama_tokenize(
     return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
 }
 
-// wrapper function that takes "model" instead of "vocab", to be used internally
-std::vector<llama_token> llama_tokenize_internal(
-        const struct llama_model * model,
-        const std::string & raw_text,
-        bool add_special,
-        bool parse_special) {
-    return llama_tokenize_internal(model->vocab, raw_text, add_special, parse_special);
-}
-
 int32_t llama_token_to_piece(
         const struct llama_model * model,
         llama_token token,
@@ -21805,6 +21796,10 @@ struct llama_sampler * llama_sampler_init_infill(const struct llama_model * mode
     return llama_sampler_init_infill_impl(model->vocab);
 }
 
+struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+    return llama_sampler_init_dry_impl(model->vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers);
+}
+
 //
 // model split
 //

tests/test-sampling.cpp

Lines changed: 1 addition & 3 deletions
@@ -208,10 +208,8 @@ static void test_dry(
     }
 
     llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    const int32_t context_size = 1024;
-    struct llama_model * model = nullptr;
 
-    struct llama_sampler * sampler = llama_sampler_init_dry(model, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers);
+    struct llama_sampler * sampler = llama_sampler_init_dry(nullptr, 1024, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers);
 
     for (size_t i = 0; i < last_tokens.size(); i++) {
         llama_sampler_accept(sampler, last_tokens[i]);
