@@ -1678,6 +1678,7 @@ struct llama_sampler_dry {
     ring_buffer<llama_token> last_tokens;
 };

+// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
 static void GetOverlappingTokenSequences(const struct llama_model * model, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
     const int n_vocab = llama_n_vocab(model);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
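The multimap populated here maps a "head" token to the "tail" tokens that complete a restart sequence (the Step 1 comment further down describes this layout). As a minimal standalone sketch of that shape and of how it is queried with `equal_range`, with plain `int` token ids invented for illustration (real ids come from the model's tokenizer):

```cpp
#include <cstdio>
#include <unordered_map>
#include <vector>

int main() {
    // Same shape as the sampler's restart-sequence collection, with plain ints.
    std::unordered_multimap<int, std::vector<int>> token_sequences;
    token_sequences.emplace(10, std::vector<int>{});       // single-token restart sequence: empty tail
    token_sequences.emplace(10, std::vector<int>{42, 7});  // longer sequence sharing the same head
    // Enumerate every restart sequence beginning with head token 10:
    auto its = token_sequences.equal_range(10);
    for (auto it = its.first; it != its.second; ++it) {
        printf("head=10 tail_len=%zu\n", it->second.size());
    }
    return 0;
}
```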
@@ -1735,6 +1736,7 @@ static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token to
     ctx->last_tokens.push_back(token);
 }

+// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
 static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_dry *) smpl->ctx;

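The code below repeatedly indexes `last_tokens` with `rat(i)`, a reverse accessor on the ring buffer: `rat(0)` is the most recently accepted token, `rat(1)` the one before it, and so on. A vector-based stand-in, purely for illustration:

```cpp
#include <cassert>
#include <vector>

// Illustrative stand-in for ring_buffer::rat(): index i counts backwards
// from the newest element.
template <typename T>
const T & rat(const std::vector<T> & buf, size_t i) {
    assert(i < buf.size());
    return buf[buf.size() - 1 - i];
}

int main() {
    const std::vector<int> last_tokens = {11, 22, 33};
    assert(rat(last_tokens, 0) == 33); // newest token
    assert(rat(last_tokens, 2) == 11); // two tokens back
    return 0;
}
```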
@@ -1752,7 +1754,28 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
     ctx->dry_repeat_count.assign(last_n_repeat, 0);
     ctx->dry_max_token_repeat.clear();

-    // Step 1: Look for restart sequences
+    // Step 1: Look for restart sequences to limit the maximum repetition length.
+    // Work backwards through the context looking for any token that begins a restart sequence.
+    //
+    // The collection `restart_sequences` is a mapping from a "head" token to all "tail"
+    // sequences that together comprise a restart sequence. This allows us to quickly check
+    // whether each token is the head of a complete sequence. Most restart sequences are actually
+    // a single token, and for these the "tail" is an empty vector.
+    //
+    // If the token is a "head", test all restart sequences that begin with this token
+    // (there will often only be one sequence for each token, but if sequences like 'aaaq1' and
+    // 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The
+    // longest matching sequence (if any) is used to limit the maximum repetition length.
+    //
+    // Note that in the case of a short sequence contained in a longer one, this might fail to
+    // find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as
+    // restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress
+    // 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare.
+    //
+    // This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we
+    // have already clamped the maximum tail sequence length when generating `restart_sequences`.
+    // With clamping, this scan is O(N) in the context length.
+
     int rep_limit = last_n_repeat;
     for (int i = 0; i < last_n_repeat; ++i) {
         llama_token token = ctx->last_tokens.rat(i);
@@ -1762,10 +1785,15 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
         }
         int longest_match = -1;
         for (auto it = its.first; it != its.second; ++it) {
+            // Note that (*it) does not contain the head character, so seq_len will be
+            // the restart sequence length minus 1.
+            // In the common case of a single-token restart sequence, (*it) will be empty
+            // and we will trivially match.
             int seq_len = (int)it->second.size();
             if (seq_len > longest_match && seq_len <= (int)i) {
                 bool match = true;
                 for (int offset = 0; offset < seq_len; ++offset) {
+                    // The -1 when indexing `last_tokens` is because we already matched the head.
                     if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) {
                         match = false;
                         break;
@@ -1777,6 +1805,8 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
             }
         }
         if (longest_match >= 0) {
+            // We found a restart sequence starting `i` tokens from the end and continuing for
+            // `longest_match` tokens.
             rep_limit = i - longest_match;
             break;
         }
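Read on its own, Step 1 amounts to the following self-contained sketch. A plain vector (newest token last) stands in for the ring buffer, and the function name `compute_rep_limit` is invented for illustration:

```cpp
#include <cstdio>
#include <unordered_map>
#include <vector>

// Sketch of the Step 1 scan. rat(i) mimics ring_buffer::rat: i tokens back
// from the end of the context.
static int compute_rep_limit(const std::vector<int> & tokens,
                             const std::unordered_multimap<int, std::vector<int>> & restart_sequences) {
    const int n = (int) tokens.size();
    auto rat = [&](int i) { return tokens[n - 1 - i]; };
    for (int i = 0; i < n; ++i) {
        auto its = restart_sequences.equal_range(rat(i));
        int longest_match = -1;
        for (auto it = its.first; it != its.second; ++it) {
            // The tail must fit between position i and the end of the context.
            int seq_len = (int) it->second.size();
            if (seq_len > longest_match && seq_len <= i) {
                bool match = true;
                for (int offset = 0; offset < seq_len; ++offset) {
                    if (it->second[offset] != rat(i - offset - 1)) {
                        match = false;
                        break;
                    }
                }
                if (match) {
                    longest_match = seq_len;
                }
            }
        }
        if (longest_match >= 0) {
            // Only the tokens after the restart sequence can repeat.
            return i - longest_match;
        }
    }
    return n;
}

int main() {
    // Restart sequence {7, 8}: head 7, tail {8}.
    std::unordered_multimap<int, std::vector<int>> restarts;
    restarts.emplace(7, std::vector<int>{8});
    std::vector<int> tokens = {1, 2, 5, 6, 7, 8, 9};
    // Head 7 is found at i = 2 (two back from the end) and tail {8} matches,
    // so only the single token after the sequence (the 9) may repeat.
    printf("rep_limit = %d\n", compute_rep_limit(tokens, restarts)); // prints 1
    return 0;
}
```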
@@ -1785,13 +1815,35 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
         return;
     }

-    // Step 2: Z-algorithm implementation
+    // Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in
+    // the reverse direction) to efficiently compute the positions and lengths of suffixes appearing
+    // elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences.
+    //
+    // This algorithm is not currently documented on Wikipedia, but there is a clear description here:
+    //   https://ivanyu.me/blog/2014/10/15/z-algorithm/
+    //
+    // The code below is adapted from the public domain implementation by the same author here:
+    //   https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py
+    //
+    // Example:
+    //   Last N tokens: a b c c b c y a b c
+    //   Repeat counts: 0 0 3 1 0 2 0 0 0 0
+    //                      ^
+    //   This `3` means that the last three tokens of the context (a b c) also appear here.
+    //
+    // This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested
+    // for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each
+    // repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables
+    // ensure that the inner while loops only examine each token in the context once as the outer
+    // for loop iterates over the context.
+
     {
         const int last = last_n_repeat - 1;
         int rt = 0, lt = 0;

         for (int k = 1; k < last_n_repeat; ++k) {
             if (k > rt) {
+                // If k is outside the current Z-box, do naive computation.
                 int n = 0;
                 while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) {
                     ++n;
@@ -1802,7 +1854,9 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
                     rt = k+n-1;
                 }
             } else {
-                int p = k - lt;
+                // If k is inside the current Z-box, consider two cases.
+
+                int p = k - lt; // Pair index.
                 int right_part_len = rt - k + 1;

                 if (ctx->dry_repeat_count[last - p] < right_part_len) {
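The reversed orientation can be checked against the comment's example using the ordinary forward Z-algorithm: compute Z-values on the reversed sequence and flip the result. A standalone sketch follows, with chars standing in for token ids; this is the textbook variant, not the sampler's in-place version with `rep_limit` clamping:

```cpp
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// Standard forward Z-algorithm: z[k] = length of the longest substring of s
// starting at k that matches a prefix of s. O(n) thanks to the [lt, rt) Z-box.
static std::vector<int> z_function(const std::string & s) {
    const int n = (int) s.size();
    std::vector<int> z(n, 0);
    int lt = 0, rt = 0;
    for (int k = 1; k < n; ++k) {
        if (k < rt) {
            // Inside the Z-box: reuse the value of the paired earlier position.
            z[k] = std::min(rt - k, z[k - lt]);
        }
        while (k + z[k] < n && s[z[k]] == s[k + z[k]]) {
            ++z[k]; // extend the match naively
        }
        if (k + z[k] > rt) {
            lt = k;
            rt = k + z[k];
        }
    }
    return z;
}

int main() {
    // The comment's example: last N tokens "a b c c b c y a b c".
    // Z-values of the reversed sequence, flipped back into context order,
    // give the repeat counts from the comment: 0 0 3 1 0 2 0 0 0 0.
    std::string tokens = "abccbcyabc";
    std::string rev(tokens.rbegin(), tokens.rend());
    std::vector<int> z = z_function(rev);
    std::reverse(z.begin(), z.end());
    for (int v : z) printf("%d ", v);
    printf("\n");
    return 0;
}
```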
@@ -1823,19 +1877,37 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
         }
     }

-    // Step 3: Find maximum repeat length for each token
+    // Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length
+    // that would be generated by emitting each new token that would extend a sequence.
+    //
+    // Following the same example as above:
+    //   Last N tokens: a b c c b c y a b c
+    //   Repeat counts: 0 0 3 1 0 2 0 0 0 0
+    //
+    // For each non-zero, look ahead one token. This token, if emitted, would extend the repetition.
+    //   c: 3 -> 4 (from `a b c` to `a b c c`)
+    //   b: 1 -> 2 (from `c` to `c b`)
+    //   y: 2 -> 3 (from `b c` to `b c y`)
+
     for (int i = 0; i < last_n_repeat - 1; ++i) {
         int repeat_len = ctx->dry_repeat_count[i];
         if (repeat_len >= ctx->dry_allowed_length) {
+            // This token ends a repeat, so the next token would continue one.
+            // By convention, the value of `repeat_len` only includes the tokens currently
+            // in the context, not the new token that would be added.
             llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i);
+            // Track the maximum sequence ending in this token.
             const auto & it = ctx->dry_max_token_repeat.find(token);
             if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) {
                 ctx->dry_max_token_repeat[token] = repeat_len;
             }
         }
     }

-    // Step 4: Apply penalties
+    // Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens.
+
+    // Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`.
+    // Compute it from `penalty_base` and the approximate log of `std::numeric_limits<float>::max()`.
     const float FLOAT_MAX_LOG = 88.7228391f;
     int max_exponent = 0;
     if (ctx->dry_base > 1.000001f) {
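To see what the clamp protects, here is a hedged sketch of the penalty as described in the original DRY proposal, `penalty = multiplier * base^(repeat_len - allowed_length)`, with invented parameter values; without the clamp, the exponent in the last case would overflow a 32-bit float:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // Invented sampler parameters for illustration.
    const float dry_base = 1.75f;
    const float dry_multiplier = 0.8f;
    const int dry_allowed_length = 2;

    // Same guard as above: exponents past log(FLT_MAX) / log(base)
    // would overflow powf, so clamp them.
    const float FLOAT_MAX_LOG = 88.7228391f;
    int max_exponent = 0;
    if (dry_base > 1.000001f) {
        max_exponent = (int) (FLOAT_MAX_LOG / std::log(dry_base));
    }

    const int repeat_lens[] = {2, 10, 200};
    for (int repeat_len : repeat_lens) {
        int exponent = repeat_len - dry_allowed_length;
        if (max_exponent > 0 && exponent > max_exponent) {
            exponent = max_exponent; // without this, the 200-token case yields inf
        }
        const float penalty = dry_multiplier * std::pow(dry_base, (float) exponent);
        printf("repeat_len=%3d -> subtract %g from the token's logit\n", repeat_len, penalty);
    }
    return 0;
}
```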