@@ -2163,16 +2163,19 @@ struct server_context {
21632163 GGML_ASSERT (slot.n_prompt_tokens < slot.n_ctx );
21642164 }
21652165
2166+ // TODO(review): decide whether this reset should be moved or removed, now that
2166+ // sampler acceptance is deferred until after the full prompt is processed.
21662167 common_sampler_reset (slot.smpl );
21672168
21682169 if (slot.params .cache_prompt ) {
21692170 // reuse any previously computed tokens that are common with the new prompt
21702171 slot.n_past = longest_common_prefix (slot.cache_tokens , prompt_tokens);
2172+ // The sampler-accept loop below used to run here (and again during cache reuse).
2173+ // It was moved to run once, after the entire prompt is processed, so that sampling is applied consistently.
21712174
21722175 // push the prompt into the sampling context (do not apply grammar)
2173- for (int i = 0 ; i < slot.n_past ; ++i) {
2174- common_sampler_accept (slot.smpl , slot.cache_tokens [i], false );
2175- }
2176+ // for (int i = 0; i < slot.n_past; ++i) {
2177+ // common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
2178+ // }
21762179
21772180 // reuse chunks from the cached prompt by shifting their KV cache in the new position
21782181 if (params.n_cache_reuse > 0 ) {
@@ -2206,7 +2209,7 @@ struct server_context {
22062209 for (size_t i = 0 ; i < n_match; i++) {
22072210 slot.cache_tokens [head_p + i] = slot.cache_tokens [head_c + i];
22082211
2209- common_sampler_accept (slot.smpl , slot.cache_tokens [head_p + i], false );
2212+ // common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
22102213
22112214 slot.n_past ++;
22122215 }
@@ -2288,6 +2291,11 @@ struct server_context {
22882291
22892292 GGML_ASSERT (batch.n_tokens > 0 );
22902293
2294+ // Process all prompt tokens through sampler system
2295+ for (int i = 0 ; i < slot.n_prompt_tokens ; ++i) {
2296+ common_sampler_accept (slot.smpl , prompt_tokens[i], false );
2297+ }
2298+
22912299 // extract the logits only for the last token
22922300 batch.logits [batch.n_tokens - 1 ] = true ;
22932301
0 commit comments