Skip to content

Commit 2331c79

Browse files
committed
Implementation of DRY Sampling (post-sampling-refactor)
1 parent c919d5d commit 2331c79

File tree

9 files changed

+738
-74
lines changed

9 files changed

+738
-74
lines changed

common/arg.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,34 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
10091009
params.sparams.penalty_freq = std::stof(value);
10101010
}
10111011
).set_sparam());
1012+
add_opt(llama_arg(
1013+
{"--dry-multiplier"}, "N",
1014+
format("Set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
1015+
[](gpt_params & params, const std::string & value) {
1016+
params.sparams.dry_multiplier = std::stof(value);
1017+
}
1018+
).set_sparam());
1019+
add_opt(llama_arg(
1020+
{"--dry-base"}, "N",
1021+
format("Set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
1022+
[](gpt_params & params, const std::string & value) {
1023+
params.sparams.dry_base = std::stof(value);
1024+
}
1025+
).set_sparam());
1026+
add_opt(llama_arg(
1027+
{"--dry-allowed-length"}, "N",
1028+
format("Set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
1029+
[](gpt_params & params, int value) {
1030+
params.sparams.dry_allowed_length = value;
1031+
}
1032+
).set_sparam());
1033+
add_opt(llama_arg(
1034+
{"--dry-penalty-last-n"}, "N",
1035+
format("Set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
1036+
[](gpt_params & params, int value) {
1037+
params.sparams.dry_penalty_last_n = value;
1038+
}
1039+
).set_sparam());
10121040
add_opt(llama_arg(
10131041
{"--dynatemp-range"}, "N",
10141042
format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),

common/common.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1975,6 +1975,10 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
19751975
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
19761976
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
19771977
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
1978+
fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
1979+
fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
1980+
fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
1981+
	fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
19781982
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
19791983
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
19801984
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);

common/common.h

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -102,27 +102,33 @@ enum dimre_method {
102102
struct gpt_sampler_params {
103103
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
104104

105-
int32_t n_prev = 64; // number of previous tokens to remember
106-
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
107-
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
108-
int32_t top_k = 40; // <= 0 to use vocab size
109-
float top_p = 0.95f; // 1.0 = disabled
110-
float min_p = 0.05f; // 0.0 = disabled
111-
float tfs_z = 1.00f; // 1.0 = disabled
112-
float typ_p = 1.00f; // typical_p, 1.0 = disabled
113-
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
114-
float dynatemp_range = 0.00f; // 0.0 = disabled
115-
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
116-
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
117-
float penalty_repeat = 1.00f; // 1.0 = disabled
118-
float penalty_freq = 0.00f; // 0.0 = disabled
119-
float penalty_present = 0.00f; // 0.0 = disabled
120-
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
121-
float mirostat_tau = 5.00f; // target entropy
122-
float mirostat_eta = 0.10f; // learning rate
123-
bool penalize_nl = false; // consider newlines as a repeatable token
124-
bool ignore_eos = false;
125-
bool no_perf = false; // disable performance metrics
105+
int32_t n_prev = 64; // number of previous tokens to remember
106+
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
107+
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
108+
int32_t top_k = 40; // <= 0 to use vocab size
109+
float top_p = 0.95f; // 1.0 = disabled
110+
float min_p = 0.05f; // 0.0 = disabled
111+
float tfs_z = 1.00f; // 1.0 = disabled
112+
float typ_p = 1.00f; // typical_p, 1.0 = disabled
113+
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
114+
float dynatemp_range = 0.00f; // 0.0 = disabled
115+
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
116+
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
117+
float penalty_repeat = 1.00f; // 1.0 = disabled
118+
float penalty_freq = 0.00f; // 0.0 = disabled
119+
float penalty_present = 0.00f; // 0.0 = disabled
120+
float dry_multiplier = 0.0f; // 0.0f = disabled, recommended value: 0.8f
121+
float dry_base = 1.75f;
122+
int32_t dry_allowed_length = 2;
123+
int32_t dry_penalty_last_n = -1; // DRY last n tokens to penalize (0 = disable penalty, -1 = context size)
124+
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
125+
float mirostat_tau = 5.00f; // target entropy
126+
float mirostat_eta = 0.10f; // learning rate
127+
bool penalize_nl = false; // consider newlines as a repeatable token
128+
bool ignore_eos = false;
129+
bool no_perf = false; // disable performance metrics
130+
131+
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
126132

127133
std::vector<enum gpt_sampler_type> samplers = {
128134
GPT_SAMPLER_TYPE_TOP_K,

common/sampling.cpp

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <cmath>
66
#include <unordered_map>
7+
#include <cstring>
78

89
// the ring buffer works similarly to std::deque, but with a fixed capacity
910
// TODO: deduplicate with llama-impl.h
@@ -110,6 +111,9 @@ struct gpt_sampler {
110111

111112
llama_token_data_array cur_p;
112113

114+
int32_t n_ctx;
115+
bool context_size_set;
116+
113117
void set_logits(struct llama_context * ctx, int idx) {
114118
const auto * logits = llama_get_logits_ith(ctx, idx);
115119

@@ -126,15 +130,17 @@ struct gpt_sampler {
126130
};
127131

128132
std::string gpt_sampler_params::print() const {
129-
char result[1024];
133+
char result[1536];
130134

131135
snprintf(result, sizeof(result),
132-
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
133-
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
134-
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
135-
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
136-
top_k, tfs_z, top_p, min_p, typ_p, temp,
137-
mirostat, mirostat_eta, mirostat_tau);
136+
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
137+
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
138+
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
139+
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
140+
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
141+
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
142+
top_k, tfs_z, top_p, min_p, typ_p, temp,
143+
mirostat, mirostat_eta, mirostat_tau);
138144

139145
return std::string(result);
140146
}
@@ -151,6 +157,8 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
151157
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
152158
/* .cur = */ {},
153159
/* .cur_p = */ {},
160+
/* .n_ctx = */ 0,
161+
/* .context_size_set = */ false,
154162
};
155163

156164
llama_sampler_chain_add(result->chain,
@@ -171,6 +179,13 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
171179
params.penalize_nl,
172180
params.ignore_eos));
173181

182+
if (params.dry_multiplier != 0.0f && params.dry_base != 0.0f) {
183+
auto * dry_sampler = llama_sampler_init_dry(model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n);
184+
185+
llama_sampler_dry_set_seq_breakers(dry_sampler, params.dry_sequence_breakers);
186+
llama_sampler_chain_add(result->chain, dry_sampler);
187+
}
188+
174189
if (params.temp > 0.0f) {
175190
if (params.mirostat == 0) {
176191
for (const auto & cnstr : params.samplers) {
@@ -273,6 +288,21 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
273288
}
274289

275290
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
291+
// Check and set the context size if it hasn't been set yet
292+
if (!gsmpl->context_size_set) {
293+
gsmpl->n_ctx = llama_n_ctx(ctx);
294+
gsmpl->context_size_set = true;
295+
296+
// Update the DRY sampler's context size if it is active
297+
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
298+
auto * sampler = llama_sampler_chain_get(gsmpl->chain, i);
299+
if (strcmp(llama_sampler_name(sampler), "dry") == 0) {
300+
llama_sampler_dry_set_context_size(sampler, gsmpl->n_ctx);
301+
break;
302+
}
303+
}
304+
}
305+
276306
gsmpl->set_logits(ctx, idx);
277307

278308
auto & grmr = gsmpl->grmr;

examples/main/README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,27 @@ Use the `--no-penalize-nl` option to disable newline penalization when applying
187187

188188
Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
189189

190+
### DRY Repetition Penalty
191+
192+
DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text, even across long contexts, by penalizing tokens that would extend sequences already seen in the recent context.
193+
194+
- `--dry-multiplier N`: Set the DRY sampling multiplier (default: 0.0, 0.0 = disabled).
195+
- `--dry-base N`: Set the DRY sampling base value (default: 1.75).
196+
- `--dry-allowed-length N`: Set the allowed length for DRY sampling (default: 2).
197+
- `--dry-penalty-last-n N`: Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size).
198+
199+
The `dry-multiplier` option controls the strength of the DRY sampling effect. A value of 0.0 disables DRY sampling, while higher values increase its influence. A typical recommended value is 0.8.
200+
201+
The `dry-base` option sets the base value for the exponential penalty calculation in DRY sampling. Higher values lead to more aggressive penalization of repetitions.
202+
203+
The `dry-allowed-length` option determines the minimum length of repeated sequences that will be penalized. Repetitions shorter than or equal to this length are not penalized, allowing for natural repetitions of short phrases or common words.
204+
205+
The `dry-penalty-last-n` option controls how many recent tokens to consider when applying the DRY penalty. A value of -1 considers the entire context, while 0 disables the DRY penalty. Use a positive value to limit the consideration to a specific number of recent tokens.
206+
207+
DRY sampling works alongside traditional repetition penalties to provide more nuanced control over text generation, particularly for reducing long-range repetitions and maintaining global coherence.
208+
209+
Example usage: `--dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2 --dry-penalty-last-n -1`
210+
190211
### Top-K Sampling
191212

192213
- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).

examples/server/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,10 @@ The project is under active development, and we are [looking for feedback and co
114114
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
115115
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
116116
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
117+
| `--dry-multiplier N` | DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
118+
| `--dry-base N` | DRY sampling base value (default: 1.75) |
119+
| `--dry-allowed-length N` | allowed length for DRY sampling (default: 2) |
120+
| `--dry-penalty-last-n N` | DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
117121
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
118122
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
119123
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
@@ -354,6 +358,16 @@ node index.js
354358

355359
`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
356360

361+
`dry_multiplier`: Set the DRY (Don't Repeat Yourself) sampling multiplier. Default: `0.0`, which is disabled.
362+
363+
`dry_base`: Set the DRY sampling base value. Default: `1.75`
364+
365+
`dry_allowed_length`: Set the allowed length for DRY sampling. Default: `2`
366+
367+
`dry_penalty_last_n`: Set DRY penalty for the last n tokens. Default: `-1`, where `0` is disabled and `-1` is context size.
368+
369+
`dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Can be provided as a JSON array of strings or as a JSON-encoded string representing an array of strings. Default: `["\n", ":", "\"", "*"]`
370+
357371
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
358372

359373
`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`

0 commit comments

Comments
 (0)