XTC: Added xtc_probability_once parameter

MaggotHATE · MaggotHATE · commit 1f2add86ba27 · 2024-08-22T14:18:12.000+05:00
* Allows choosing between a single quick random choice made once (as in the original) and per-token random choices.
diff --git a/base/common.cpp b/base/common.cpp
@@ -180,6 +180,16 @@ int get_math_cpu_count() {
     return get_num_physical_cores();
 }
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
 
 void process_escapes(std::string& input) {
     std::size_t input_len = input.length();
diff --git a/base/common.h b/base/common.h
@@ -156,6 +156,8 @@ std::string get_system_info(const gpt_params & params);
 
 std::string gpt_random_prompt(std::mt19937 & rng);
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 void process_escapes(std::string& input);
 
 //
diff --git a/base/llama-addon.cpp b/base/llama-addon.cpp
@@ -36,33 +36,44 @@
 #include <type_traits>
 #include <unordered_map>
 
-void llama_sample_xtc_addon(struct llama_context * ctx, llama_token_data_array * candidates, float xtc_probability, float xtc_threshold, size_t min_keep) {
+void llama_sample_xtc_addon(struct llama_context * ctx, llama_token_data_array * candidates, float xtc_probability, float xtc_threshold, float xtc_probability_once, size_t min_keep) {
     if (xtc_probability <= 0.0f || xtc_threshold <= 0.0f || candidates->size <= 1) {
         return;
     }
 
+    std::random_device rd;
+    float chance = (float)(rd()%100)/100;
+    //printf("\nChance = %f; ", chance);
+    if (xtc_probability_once && chance > xtc_probability) return;
+
     llama_sample_softmax(nullptr, candidates);
 
     const int64_t t_start_sample_us = ggml_time_us();
     size_t removed = 0;
     for (size_t i = 0; i < (candidates->size - 1); ++i) {
         if (candidates->data[i].p >= xtc_threshold) {
-                std::random_device rd;
-                float chance = (float)(rd()%100)/100;
-
-                if (chance <= xtc_probability) {
+                if (xtc_probability_once || chance <= xtc_probability) {
                     candidates->data[i].logit = -999.0f; // .p will be recalculated in llama_sample_softmax_impl later based on .logit, so we need to change these
                     ++removed;
+                    if (!xtc_probability_once) {
+                        chance = (float)(rd()%100)/100;
+                        printf(" chance = %f; ", chance);
+                    }
                 }
         }
     }
+
+    //printf("\nPresort (size %zu): %f, %f, %f", candidates->size, candidates->data[0].p, candidates->data[1].p, candidates->data[2].p);
+
     // sorting with new logits
     std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit > b.logit;
     });
     //resizing now that penalized tokens are at the back
     candidates->size = candidates->size - removed;
 
+    //printf("\nSort    (size %zu): %f, %f, %f\n", candidates->size, candidates->data[0].p, candidates->data[1].p, candidates->data[2].p);
+
     llama_set_time(ctx, t_start_sample_us);
 }
 
diff --git a/base/llama-addon.h b/base/llama-addon.h
@@ -7,6 +7,7 @@
           llama_token_data_array * candidates,
                            float   xtc_probability,
                            float   xtc_threshold,
+                           float   xtc_probability_once,
                           size_t   min_keep);
 
     /// @details P-Step sampling as described in [THIS PR]
diff --git a/base/sampling.cpp b/base/sampling.cpp
@@ -134,19 +134,20 @@ void sampler_queue(
                  llama_token_data_array & cur_p,
                                  size_t & min_keep) {
 
-    const float       temp              = params.temp;
-    const float       smoothing_factor  = params.smoothing_factor;
-    const float       smoothing_curve   = params.smoothing_curve;
-    const float       dynatemp_range    = params.dynatemp_range;
-    const int32_t     top_k             = params.top_k;
-    const float       top_p             = params.top_p;
-    const float       min_p             = params.min_p;
-    const float       tfs_z             = params.tfs_z;
-    const float       typical_p         = params.typical_p;
-    const float       p_step            = params.p_step;
-    const float       xtc_probability   = params.xtc_probability;
-    const float       xtc_threshold     = params.xtc_threshold;
-    const std::string samplers_sequence = params.samplers_sequence;
+    const float       temp                 = params.temp;
+    const float       smoothing_factor     = params.smoothing_factor;
+    const float       smoothing_curve      = params.smoothing_curve;
+    const float       dynatemp_range       = params.dynatemp_range;
+    const int32_t     top_k                = params.top_k;
+    const float       top_p                = params.top_p;
+    const float       min_p                = params.min_p;
+    const float       tfs_z                = params.tfs_z;
+    const float       typical_p            = params.typical_p;
+    const float       p_step               = params.p_step;
+    const float       xtc_probability      = params.xtc_probability;
+    const float       xtc_threshold        = params.xtc_threshold;
+    const float       xtc_probability_once = params.xtc_probability_once;
+    const std::string samplers_sequence    = params.samplers_sequence;
                       
     for (auto s : samplers_sequence){
         switch (s){
@@ -156,7 +157,7 @@ void sampler_queue(
             case 'p': llama_sample_top_p        (ctx_main, &cur_p, top_p,     min_keep); break;
             case 'm': llama_sample_min_p_addon  (ctx_main, &cur_p, min_p,     min_keep); break;
             case 's': llama_sample_p_step_addon (ctx_main, &cur_p, p_step,    min_keep); break;
-            case 'x': llama_sample_xtc_addon    (ctx_main, &cur_p, xtc_probability, xtc_threshold, min_keep); break;
+            case 'x': llama_sample_xtc_addon    (ctx_main, &cur_p, xtc_probability, xtc_threshold, xtc_probability_once, min_keep); break;
             case 't': {
                 if (dynatemp_range>0)
                 {
diff --git a/base/sampling.h b/base/sampling.h
@@ -59,6 +59,7 @@ typedef struct llama_sampling_params {
     int32_t     dry_penalty_last_n    = -1;                 // DRY last n tokens to penalize (0 = disable penalty, -1 = context size)
     float       xtc_probability       = 0.5; // probability of removing a top token
     float       xtc_threshold         = 0.1; // minimum tokens probablitity for this to run
+    bool        xtc_probability_once  = false; // should we calculate chances once or for each token
     std::string samplers_sequence     = "kfypmts"; // top_k, tail_free, typical_p, top_p, min_p, temp, p_step
 
     std::string grammar;  // optional BNF-like grammar to constrain sampling
diff --git a/include/jsonParams.h b/include/jsonParams.h
@@ -111,6 +111,14 @@ static bool checkJNum(nlohmann::json& config, std::string name){
     return false;
 }
 
+static bool checkJBool(nlohmann::json& config, std::string name){
+    if(config.contains(name)){
+        if(config[name].is_boolean()) return true;
+    }
+    
+    return false;
+}
+
 static bool checkJObj(nlohmann::json& config, std::string name){
     if(config.contains(name)){
         if(config[name].is_object()) {
@@ -511,6 +519,7 @@ static void getParamsFromJson(nlohmann::json& config, gpt_params& params, bool h
     if (checkJNum(config, "tfs_z")) params.sparams.tfs_z = config["tfs_z"];
     if (checkJNum(config, "xtc_probability")) params.sparams.xtc_probability = config["xtc_probability"];
     if (checkJNum(config, "xtc_threshold")) params.sparams.xtc_threshold = config["xtc_threshold"];
+    if (checkJBool(config, "xtc_probability_once")) params.sparams.xtc_probability_once = config["xtc_probability_once"];
 
 //penalties
     if (checkJNum(config, "repeat_penalty")) params.sparams.penalty_repeat = config["repeat_penalty"];
diff --git a/thread_chat.h b/thread_chat.h
@@ -1313,6 +1313,7 @@ struct configurableChat{
         } else if (params.sparams.p_step != paramsDefault.sparams.p_step) modelConfig[model]["p_step"] = params.sparams.p_step;
         if (params.sparams.xtc_probability != paramsDefault.sparams.xtc_probability) modelConfig[model]["xtc_probability"] = params.sparams.xtc_probability;
         if (params.sparams.xtc_threshold != paramsDefault.sparams.xtc_threshold) modelConfig[model]["xtc_threshold"] = params.sparams.xtc_threshold;
+        if (params.sparams.xtc_probability_once != paramsDefault.sparams.xtc_probability_once) modelConfig[model]["xtc_probability_once"] = params.sparams.xtc_probability_once;
         // penalties
         if (params.sparams.penalty_repeat != paramsDefault.sparams.penalty_repeat) modelConfig[model]["repeat_penalty"] = params.sparams.penalty_repeat;
         if (params.sparams.penalty_threshold != paramsDefault.sparams.penalty_threshold) modelConfig[model]["penalty_threshold"] = params.sparams.penalty_threshold;
@@ -1422,6 +1423,7 @@ struct configurableChat{
         if (params.sparams.p_step != paramsDefault.sparams.p_step) newCard["p_step"] = params.sparams.p_step;
         if (params.sparams.xtc_probability != paramsDefault.sparams.xtc_probability) newCard["xtc_probability"] = params.sparams.xtc_probability;
         if (params.sparams.xtc_threshold != paramsDefault.sparams.xtc_threshold) newCard["xtc_threshold"] = params.sparams.xtc_threshold;
+        if (params.sparams.xtc_probability_once != paramsDefault.sparams.xtc_probability_once) newCard["xtc_probability_once"] = params.sparams.xtc_probability_once;
         //penalties
         if (params.sparams.penalty_threshold != paramsDefault.sparams.penalty_threshold) newCard["penalty_threshold"] = params.sparams.penalty_threshold;
         if (params.sparams.penalty_repeat != paramsDefault.sparams.penalty_repeat) newCard["repeat_penalty"] = params.sparams.penalty_repeat;