
Commit 4a9ceca

llama : simplify infill sampler
1 parent 141c5ce commit 4a9ceca

9 files changed, +27 -50 lines changed

common/arg.cpp

Lines changed: 0 additions & 14 deletions
@@ -947,20 +947,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.sparams.tfs_z = std::stof(value);
         }
     ).set_sparam());
-    add_opt(llama_arg(
-        {"--infill-p"}, "N",
-        string_format("infill p threshold (default: %.1f)", (double)params.sparams.infill_p),
-        [](gpt_params & params, const std::string & value) {
-            params.sparams.infill_p = std::stof(value);
-        }
-    ).set_sparam());
-    add_opt(llama_arg(
-        {"--infill-p-eog"}, "N",
-        string_format("infill p_eog threshold (default: %.1f)", (double)params.sparams.infill_p_eog),
-        [](gpt_params & params, const std::string & value) {
-            params.sparams.infill_p_eog = std::stof(value);
-        }
-    ).set_sparam());
     add_opt(llama_arg(
         {"--typical"}, "N",
         string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),

common/common.h

Lines changed: 0 additions & 2 deletions
@@ -114,8 +114,6 @@ struct gpt_sampler_params {
     float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
     float   dynatemp_range    = 0.00f; // 0.0 = disabled
     float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    float   infill_p          = 0.80f;
-    float   infill_p_eog      = 0.01f;
     int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float   penalty_repeat    = 1.00f; // 1.0 = disabled
     float   penalty_freq      = 0.00f; // 0.0 = disabled

common/sampling.cpp

Lines changed: 1 addition & 1 deletion
@@ -194,7 +194,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
                 llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                 break;
             case GPT_SAMPLER_TYPE_INFILL:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model, params.infill_p, params.infill_p_eog));
+                llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
                 break;
             default:
                 GGML_ASSERT(false && "unknown sampler type");

examples/llama.vim

Lines changed: 3 additions & 5 deletions
@@ -93,9 +93,7 @@ function! llama#fim(is_auto) abort
        "\ 'stop': g:llama_config.stop,
        \ 'n_predict': g:llama_config.n_predict,
        \ 'penalty_last_n': 0,
-       \ 'top_k': 5,
-       \ 'infill_p': 0.20,
-       \ 'infill_p_eog': 0.001,
+       \ 'top_k': 100,
        \ 'stream': v:false,
        \ 'samplers': ["top_k", "infill"],
        "\ 'cache_prompt': v:true,
@@ -180,15 +178,15 @@ function! s:fim_auto()
        call jobstop(s:current_job)
    endif

-   if reltimefloat(reltime(s:t_fim_last)) < 0.001*250
+   if reltimefloat(reltime(s:t_fim_last)) < 500*0.001
        if s:timer_fim != -1
            call timer_stop(s:timer_fim)
            let s:timer_fim = -1
        endif
    endif

    let s:t_fim_last = reltime()
-   let s:timer_fim = timer_start(250, {-> llama#fim(v:true)})
+   let s:timer_fim = timer_start(500, {-> llama#fim(v:true)})
 endfunction

examples/server/server.cpp

Lines changed: 0 additions & 4 deletions
@@ -894,8 +894,6 @@ struct server_context {
             slot.sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
             slot.sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
             slot.sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
-            slot.sparams.infill_p          = json_value(data, "infill_p",          default_sparams.infill_p);
-            slot.sparams.infill_p_eog      = json_value(data, "infill_p_eog",      default_sparams.infill_p_eog);
             slot.sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
             slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
             slot.sparams.penalty_last_n    = json_value(data, "repeat_last_n",     default_sparams.penalty_last_n);
@@ -1261,8 +1259,6 @@ struct server_context {
            {"min_p",            slot.sparams.min_p},
            {"tfs_z",            slot.sparams.tfs_z},
            {"typical_p",        slot.sparams.typ_p},
-           {"infill_p",         slot.sparams.infill_p},
-           {"infill_p_eog",     slot.sparams.infill_p_eog},
            {"repeat_last_n",    slot.sparams.penalty_last_n},
            {"repeat_penalty",   slot.sparams.penalty_repeat},
            {"presence_penalty", slot.sparams.penalty_present},

include/llama.h

Lines changed: 8 additions & 6 deletions
@@ -1150,8 +1150,11 @@ extern "C" {
                              int32_t   n_logit_bias,
              const llama_logit_bias * logit_bias);

-    // 1. if there is a high-prob token (>= 0.9f) - pick it
-    // 2. if sum of EOG probs is larger than p_eog -> mask non-EOG tokens away
+    // this sampler is meant to be used for fill-in-the-middle infilling
+    // it's supposed to be used after top_k sampling and will leave a single candidate token
+    //
+    // 1. if there is a high-prob token (>= 0.9f) -> pick it
+    // 2. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
     // 3. combine probs of tokens that have the same prefix
     //
     // example:
@@ -1166,10 +1169,9 @@ extern "C" {
     //   "hel":   0.8
     //   "dummy": 0.1
     //
-    LLAMA_API struct llama_sampler * llama_sampler_init_infill(
-            const struct llama_model * model,
-                               float   p,
-                               float   p_eog);
+    // 4. pick the token with the highest probability
+    //
+    LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);

     // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
     LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
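
With the simplified signature, a caller only passes the model. A minimal sketch of the intended usage, top_k feeding the infill sampler and a final distribution sampler, assuming an already loaded llama_model * model and llama_context * ctx (the top_k value of 40 is an arbitrary choice for illustration):

// Build a fill-in-the-middle sampler chain with the simplified API.
struct llama_sampler_chain_params cparams = llama_sampler_chain_default_params();
struct llama_sampler * chain = llama_sampler_chain_init(cparams);

llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));                 // narrow the candidate set first
llama_sampler_chain_add(chain, llama_sampler_init_infill(model));             // no p / p_eog arguments anymore
llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));  // sample from what is left

const llama_token id = llama_sampler_sample(chain, ctx, -1); // sample for the last decoded token

llama_sampler_free(chain);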

src/llama-sampling.cpp

Lines changed: 12 additions & 13 deletions
@@ -1648,9 +1648,6 @@ struct llama_sampler * llama_sampler_init_logit_bias(

 struct llama_sampler_infill {
     const struct llama_vocab * vocab;
-
-    const float p;
-    const float p_eog;
 };

 static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
@@ -1668,17 +1665,23 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     }

     float p_max     = 0.0f;
+    float p_txt_sum = 0.0f;
     float p_eog_sum = 0.0f;

     for (size_t i = 0; i < cur_p->size; ++i) {
         p_max = fmaxf(p_max, cur_p->data[i].p);
         if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
             p_eog_sum += cur_p->data[i].p;
+        } else {
+            p_txt_sum += cur_p->data[i].p;
         }
     }

-    if (p_max < 0.90f && p_eog_sum > ctx->p_eog) {
-        LLAMA_LOG_DEBUG("infill: all EOG tokens are more likely than p_eog (%f), keeping only EOG tokens\n", ctx->p_eog);
+    const float rat = p_txt_sum / p_eog_sum;
+    LLAMA_LOG_DEBUG("infill: p_max = %.2f, p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_max, p_txt_sum, p_eog_sum, rat, cur_p->size);
+
+    if (p_max < 0.90f && p_eog_sum*cur_p->size > p_txt_sum) {
+        LLAMA_LOG_DEBUG("infill: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", p_txt_sum/p_eog_sum);

         // keep just the EOG tokens
         const auto size_org = cur_p->size;
@@ -1717,9 +1720,9 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         }
     }

-    // mask non-EOG tokens with prob < ctx->p
+    // mask non-EOG tokens with prob < 0.2
     for (size_t i = 0; i < cur_p->size; ++i) {
-        if (cur_p->data[i].p < ctx->p && !llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+        if (cur_p->data[i].p < 0.2 && !llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
             cur_p->data[i].logit = -INFINITY;
         }
     }
@@ -1753,7 +1756,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_

 static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
-    return llama_sampler_init_infill_impl(*ctx->vocab, ctx->p, ctx->p_eog);
+    return llama_sampler_init_infill_impl(*ctx->vocab);
 }

 static void llama_sampler_infill_free(struct llama_sampler * smpl) {
@@ -1770,15 +1773,11 @@ static struct llama_sampler_i llama_sampler_infill_i = {
 };

 struct llama_sampler * llama_sampler_init_infill_impl(
-        const struct llama_vocab & vocab,
-                           float   p,
-                           float   p_eog) {
+        const struct llama_vocab & vocab) {
     return new llama_sampler {
         /* .iface = */ &llama_sampler_infill_i,
         /* .ctx   = */ new llama_sampler_infill {
             /* .vocab = */ &vocab,
-            /* .p     = */ p,
-            /* .p_eog = */ p_eog,
         },
     };
 }
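
With the p / p_eog fields removed, the EOG decision above is now parameter-free: EOG candidates win when their combined probability, scaled by the number of candidates, outweighs the combined probability of the text tokens. A toy restatement of that rule follows; the helper name and its inputs are hypothetical, for illustration only.

// Hypothetical helper mirroring the new decision in llama_sampler_infill_apply:
// never pick EOG when a single token already dominates, otherwise pick EOG
// when p_eog_sum * n_candidates exceeds the total probability of the text tokens.
static bool infill_prefers_eog(float p_max, float p_txt_sum, float p_eog_sum, size_t n_candidates) {
    if (p_max >= 0.90f) {
        return false; // step 1: a high-prob token is always kept
    }
    return p_eog_sum*n_candidates > p_txt_sum; // replaces the old "p_eog_sum > p_eog" threshold
}

For example, with 10 candidates left after top_k and probabilities summing to 1, EOG is chosen once its combined probability exceeds roughly 1/11 ≈ 0.09, instead of the previous fixed p_eog default of 0.01.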

src/llama-sampling.h

Lines changed: 1 addition & 3 deletions
@@ -27,6 +27,4 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
            const char * grammar_root);

 struct llama_sampler * llama_sampler_init_infill_impl(
-        const struct llama_vocab & vocab,
-                           float   p,
-                           float   p_eog);
+        const struct llama_vocab & vocab);

src/llama.cpp

Lines changed: 2 additions & 2 deletions
@@ -21817,8 +21817,8 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
     return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
 }

-struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model, float p, float p_eog) {
-    return llama_sampler_init_infill_impl(model->vocab, p, p_eog);
+struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
+    return llama_sampler_init_infill_impl(model->vocab);
 }

 //
