llama : simplify infill sampler

ggerganov · ggerganov · commit 4d846525a676 · 2024-10-13T18:53:53.000+03:00
diff --git a/examples/llama.vim b/examples/llama.vim
@@ -70,11 +70,11 @@ let s:default_config = {
     \ 'n_suffix':         128,
     \ 'n_predict':        64,
     \ 't_max_prompt_ms':  500,
-    \ 't_max_predict_ms': 200,
+    \ 't_max_predict_ms': 500,
     \ 'show_info':        2,
     \ 'auto_fim':         v:true,
     \ 'max_line_suffix':  8,
-    \ 'ring_n_chunks':    32,
+    \ 'ring_n_chunks':    16,
     \ 'ring_chunk_size':  128,
     \ 'ring_scope':       1024,
     \ }
diff --git a/include/llama.h b/include/llama.h
@@ -1154,9 +1154,8 @@ extern "C" {
     // this sampler is meant to be used for fill-in-the-middle infilling
     // it's supposed to be used after top_k sampling
     //
-    // 1. if there is a high-prob token (>= 0.9f) -> skip step 2
-    // 2. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
-    // 3. combine probs of tokens that have the same prefix
+    // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+    // 2. combine probs of tokens that have the same prefix
     //
     // example:
     //
@@ -1170,8 +1169,8 @@ extern "C" {
     //   "hel":   0.8
     //   "dummy": 0.1
     //
-    // 4. discard non-EOG tokens with low prob (< 0.2)
-    // 5. if no tokens are left -> pick EOT
+    // 3. discard non-EOG tokens with low prob (< 0.2)
+    // 4. if no tokens are left -> pick EOT
     //
     LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
 
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
@@ -1667,13 +1667,10 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     }
 #endif
 
-    float p_max     = 0.0f;
     float p_txt_sum = 0.0f;
     float p_eog_sum = 0.0f;
 
     for (size_t i = 0; i < cur_p->size; ++i) {
-        p_max = fmaxf(p_max, cur_p->data[i].p);
-
         if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
             p_eog_sum += cur_p->data[i].p;
         } else {
@@ -1683,22 +1680,31 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
 
     const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum;
 
-    LLAMA_LOG_DEBUG("infill: p_max = %.2f, p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_max, p_txt_sum, p_eog_sum, rat, cur_p->size);
+    LLAMA_LOG_DEBUG("infill: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_txt_sum, p_eog_sum, rat, cur_p->size);
 
-    if (p_max < 0.90f && p_eog_sum*cur_p->size > p_txt_sum) {
+    if (p_eog_sum*cur_p->size > p_txt_sum) {
         LLAMA_LOG_DEBUG("infill: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", p_txt_sum/p_eog_sum);
 
         // keep just the EOG tokens
         const auto size_org = cur_p->size;
 
         cur_p->size = 0;
 
+        float p_sum = 0.0f;
+
         for (size_t i = 0; i < size_org; ++i) {
             if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+                p_sum += cur_p->data[i].p;
+
                 cur_p->data[cur_p->size++] = cur_p->data[i];
             }
         }
 
+        // normalize probs so that the remaining EOG candidates sum to 1
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            cur_p->data[i].p /= p_sum;
+        }
+
         return;
     }