@@ -1646,7 +1646,7 @@ struct llama_sampler * llama_sampler_init_logit_bias(
 
 // infill
 
-// #define GGML_DEBUG_SAMPLER_INFILL
+#define GGML_DEBUG_SAMPLER_INFILL
 
 struct llama_sampler_infill {
     const struct llama_vocab * vocab;
@@ -1662,10 +1662,14 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     llama_sampler_softmax_impl(cur_p);
 
 #if defined(GGML_DEBUG_SAMPLER_INFILL)
+#define LOG_DBG_CUR LLAMA_LOG_DEBUG
+#else
+#define LOG_DBG_CUR(...)
+#endif
+
     for (size_t i = 0; i < cur_p->size; ++i) {
-        LLAMA_LOG_DEBUG("infill: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
-#endif
 
     float p_txt_sum = 0.0f;
     float p_eog_sum = 0.0f;
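
A side note on the LOG_DBG_CUR macro introduced above: when GGML_DEBUG_SAMPLER_INFILL is not defined, the variadic form #define LOG_DBG_CUR(...) expands every call to nothing, so the per-candidate logging has zero cost in normal builds. A minimal standalone sketch of the same pattern, assuming an illustrative DEBUG_SAMPLER flag and LOG_DBG name (not part of llama.cpp):

#include <cstdio>

// compile with -DDEBUG_SAMPLER to enable the logging
#if defined(DEBUG_SAMPLER)
#define LOG_DBG(...) fprintf(stderr, __VA_ARGS__)
#else
#define LOG_DBG(...)   // calls compile away to nothing
#endif

int main() {
    // prints only in debug builds; in release builds the call disappears entirely
    LOG_DBG("%s: cur_p[%3zu] = { id: %6d, p: %.6f }\n", __func__, (size_t) 0, 42, 0.125);
    return 0;
}
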
@@ -1680,10 +1684,10 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
 
     const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum/p_eog_sum;
 
-    LLAMA_LOG_DEBUG("infill: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_txt_sum, p_eog_sum, rat, cur_p->size);
+    LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
 
-    if (p_eog_sum*cur_p->size > p_txt_sum) {
-        LLAMA_LOG_DEBUG("infill: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", p_txt_sum/p_eog_sum);
+    if (3*p_eog_sum*cur_p->size > p_txt_sum) {
+        LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
 
         // keep just the EOG tokens
         const auto size_org = cur_p->size;
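
To see what the changed condition does, take for example 16 candidates with p_eog_sum = 0.03 and p_txt_sum = 0.97: the old test 0.03*16 = 0.48 > 0.97 is false and text tokens are kept, while the new test 3*0.03*16 = 1.44 > 0.97 is true and the sampler falls back to EOG. In other words, the ratio p_txt/p_eog now has to exceed 3*n (instead of n) to keep sampling text, so the EOG fallback triggers three times more readily.
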
@@ -1708,6 +1712,8 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
         return;
     }
 
+    size_t n_combined = 0;
+
     // combine tokens with common prefix
     for (size_t i = 0; i < cur_p->size; ++i) {
         for (size_t j = 0; j < cur_p->size; ++j) {
@@ -1729,30 +1735,44 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
                     cur_p->data[i].logit = -INFINITY;
                     cur_p->data[i].p     = 0.0f;
                 }
+
+                n_combined++;
             }
         }
     }
 
-    const auto size_org = cur_p->size;
+    size_t n_non_eog = 0;
 
-    cur_p->size = 0;
+    size_t size_org = cur_p->size;
 
     float p_sum = 0.0f;
+    float thold = 0.2f;
+
+    cur_p->size = 0;
+
+    LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
 
     for (size_t i = 0; i < size_org; ++i) {
-        // discard non-EOG tokens with prob < 0.2
-        if (cur_p->data[i].p < 0.2 && !llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+        const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+
+        if (cur_p->data[i].p < thold && !is_eog) {
             continue;
         }
 
-        // keep this token
+        if (!is_eog) {
+            ++n_non_eog;
+        }
+
         p_sum += cur_p->data[i].p;
 
+        // keep this token
         cur_p->data[cur_p->size++] = cur_p->data[i];
     }
 
-    // if all probs are -INFINITY -> reduce cur_p to single EOG token
-    if (cur_p->size == 0) {
+    LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
+
+    // if no non-EOG tokens are left -> reduce cur_p to single EOT token
+    if (n_non_eog == 0) {
         cur_p->size = 1;
         cur_p->data[0].id    = llama_token_eot_impl(*ctx->vocab);
         cur_p->data[0].logit = 1.0f;
@@ -1764,8 +1784,37 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     for (size_t i = 0; i < cur_p->size; ++i) {
         cur_p->data[i].p /= p_sum;
 
-        LLAMA_LOG_DEBUG("after: cur_p[%zu] = { id: %d, p: %f, logit: %f }\n", i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
+
+    size_org = cur_p->size;
+    p_sum = 0.0f;
+    thold = 1.0/(n_non_eog + 1);
+
+    cur_p->size = 0;
+
+    LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
+
+    for (size_t i = 0; i < size_org; ++i) {
+        const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+
+        if (cur_p->data[i].p < thold && !is_eog) {
+            continue;
+        }
+
+        p_sum += cur_p->data[i].p;
+
+        cur_p->data[cur_p->size++] = cur_p->data[i];
+    }
+
+    // normalize probs
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= p_sum;
+
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+
+#undef LOG_DBG_CUR
 }
 
 static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
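
Taken together, the additions turn the tail filtering into two passes: a first pass with a fixed threshold of 0.2 that also counts the surviving non-EOG tokens, and a second pass with an adaptive threshold of 1/(n_non_eog + 1), with the probabilities renormalized after each pass. The fewer non-EOG tokens survive the first pass, the stricter the second cut becomes. A rough standalone sketch of that flow on a toy candidate list, assuming illustrative Cand/filter_and_norm names rather than the actual llama.cpp types:

#include <cstdio>
#include <utility>
#include <vector>

struct Cand { int id; float p; bool is_eog; };   // stand-in for llama_token_data

// keep candidates with p >= thold (EOG tokens are always kept), then renormalize
static size_t filter_and_norm(std::vector<Cand> & cands, float thold) {
    size_t n_non_eog = 0;
    float  p_sum     = 0.0f;

    std::vector<Cand> kept;
    for (const auto & c : cands) {
        if (c.p < thold && !c.is_eog) {
            continue;
        }
        if (!c.is_eog) {
            ++n_non_eog;
        }
        p_sum += c.p;
        kept.push_back(c);
    }

    for (auto & c : kept) {
        c.p /= p_sum;                            // renormalize survivors
    }
    cands = std::move(kept);

    return n_non_eog;
}

int main() {
    std::vector<Cand> cands = {
        {10, 0.45f, false}, {11, 0.30f, false}, {12, 0.15f, false}, {2, 0.10f, true /*EOG*/},
    };

    // pass 1: fixed threshold, mirrors thold = 0.2f in the sampler
    const size_t n_non_eog = filter_and_norm(cands, 0.2f);

    // pass 2: adaptive threshold, mirrors thold = 1.0/(n_non_eog + 1)
    filter_and_norm(cands, 1.0f/(n_non_eog + 1));

    for (const auto & c : cands) {
        printf("id = %2d, p = %.3f, eog = %d\n", c.id, c.p, c.is_eog);
    }
    return 0;
}
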