Initial commit

Silver267 · Silver267 · commit 2669c158310f · 2025-05-10T16:20:29.000-04:00
diff --git a/common/common.h b/common/common.h
@@ -139,6 +139,8 @@ struct common_params_sampling {
     float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
     float   dynatemp_range     = 0.00f; // 0.0 = disabled
     float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    float   smoothing_factor   = 0.0f;  // 0.0 = disabled; scales the logit penalty applied by smooth (quadratic) sampling
+    float   smoothing_curve    = 1.0f;  // 1.0 = purely quadratic; other values mix a cubic term into smooth sampling
     int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float   penalty_repeat     = 1.00f; // 1.0 = disabled
     float   penalty_freq       = 0.00f; // 0.0 = disabled
diff --git a/common/sampling.cpp b/common/sampling.cpp
@@ -262,7 +262,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent, params.smoothing_factor, params.smoothing_curve));
                     break;
                 case COMMON_SAMPLER_TYPE_INFILL:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
diff --git a/include/llama.h b/include/llama.h
@@ -1250,8 +1250,8 @@ extern "C" {
     /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
     LLAMA_API struct llama_sampler * llama_sampler_init_temp       (float   t);
 
-    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
-    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext   (float   t, float   delta, float exponent);
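+    /// @details Dynamic temperature (a.k.a. entropy, described in the paper https://arxiv.org/abs/2309.02772) and smooth (quadratic) sampling wrapped into one sampler. Smoothing is applied only when smoothing_factor > 0; no research paper is available for smooth sampling.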
+    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext   (float   t, float   delta, float exponent, float   smoothing_factor, float   smoothing_curve);
 
     /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
     LLAMA_API struct llama_sampler * llama_sampler_init_xtc        (float   p, float   t,     size_t min_keep, uint32_t seed);
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
@@ -1005,6 +1005,8 @@ struct llama_sampler_temp_ext {
     const float temp;
     const float delta;
     const float exponent;
+    const float smoothing_factor;
+    const float smoothing_curve;
 };
 
 static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*smpl*/) {
@@ -1013,17 +1015,33 @@ static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*s
 
 static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
+
+    // no need to do anything if there is only one (or zero) candidates
+    if (cur_p->size <= 1) {
+        return;
+    }
+
+    // Apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
+    if (ctx->smoothing_factor > 0.0f) {
+        llama_sampler_softmax_impl(cur_p);
+        float h = cur_p->data[0].logit; // logit of the first candidate, used as the maximum (assumes the softmax call above leaves candidates sorted in descending order); added back after the transformation
+
+        // Apply the modified quadratic transformation using the smoothing_factor and smoothing_curve
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            float logit_shifted = cur_p->data[i].logit - h;
+            float k = (3 - ctx->smoothing_curve) / 2;
+            float s = (ctx->smoothing_curve - 1) / 2;
+            cur_p->data[i].logit = -(k * ctx->smoothing_factor * logit_shifted * logit_shifted) + (s * ctx->smoothing_factor * logit_shifted * logit_shifted * logit_shifted) + h;
+        }
+        llama_sampler_softmax_impl(cur_p);
+    }
+
     if (ctx->delta > 0) {
         const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
         const float max_temp = ctx->temp + ctx->delta;
 
         float exponent_val = ctx->exponent;
 
-        // no need to do anything if there is only one (or zero) candidates
-        if (cur_p->size <= 1) {
-            return;
-        }
-
         // Calculate maximum possible entropy
         float max_entropy = -logf(1.0f / cur_p->size);
 
@@ -1084,7 +1102,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
 
 static struct llama_sampler * llama_sampler_temp_ext_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx;
-    return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent);
+    return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent, ctx->smoothing_factor, ctx->smoothing_curve);
 }
 
 static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) {
@@ -1100,13 +1118,15 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
     /* .free   = */ llama_sampler_temp_ext_free,
 };
 
-struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
+struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent, float smoothing_factor, float smoothing_curve) {
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_ext_i,
         /* .ctx   = */ new llama_sampler_temp_ext {
             /* .temp     = */ temp,
             /* .delta    = */ delta,
             /* .exponent = */ exponent,
+            /* .smoothing_factor = */ smoothing_factor,
+            /* .smoothing_curve = */ smoothing_curve
         }
     );
 }
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
@@ -72,11 +72,11 @@ static void test_temp(const std::vector<float> & probs, const std::vector<float>
     tester.check();
 }
 
-static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent) {
+static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent, float smoothing_factor, float smoothing_curve) {
     sampler_tester tester(probs, probs_expected);
 
     DUMP(&tester.cur_p);
-    tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent));
+    tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent, smoothing_factor, smoothing_curve));
     tester.apply(llama_sampler_init_dist (0));
     DUMP(&tester.cur_p);
 
@@ -311,8 +311,11 @@ int main(void) {
     test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
     test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f);
 
-    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f);
-    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f, 0.0f, 1.0f);
+
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.372382f, 0.342804f, 0.230319f, 0.054495f}, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.368339f, 0.349226f, 0.245247f, 0.037188f}, 1.0f, 0.0f, 1.0f, 1.0f, 2.0f);
 
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -144,6 +144,8 @@ struct slot_params {
             {"temperature",               sampling.temp},
             {"dynatemp_range",            sampling.dynatemp_range},
             {"dynatemp_exponent",         sampling.dynatemp_exponent},
+            {"smoothing_factor",          sampling.smoothing_factor},
+            {"smoothing_curve",           sampling.smoothing_curve},
             {"top_k",                     sampling.top_k},
             {"top_p",                     sampling.top_p},
             {"min_p",                     sampling.min_p},
@@ -257,6 +259,8 @@ struct server_task {
         params.sampling.temp               = json_value(data, "temperature",        defaults.sampling.temp);
         params.sampling.dynatemp_range     = json_value(data, "dynatemp_range",     defaults.sampling.dynatemp_range);
         params.sampling.dynatemp_exponent  = json_value(data, "dynatemp_exponent",  defaults.sampling.dynatemp_exponent);
+        params.sampling.smoothing_factor   = json_value(data, "smoothing_factor",   defaults.sampling.smoothing_factor);
+        params.sampling.smoothing_curve    = json_value(data, "smoothing_curve",    defaults.sampling.smoothing_curve);
         params.sampling.penalty_last_n     = json_value(data, "repeat_last_n",      defaults.sampling.penalty_last_n);
         params.sampling.penalty_repeat     = json_value(data, "repeat_penalty",     defaults.sampling.penalty_repeat);
         params.sampling.penalty_freq       = json_value(data, "frequency_penalty",  defaults.sampling.penalty_freq);