Skip to content

Commit 0180f53

Browse files
committed
more xxxx_penalty options
1 parent af1f95b commit 0180f53

File tree

3 files changed

+133
-41
lines changed

3 files changed

+133
-41
lines changed

src/chat.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,9 +791,12 @@ namespace chatllm
791791
bool do_sample;
792792
bool reversed_role;
793793
int top_k;
794+
int penalty_window;
794795
float top_p;
795796
float temperature;
796797
float presence_penalty;
798+
float repeat_penalty;
799+
float frequency_penalty;
797800
float tfs_z;
798801
std::string sampling;
799802
std::string ai_prefix;

src/main.cpp

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ struct Args
6666
float top_p = 0.7f;
6767
float temp = 0.7f;
6868
float tfs_z = 0.95f;
69-
float presence_penalty = 1.0f;
69+
float presence_penalty = 0.0f;
70+
float repeat_penalty = 1.0f;
71+
float frequency_penalty = 0.0f;
7072
int num_threads = 0;
7173
bool multi_line = false;
7274
int seed;
@@ -90,6 +92,7 @@ struct Args
9092
bool moe_on_cpu = false;
9193
int batch_size = 4096;
9294
bool detect_thoughts = false;
95+
int penalty_window = 256;
9396
};
9497

9598
#define MULTI_LINE_END_MARKER_W L"\\."
@@ -153,6 +156,7 @@ bool is_same_command_option(const std::string &a, const std::string &b)
153156

154157
void usage(const std::string &prog)
155158
{
159+
Args args;
156160
std::cout << "Usage: " << prog << " [options]\n"
157161
<< "\n"
158162
<< "Basic options:\n"
@@ -176,7 +180,7 @@ void usage(const std::string &prog)
176180
<< " --layer_spec 0:3,1:4 (3 + 3 = 6 layers are selected, layer #1/2 are used twice)\n"
177181
<< " layer structure: 0->1->2->1->2->3\n"
178182
<< " -c, --max_context_length N\n"
179-
<< " max context length (default: 512)\n"
183+
<< " max context length (default: " << args.max_context_length << ")\n"
180184
<< " --extending EXT context extending method (EXT = restart | shift | none)\n"
181185
<< " (default: none if `--load_session` is specified, otherwise restart)\n"
182186
<< " --multi enabled multiple lines of input [*]\n"
@@ -197,18 +201,21 @@ void usage(const std::string &prog)
197201
<< " --rpc_endpoints EP.. RPC endpoints (i.e. servers) for distributed inference (default: empty)\n"
198202
<< " EP1;EP2, where EP ::= host:port\n"
199203
<< " --cache_dtype T cache data type, T ::= f32 | f16 (default: f16)\n"
200-
<< " --batch_size N batch size (default: 4096)\n"
204+
<< " --batch_size N batch size (default: " << args.batch_size << ")\n"
201205
<< " note: trade-off between prompt throughput and memory usage.\n"
202206
<< " --re_quantize Q re-quantize model weights during loading (Q ::= q8_0 | q4_0 | q4_1 | q4_k | ...) (default: no re-quantization)\n"
203207
<< " note: it does not make sense to re-quantize to a larger size.\n"
204208
<< "Sampling options:\n"
205209
<< " --sampling ALG sampling algorithm (ALG = greedy | top_p | tfs) (default: top_p) \n"
206210
<< " where, tfs = Tail Free Sampling\n"
207-
<< " -t, --temp T temperature (default: 0.7) (Note: `-t 0` also sets sampling algorithm to greedy)\n"
208-
<< " --top_k N top-k sampling (default: 20)\n"
209-
<< " --top_p N top-p sampling (default: 0.7)\n"
210-
<< " --tfs_z Z Z param for TFS (default: 0.95)\n"
211-
<< " --presence_penalty N presence repetition penalty (default: 1.0, no penalty)\n"
211+
<< " -t, --temp T temperature (default: " << args.temp << ") (Note: `-t 0` also sets sampling algorithm to greedy)\n"
212+
<< " --top_k N top-k sampling (default: " << args.top_k << ")\n"
213+
<< " --top_p N top-p sampling (default: " << args.top_p << ")\n"
214+
<< " --tfs_z Z Z param for TFS (default: " << args.tfs_z << ")\n"
215+
<< " --repeat_penalty N repetition penalty (default: " << args.repeat_penalty << ", 1.0=no penalty)\n"
216+
<< " --presence_penalty N penalty alpha for presence (default: " << args.presence_penalty << ", 0.0=disabled)\n"
217+
<< " --frequency_penalty N penalty alpha for probability (default: " << args.frequency_penalty << ", 0.0=disabled)\n"
218+
<< " --penalty_window N last N tokens to consider for penalize (default: " << args.penalty_window << ", 0=disable all)\n"
212219
<< " --seed N seed for random generator (default: random)\n"
213220
<< " --beam_size N beam size for generation (default: -1, disabled)\n"
214221
<< " functionality of beam search limited.\n"
@@ -465,6 +472,9 @@ static size_t parse_args(Args &args, const std::vector<std::string> &argv)
465472
handle_para0("--tfs_z", tfs_z, std::stof)
466473
handle_param("--temp", "-t", temp, std::stof)
467474
handle_para0("--presence_penalty", presence_penalty, std::stof)
475+
handle_para0("--repeat_penalty", repeat_penalty, std::stof)
476+
handle_para0("--frequency_penalty", frequency_penalty, std::stof)
477+
handle_para0("--penalty_window", penalty_window, std::stoi)
468478
handle_param("--threads", "-n", num_threads, std::stoi)
469479
handle_para0("--seed", seed, std::stoi)
470480
handle_para0("--test", test_fn, std::string)
@@ -852,7 +862,10 @@ static void run_qa_ranker(Args &args, chatllm::Pipeline &pipeline, TextStreamer
852862
// Declares a chatllm::GenerationConfig named `gen_config`, populated from `args`.
// Note: `args.temp > 0` doubles as the do_sample flag (temp == 0 selects greedy).
#define DEF_GenerationConfig(gen_config, args)                                                    \
    chatllm::GenerationConfig gen_config(args.max_length, args.max_context_length, args.temp > 0, \
                                         args.reversed_role, args.top_k, args.top_p, args.temp,   \
                                         args.num_threads, args.sampling, args.presence_penalty,  \
                                         args.tfs_z);                                             \
    gen_config.set_ai_prefix(args.ai_prefix);                                                     \
    gen_config.dump_dot           = args.dump_dot;                                                \
    gen_config.emb_rank_query_sep = args.emb_rank_query_sep;                                      \
    gen_config.repeat_penalty     = args.repeat_penalty;                                          \
    gen_config.frequency_penalty  = args.frequency_penalty;                                       \
    gen_config.penalty_window     = args.penalty_window;
856869

857870
#define DEF_ExtraArgs(pipe_args, args) \
858871
chatllm::ModelObject::extra_args pipe_args(args.max_length, args.layer_spec, args.moe_on_cpu, args.num_threads, args.batch_size, args.cache_dtype, args.re_quantize);\

src/models.cpp

Lines changed: 108 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -721,20 +721,113 @@ namespace chatllm
721721
}
722722
}
723723

724+
class LogitsPenalty
725+
{
726+
public:
727+
LogitsPenalty()
728+
: repeat_penalty_en(false),
729+
freq_penalty_en(false),
730+
inv_repeat_penalty(0.0f), repeat_penalty(0.0f), freq_penalty(0.0f), presence_penalty(0.0f)
731+
{}
732+
733+
LogitsPenalty(const GenerationConfig &gen_config)
734+
: repeat_penalty_en((gen_config.penalty_window > 0) && (gen_config.repeat_penalty != 1.0f) && (gen_config.repeat_penalty > 0.0f)),
735+
freq_penalty_en((gen_config.penalty_window > 0) && (gen_config.frequency_penalty != 0.0f) || (gen_config.presence_penalty != 0.0f)),
736+
inv_repeat_penalty(repeat_penalty_en ? 1 / gen_config.repeat_penalty : 0.0f),
737+
repeat_penalty(gen_config.repeat_penalty),
738+
freq_penalty(freq_penalty_en ? gen_config.frequency_penalty / gen_config.penalty_window : 0.0f),
739+
presence_penalty(gen_config.presence_penalty)
740+
{
741+
if (gen_config.penalty_window > 0)
742+
{
743+
token_history.resize(gen_config.penalty_window);
744+
}
745+
reset();
746+
}
747+
748+
virtual void skip_this(int token_id)
749+
{
750+
skip_tokens.emplace(token_id);
751+
}
752+
753+
virtual void reset()
754+
{
755+
for (size_t i = 0; i < token_history.size(); i++)
756+
token_history[i] = -1;
757+
hist_write = 0;
758+
memset(token_count.data(), 0, token_count.size() * sizeof(token_count[0]));
759+
}
760+
761+
virtual void accept_choice(int token_id)
762+
{
763+
if (token_history.size() < 1) return;
764+
int id = token_history[hist_write];
765+
if ((0 <= id) && (id < (int)token_count.size()))
766+
token_count[id]--;
767+
token_history[hist_write++] = token_id;
768+
if (hist_write >= token_history.size()) hist_write = 0;
769+
if ((0 <= token_id) && (token_id < (int)token_count.size()))
770+
token_count[token_id]++;
771+
}
772+
773+
virtual void process(float *logits, const int vocab_size)
774+
{
775+
if (token_history.size() < 1) return;
776+
777+
if (vocab_size != (int)token_count.size())
778+
{
779+
token_count.resize(vocab_size);
780+
}
781+
782+
for (int i = 0; i < vocab_size; i++)
783+
{
784+
if (repeat_penalty_en)
785+
{
786+
if (token_count[i] > 0)
787+
logits[i] *= logits[i] > 0 ? inv_repeat_penalty : repeat_penalty;
788+
}
789+
790+
if (freq_penalty_en)
791+
logits[i] -= float(token_count[i]) * freq_penalty + float(token_count[i] > 0) * presence_penalty;
792+
}
793+
}
794+
795+
protected:
796+
const bool repeat_penalty_en;
797+
const bool freq_penalty_en;
798+
const float inv_repeat_penalty;
799+
const float repeat_penalty;
800+
const float freq_penalty;
801+
const float presence_penalty;
802+
std::vector<int> token_history;
803+
std::vector<int> token_count;
804+
size_t hist_write;
805+
std::set<int> skip_tokens;
806+
};
807+
724808
class Sampler
725809
{
726810
public:
727811
static const int ABORT = -1;
812+
Sampler() : penalty() {}
728813

814+
Sampler(const GenerationConfig &gen_config)
815+
: penalty(gen_config)
816+
{}
729817
public:
730818
virtual void seed(int x)
731819
{
732820
gen.seed((unsigned int)x);
733821
}
734822

735-
virtual void reset() {}
823+
virtual void reset()
824+
{
825+
penalty.reset();
826+
}
736827

737828
virtual int sampling(float *logits, const int vocab_size) = 0;
829+
public:
830+
LogitsPenalty penalty;
738831
protected:
739832
std::mt19937 gen;
740833
};
@@ -751,40 +844,26 @@ namespace chatllm
751844
class NonGreedySampler: public Sampler
752845
{
753846
public:
754-
NonGreedySampler(float temperature, float presence_penalty, int top_k)
755-
: inv_temp(0.0f), inv_presence_penalty(0.0f), presence_penalty(presence_penalty), top_k(top_k)
847+
NonGreedySampler(const GenerationConfig &gen_config, float temperature, int top_k)
848+
: Sampler(gen_config),
849+
inv_temp(0.0f), top_k(top_k)
756850
{
757851
temp_en = fabs(temperature - 1.0f) > 1e-5f;
758852
if (temp_en) inv_temp = 1.f / temperature;
759-
760-
presence_penalty_en = fabs(presence_penalty - 1.0f) > 1e-5f;
761-
if (presence_penalty_en) inv_presence_penalty = 1.0f / presence_penalty;
762853
}
763854

764-
void reset() override
765-
{
766-
g.clear();
767-
}
768855

769856
int sampling(float *logits, const int vocab_size) override
770857
{
771-
g.resize(vocab_size, 0);
772-
token_scores.resize(vocab_size);
773-
774858
if (temp_en)
775859
{
776860
for (int i = 0; i < vocab_size; i++)
777861
logits[i] *= inv_temp;
778862
}
779863

780-
if (presence_penalty_en)
781-
{
782-
for (int i = 0; i < vocab_size; i++)
783-
{
784-
if (g[i] > 0)
785-
logits[i] *= logits[i] > 0 ? inv_presence_penalty : presence_penalty;
786-
}
787-
}
864+
penalty.process(logits, vocab_size);
865+
866+
token_scores.resize(vocab_size);
788867

789868
for (int i = 0; i < vocab_size; i++)
790869
{
@@ -813,7 +892,8 @@ namespace chatllm
813892
std::discrete_distribution<> dist(logits, logits + token_scores.size());
814893
int next_token_id = token_scores[dist(gen)].id;
815894

816-
g[next_token_id] += 1;
895+
penalty.accept_choice(next_token_id);
896+
817897
return next_token_id;
818898
}
819899

@@ -846,20 +926,16 @@ namespace chatllm
846926

847927
virtual void do_sampling(float *logits, const int vocab_size) = 0;
848928
bool temp_en;
849-
bool presence_penalty_en;
850929
float inv_temp;
851-
float inv_presence_penalty;
852-
float presence_penalty;
853930
int top_k;
854931
std::vector<TokenIdScore> token_scores;
855-
std::vector<int> g;
856932
};
857933

858934
class TopPSampler : public NonGreedySampler
859935
{
860936
public:
861-
TopPSampler(float temperature, float presence_penalty, int top_k, float top_p)
862-
: NonGreedySampler(temperature, presence_penalty, top_k), top_p(top_p)
937+
TopPSampler(const GenerationConfig &gen_config, float temperature, int top_k, float top_p)
938+
: NonGreedySampler(gen_config, temperature, top_k), top_p(top_p)
863939
{}
864940

865941
protected:
@@ -895,8 +971,8 @@ namespace chatllm
895971
class FreeTailSampler : public NonGreedySampler
896972
{
897973
public:
898-
FreeTailSampler(float temperature, float presence_penalty, int top_k, float z)
899-
: NonGreedySampler(temperature, presence_penalty, top_k), z(z)
974+
FreeTailSampler(const GenerationConfig &gen_config, float temperature, int top_k, float z)
975+
: NonGreedySampler(gen_config, temperature, top_k), z(z)
900976
{}
901977

902978
protected:
@@ -952,9 +1028,9 @@ namespace chatllm
9521028
if (gen_config.do_sample)
9531029
{
9541030
if (gen_config.sampling == "top_p")
955-
r = new TopPSampler(gen_config.temperature, gen_config.presence_penalty, gen_config.top_k, gen_config.top_p);
1031+
r = new TopPSampler(gen_config, gen_config.temperature, gen_config.top_k, gen_config.top_p);
9561032
else if (gen_config.sampling == "tfs")
957-
r = new FreeTailSampler(gen_config.temperature, gen_config.presence_penalty, gen_config.top_k, gen_config.tfs_z);
1033+
r = new FreeTailSampler(gen_config, gen_config.temperature, gen_config.top_k, gen_config.tfs_z);
9581034
else if (gen_config.sampling != "greedy")
9591035
CHATLLM_CHECK(false) << "unknown sampling algorithm: " << gen_config.sampling;
9601036
}

0 commit comments

Comments
 (0)