@@ -163,7 +163,7 @@ struct server_slot {
     int32_t n_prompt_tokens           = 0;
     int32_t n_prompt_tokens_processed = 0;
 
-    json prompt; // can be either a string, array of strings or array of token ids
+    json prompt; // can be either a string, array of strings, array of token ids, or mixed array of strings and token ids
 
     // when a task is submitted, we first tokenize the prompt and store it here
     std::vector<llama_token> prompt_tokens;
@@ -975,16 +975,15 @@ struct server_context {
             }
 
             if ((prompt->is_string()) ||
-                (prompt->is_array() &&  prompt->size() == 1 && prompt->at(0).is_string()) ||
-                (prompt->is_array() && !prompt->empty()     && prompt->at(0).is_number_integer())) {
+                (prompt->is_array() && !prompt->empty() && (prompt->at(0).is_string() || prompt->at(0).is_number_integer()))) {
                 slot.prompt = *prompt;
             } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) {
                 slot.prompt = prompt->at(0);
             } else if (prompt->is_array() && prompt->size() > 1) {
                 // array of strings
                 for (const auto & el : *prompt) {
                     if (!el.is_string()) {
-                        send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST);
+                        send_error(task, "\"prompt\" must be a string, an array of strings, an array of integers, or a mixed array of strings and integers", ERROR_TYPE_INVALID_REQUEST);
                         return false;
                     }
                 }
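As a concrete illustration of the prompt shapes named in the updated `slot.prompt` comment, the values below are made up for this note (they are not part of the patch); `json` is the nlohmann::json alias server.cpp already uses:

```cpp
// Illustrative prompt values only; not from the patch itself.
json p1 = "Once upon a time";                    // string
json p2 = { 12, 345, 678 };                      // array of token ids
json p3 = { "Once upon", " a time" };            // array of strings
json p4 = { "Once upon", 345, " a time", 678 };  // mixed array of strings and token ids (newly allowed)
```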
@@ -1062,18 +1061,10 @@ struct server_context {
         }
 
         {
-            // These lines seem to force the clearing of sampler data between generations:
-
-            // if (slot.smpl != nullptr) {
-            //     gpt_sampler_free(slot.smpl);
-            // }
-            // slot.smpl = gpt_sampler_init(model, slot.sparams);
-
-            // Changed it to this so data could be maintained between generations:
-
-            if (slot.smpl == nullptr) {
-                slot.smpl = gpt_sampler_init(model, slot.sparams);
+            if (slot.smpl != nullptr) {
+                gpt_sampler_free(slot.smpl);
             }
+            slot.smpl = gpt_sampler_init(model, slot.sparams, slot.n_ctx);
 
             if (slot.smpl == nullptr) {
                 // for now, the only error that may happen here is invalid grammar
@@ -1518,24 +1509,25 @@ struct server_context {
             throw std::runtime_error(error_msg);
         }
         json prompt = data.at("prompt");
-        // if the prompt is a singleton (i.e. a string, a list of tokens, or a mixed array of strings and tokens), we only need to create a single task
-        if (prompt.is_string() || (prompt.is_array() && !prompt.empty() && !prompt[0].is_array())) {
-            bool is_mixed = false;
-            bool has_string = prompt.is_string();
+
+        auto is_valid_singleton_array = [](const json & arr) {
             bool has_number = false;
-            if (prompt.is_array()) {
-                for (const auto & elem : prompt) {
-                    if (elem.is_string()) has_string = true;
-                    else if (elem.is_number()) has_number = true;
-                    if (has_string && has_number) {
-                        is_mixed = true;
-                        break;
-                    }
+            for (const auto & elem : arr) {
+                if (elem.is_number()) {
+                    has_number = true;
+                } else if (!elem.is_string()) {
+                    return false;
                 }
             }
+            return has_number;
+        };
+
+        bool is_singleton = prompt.is_string() || (prompt.is_array() && is_valid_singleton_array(prompt));
+
+        // if the prompt is a singleton (i.e. a string, a list of tokens, or a mixed array of strings and tokens), we only need to create a single task
+        if (is_singleton) {
             data["index"] = 0;
             create_task(data, false, nullptr);
-            SRV_DBG("creating single%s prompt task\n", is_mixed ? " mixed" : "");
         }
         // otherwise, it's a multiple-prompt task or a rerank task, we break it into smaller tasks
         else if (prompt.is_array()) {
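To make the new classification concrete, here is a standalone sketch of `is_valid_singleton_array` with the same body as the lambda above; the `main` and the test values are mine, not part of the patch. An array is treated as a singleton prompt only if every element is a string or a number and at least one element is a number, so a plain array of strings still falls through to the multi-prompt branch.

```cpp
// Standalone sketch of the singleton check above (test values invented for illustration).
#include <nlohmann/json.hpp>
#include <cassert>
using json = nlohmann::json;

static bool is_valid_singleton_array(const json & arr) {
    bool has_number = false;
    for (const auto & elem : arr) {
        if (elem.is_number()) {
            has_number = true;
        } else if (!elem.is_string()) {
            return false;   // anything other than strings/numbers disqualifies the array
        }
    }
    return has_number;      // at least one token id is required
}

int main() {
    assert( is_valid_singleton_array(json::array({12, 345, 678})));                  // pure token ids -> single task
    assert( is_valid_singleton_array(json::array({"Once upon", 345, " a time"})));   // mixed -> single task
    assert(!is_valid_singleton_array(json::array({"Once upon", " a time"})));        // strings only -> multi-prompt path
    assert(!is_valid_singleton_array(json::array({"a", json::array({1, 2})})));      // nested array -> multi-prompt path
    return 0;
}
```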
@@ -2154,7 +2146,8 @@ struct server_context {
                                 GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                             }
 
-                            // gpt_sampler_reset(slot.smpl);                     // This line is likely preventing sampler state from being maintained from generation to generation
+                            // Should this be (re-)moved?
+                            gpt_sampler_reset(slot.smpl);
 
                             if (!slot.params.cache_prompt) {
                                 slot.n_past_se = 0;
@@ -2165,10 +2158,13 @@ struct server_context {
                                 // reuse any previously computed tokens that are common with the new prompt
                                 slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
 
+                                // Not sure whether the for loop below should run in more than one place, but for now I moved it
+                                // to after the entire prompt is processed so that sampling happens consistently.
+
                                 // push the prompt into the sampling context (do not apply grammar)
-                                for (int i = 0; i < slot.n_past; ++i) {
-                                    gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
-                                }
+                                //  for (int i = 0; i < slot.n_past; ++i) {
+                                //      gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
+                                //  }
                             }
                         }
 
@@ -2264,6 +2260,11 @@ struct server_context {
 
                         GGML_ASSERT(batch.n_tokens > 0);
 
+                        // Process all prompt tokens through the sampler system
+                        for (int i = 0; i < slot.n_prompt_tokens; ++i) {
+                            gpt_sampler_accept(slot.smpl, prompt_tokens[i], false);
+                        }
+
                         // extract the logits only for the last token
                         batch.logits[batch.n_tokens - 1] = true;
 
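Taking the sampler-related hunks together, this is the per-slot lifecycle as I read the combined changes. It is a sketch only: `gpt_sampler_sample` and the generation-side call sites are assumptions based on the surrounding server code, not shown in this diff.

```cpp
// Sketch of the sampler lifecycle per slot after these changes (my reading, not code from the patch):
//
// slot launch (the hunk around old line 1062):
//     if (slot.smpl != nullptr) gpt_sampler_free(slot.smpl);        // always start from a fresh sampler
//     slot.smpl = gpt_sampler_init(model, slot.sparams, slot.n_ctx);
//
// prompt processing:
//     gpt_sampler_reset(slot.smpl);                                 // the call flagged "Should this be (re-)moved?"
//     for (int i = 0; i < slot.n_prompt_tokens; ++i) {
//         gpt_sampler_accept(slot.smpl, prompt_tokens[i], false);   // penalties see the prompt; grammar is not advanced
//     }
//
// generation (assumed, outside this diff):
//     const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch);
//     gpt_sampler_accept(slot.smpl, id, true);                      // grammar advances only on sampled tokens
```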