DRY: Cleaned up server sampling fix in preparation for rebase with separate PR

wwoodsTM · wwoodsTM · commit faff4b1f2501 · 2024-10-23T11:25:57.000-06:00
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -2192,19 +2192,9 @@ struct server_context {
                                 GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                             }
 
-                            // Should this be (re-)moved?
-                            //common_sampler_reset(slot.smpl);
-
                             if (slot.params.cache_prompt) {
                                 // reuse any previously computed tokens that are common with the new prompt
                                 slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
-                                // Not sure if the for loop below should happen in multiple places but for now I moved it
-                                // until after the entire prompt is processed so that sampling would happen consistently.
-
-                                // push the prompt into the sampling context (do not apply grammar)
-                                // for (int i = 0; i < slot.n_past; ++i) {
-                                //     common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
-                                // }
 
                                 // reuse chunks from the cached prompt by shifting their KV cache in the new position
                                 if (params.n_cache_reuse > 0) {
@@ -2238,8 +2228,6 @@ struct server_context {
                                             for (size_t i = 0; i < n_match; i++) {
                                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
 
-                                                //common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
-
                                                 slot.n_past++;
                                             }
 
@@ -2291,8 +2279,6 @@ struct server_context {
 
                         // there is no common part left
                         slot.n_past = 0;
-
-                        //common_sampler_reset(slot.smpl);
                     }
 
                     SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);