@@ -2163,16 +2163,19 @@ struct server_context {
21632163 GGML_ASSERT (slot.n_prompt_tokens < slot.n_ctx );
21642164 }
21652165
2166+ // TODO(review): decide whether this reset should be moved or removed, now that
2166+ // sampler acceptance is deferred until after the full prompt is processed.
21662167 common_sampler_reset (slot.smpl );
21672168
21682169 if (slot.params .cache_prompt ) {
21692170 // reuse any previously computed tokens that are common with the new prompt
21702171 slot.n_past = longest_common_prefix (slot.cache_tokens , prompt_tokens);
2172+ // The sampler-accept loop below used to run here (and again during cache reuse).
2173+ // It was moved to run once, after the entire prompt is processed, so that sampling is applied consistently.
21712174
21722175 // push the prompt into the sampling context (do not apply grammar)
2173- for (int i = 0 ; i < slot.n_past ; ++i) {
2174- common_sampler_accept (slot.smpl , slot.cache_tokens [i], false );
2175- }
2176+ // for (int i = 0; i < slot.n_past; ++i) {
2177+ // common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
2178+ // }
21762179
21772180 // reuse chunks from the cached prompt by shifting their KV cache in the new position
21782181 if (params.n_cache_reuse > 0 ) {
@@ -2206,7 +2209,7 @@ struct server_context {
22062209 for (size_t i = 0 ; i < n_match; i++) {
22072210 slot.cache_tokens [head_p + i] = slot.cache_tokens [head_c + i];
22082211
2209- common_sampler_accept (slot.smpl , slot.cache_tokens [head_p + i], false );
2212+ // common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
22102213
22112214 slot.n_past ++;
22122215 }
@@ -2288,6 +2291,11 @@ struct server_context {
22882291
22892292 GGML_ASSERT (batch.n_tokens > 0 );
22902293
2294+ // Process all prompt tokens through sampler system
2295+ for (int i = 0 ; i < slot.n_prompt_tokens ; ++i) {
2296+ common_sampler_accept (slot.smpl , prompt_tokens[i], false );
2297+ }
2298+
22912299 // extract the logits only for the last token
22922300 batch.logits [batch.n_tokens - 1 ] = true ;
22932301
0 commit comments