Skip to content

Commit a50603d

Browse files
wwoodsTM and pi6am committed
DRY: WIP, Merged latest master, adding pi6am as co-author
Co-authored-by: pi6am <[email protected]>
1 parent 0b97d65 commit a50603d

File tree

2 files changed

+32
-10
lines changed

2 files changed

+32
-10
lines changed

examples/server/server.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2163,16 +2163,19 @@ struct server_context {
21632163
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
21642164
}
21652165

2166+
// Should this be (re-)moved?
21662167
common_sampler_reset(slot.smpl);
21672168

21682169
if (slot.params.cache_prompt) {
21692170
// reuse any previously computed tokens that are common with the new prompt
21702171
slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
2172+
// Not sure if the for loop below should happen in multiple places but for now I moved it
2173+
// until after the entire prompt is processed so that sampling would happen consistently.
21712174

21722175
// push the prompt into the sampling context (do not apply grammar)
2173-
for (int i = 0; i < slot.n_past; ++i) {
2174-
common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
2175-
}
2176+
// for (int i = 0; i < slot.n_past; ++i) {
2177+
// common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
2178+
// }
21762179

21772180
// reuse chunks from the cached prompt by shifting their KV cache in the new position
21782181
if (params.n_cache_reuse > 0) {
@@ -2206,7 +2209,7 @@ struct server_context {
22062209
for (size_t i = 0; i < n_match; i++) {
22072210
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
22082211

2209-
common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
2212+
//common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
22102213

22112214
slot.n_past++;
22122215
}
@@ -2288,6 +2291,11 @@ struct server_context {
22882291

22892292
GGML_ASSERT(batch.n_tokens > 0);
22902293

2294+
// Process all prompt tokens through sampler system
2295+
for (int i = 0; i < slot.n_prompt_tokens; ++i) {
2296+
common_sampler_accept(slot.smpl, prompt_tokens[i], false);
2297+
}
2298+
22912299
// extract the logits only for the last token
22922300
batch.logits[batch.n_tokens - 1] = true;
22932301

src/llama-sampling.cpp

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1781,7 +1781,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
17811781
break;
17821782
}
17831783
}
1784-
if (rep_limit <= ctx->dry_allowed_length) {
1784+
if (rep_limit < ctx->dry_allowed_length) {
17851785
return;
17861786
}
17871787

@@ -1845,12 +1845,26 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
18451845
for (size_t i = 0; i < cur_p->size; ++i) {
18461846
const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id);
18471847
if (af_kvp != ctx->dry_max_token_repeat.end()) {
1848-
int repeat_exp = af_kvp->second - ctx->dry_allowed_length;
1849-
if (max_exponent > 0 && repeat_exp > max_exponent) {
1850-
repeat_exp = max_exponent;
1848+
// Check all sequence breakers starting with this token
1849+
auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id);
1850+
bool is_single_token_breaker = false;
1851+
1852+
for (auto it = range.first; it != range.second; ++it) {
1853+
if (it->second.empty()) {
1854+
is_single_token_breaker = true;
1855+
break;
1856+
}
1857+
}
1858+
1859+
// Apply penalty only if it's not a single-token sequence breaker
1860+
if (!is_single_token_breaker) {
1861+
int repeat_exp = af_kvp->second - ctx->dry_allowed_length;
1862+
if (max_exponent > 0 && repeat_exp > max_exponent) {
1863+
repeat_exp = max_exponent;
1864+
}
1865+
float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp);
1866+
cur_p->data[i].logit -= penalty;
18511867
}
1852-
float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp);
1853-
cur_p->data[i].logit -= penalty;
18541868
}
18551869
}
18561870

0 commit comments

Comments
 (0)