Skip to content

Commit 0f2bf55

Browse files
committed
speculative : do not discard the last drafted token
1 parent 965ad1c commit 0f2bf55

File tree

2 files changed: +6 additions, −6 deletions

common/speculative.cpp

Lines changed: 5 additions & 5 deletions
```diff
@@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;

-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);

         result.push_back(id);
@@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }

+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

         // evaluate the drafted tokens on the draft model
```

examples/server/server.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -274,7 +274,7 @@ struct server_task {
         params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);

         params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min, 2);
+        params.speculative.n_min = std::max(params.speculative.n_min, 0);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);

         // Use OpenAI API logprobs only if n_probs wasn't provided
```

0 commit comments

Comments (0)