Commit 3470ee9

Merge branch 'ggml-org:master' into master
2 parents a037c51 + cd5e3b5 commit 3470ee9

13 files changed: +180 / -52 lines

common/chat.cpp

Lines changed: 17 additions & 2 deletions

@@ -313,7 +313,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
         }
         if (!msg.reasoning_content.empty()) {
             jmsg["reasoning_content"] = msg.reasoning_content;
-            jmsg["thinking"] = msg.reasoning_content; // gpt-oss
         }
         if (!msg.tool_name.empty()) {
             jmsg["name"] = msg.tool_name;

@@ -1810,7 +1809,23 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {

 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    auto prompt = apply(tmpl, inputs);
+
+    // Copy reasoning to the "thinking" field as expected by the gpt-oss template
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
+        auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
+
+        if (has_reasoning_content && has_tool_calls) {
+            auto adjusted_message = msg;
+            adjusted_message["thinking"] = msg.at("reasoning_content");
+            adjusted_messages.push_back(adjusted_message);
+        } else {
+            adjusted_messages.push_back(msg);
+        }
+    }
+
+    auto prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);

     // Check if we need to replace the return token with end token during
     // inference and without generation prompt. For more details see:
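
The change above moves the gpt-oss "thinking" handling out of the generic OAI-compat conversion and into the gpt-oss template path, applying it only to messages that carry both reasoning_content and tool_calls. A minimal standalone sketch of that adjustment, assuming nlohmann::json; the helper name, the main() driver, and the sample message are illustrative, not part of the commit:

    // Standalone sketch of the message adjustment done for the gpt-oss template.
    #include <nlohmann/json.hpp>
    #include <iostream>

    using json = nlohmann::ordered_json;

    // Copy "reasoning_content" into "thinking" for messages that also carry
    // tool calls; leave every other message untouched.
    static json adjust_messages_for_gpt_oss(const json & messages) {
        json adjusted = json::array();

        for (const auto & msg : messages) {
            const bool has_reasoning  = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
            const bool has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();

            if (has_reasoning && has_tool_calls) {
                json adjusted_msg = msg;
                adjusted_msg["thinking"] = msg.at("reasoning_content");
                adjusted.push_back(adjusted_msg);
            } else {
                adjusted.push_back(msg);
            }
        }

        return adjusted;
    }

    int main() {
        json tool_call;
        tool_call["name"] = "get_weather";

        json msg;
        msg["role"]              = "assistant";
        msg["reasoning_content"] = "I should call the weather tool.";
        msg["tool_calls"]        = json::array({ tool_call });

        // The adjusted copy gains a "thinking" field next to "reasoning_content".
        std::cout << adjust_messages_for_gpt_oss(json::array({ msg })).dump(2) << std::endl;
        return 0;
    }
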

include/llama.h

Lines changed: 4 additions & 3 deletions

@@ -461,7 +461,10 @@ extern "C" {
     LLAMA_API bool llama_supports_gpu_offload(void);
     LLAMA_API bool llama_supports_rpc        (void);

+    // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
+    //       In some cases the requested values via llama_context_params may differ from the actual values used by the context
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ctx_seq  (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);

@@ -585,7 +588,7 @@ extern "C" {
     LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);

     // Manually free a LoRA adapter
-    // Note: loaded adapters will be free when the associated model is deleted
+    // NOTE: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);

     // Get the invocation tokens if the current lora is an alora

@@ -1111,8 +1114,6 @@ extern "C" {
     //     // sample from the logits of the last token in the batch
     //     const llama_token id = llama_sampler_sample(smpl, ctx, -1);
     //
-    //     // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
-    //     llama_sampler_accept(smpl, id);
     //     ...
     // }
     //
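
The new NOTE says the values actually used by a context may differ from the ones requested via llama_context_params, so callers should query them after creation. A hedged usage sketch, assuming the llama_model_load_from_file / llama_init_from_model entry points and a placeholder model path:

    // Query the effective context values after creation, as the NOTE suggests.
    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
            return 1;
        }

        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file(argv[1], mparams);
        if (!model) {
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx     = 8192; // requested total context (example value)
        cparams.n_seq_max = 3;    // requested number of sequences (example value)

        llama_context * ctx = llama_init_from_model(model, cparams);
        if (!ctx) {
            llama_model_free(model);
            return 1;
        }

        // The actual values may have been adjusted (e.g. n_ctx rounded down so it
        // is divisible by n_seq_max), so query them instead of trusting cparams.
        printf("n_ctx     = %u\n", llama_n_ctx(ctx));
        printf("n_ctx_seq = %u\n", llama_n_ctx_seq(ctx));
        printf("n_seq_max = %u\n", llama_n_seq_max(ctx));

        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }
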

src/llama-context.cpp

Lines changed: 27 additions & 10 deletions

@@ -112,11 +112,24 @@ llama_context::llama_context(
         }
     }

-    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+    if (cparams.kv_unified) {
+        cparams.n_ctx_seq = cparams.n_ctx;
+    } else {
+        cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+
+        if (cparams.n_ctx_seq == 0) {
+            throw std::runtime_error("n_ctx_seq == 0");
+        }
+
+        if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
+            cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+            LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
+        }
+    }

     LLAMA_LOG_INFO("%s: n_seq_max     = %u\n",   __func__, cparams.n_seq_max);
     LLAMA_LOG_INFO("%s: n_ctx         = %u\n",   __func__, cparams.n_ctx);
-    LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n",   __func__, n_ctx_per_seq);
+    LLAMA_LOG_INFO("%s: n_ctx_seq     = %u\n",   __func__, cparams.n_ctx_seq);
     LLAMA_LOG_INFO("%s: n_batch       = %u\n",   __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: causal_attn   = %d\n",   __func__, cparams.causal_attn);

@@ -125,14 +138,14 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);

-    if (n_ctx_per_seq < hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
-                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    if (cparams.n_ctx_seq < hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
     }

-    if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
-                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    if (cparams.n_ctx_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
     }

     if (!hparams.vocab_only) {

@@ -453,8 +466,8 @@ uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }

-uint32_t llama_context::n_ctx_per_seq() const {
-    return cparams.n_ctx / cparams.n_seq_max;
+uint32_t llama_context::n_ctx_seq() const {
+    return cparams.n_ctx_seq;
 }

 uint32_t llama_context::n_batch() const {

@@ -2383,6 +2396,10 @@ uint32_t llama_n_ctx(const llama_context * ctx) {
    return ctx->n_ctx();
 }

+uint32_t llama_n_ctx_seq(const llama_context * ctx) {
+    return ctx->n_ctx_seq();
+}
+
 uint32_t llama_n_batch(const llama_context * ctx) {
     return ctx->n_batch();
 }
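
In the non-unified KV case the per-sequence context is now the integer quotient n_ctx / n_seq_max, and n_ctx is rounded down to a multiple of n_seq_max with a warning. A small standalone illustration of that arithmetic (the values are chosen for the example, not taken from the commit):

    // Illustration of the rounding performed above (standalone, not llama.cpp code).
    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t n_ctx     = 8192; // requested total context
        uint32_t n_seq_max = 3;    // number of parallel sequences

        uint32_t n_ctx_seq = n_ctx / n_seq_max;    // 2730
        if (n_ctx != n_ctx_seq * n_seq_max) {
            n_ctx = n_ctx_seq * n_seq_max;         // rounded down to 8190
        }

        printf("n_ctx_seq = %u, n_ctx = %u\n", n_ctx_seq, n_ctx);
        return 0;
    }
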

src/llama-context.h

Lines changed: 5 additions & 5 deletions

@@ -43,11 +43,11 @@ struct llama_context {

     ggml_backend_sched_t get_sched() const;

-    uint32_t n_ctx()         const;
-    uint32_t n_ctx_per_seq() const;
-    uint32_t n_batch()       const;
-    uint32_t n_ubatch()      const;
-    uint32_t n_seq_max()     const;
+    uint32_t n_ctx()     const;
+    uint32_t n_ctx_seq() const;
+    uint32_t n_batch()   const;
+    uint32_t n_ubatch()  const;
+    uint32_t n_seq_max() const;

     uint32_t n_threads() const;
     uint32_t n_threads_batch() const;

src/llama-cparams.h

Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@

 struct llama_cparams {
     uint32_t n_ctx;           // context size used during inference
+    uint32_t n_ctx_seq;       // context for a single sequence
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;

src/llama-model.cpp

Lines changed: 4 additions & 10 deletions

@@ -6712,14 +6712,14 @@ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) co
 }

 ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
-    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+    const uint32_t n_ctx_seq = cparams.n_ctx_seq;

     // choose long/short freq factors based on the context size
     if (layers[il].rope_freqs != nullptr) {
         return layers[il].rope_freqs;
     }

-    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+    if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
         return layers[il].rope_long;
     }

@@ -6795,12 +6795,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 /* filter_attn */ std::move(filter_attn),
                 /* filter_recr */ std::move(filter_recr));
     } else {
-        uint32_t n_ctx_per_stream = cparams.n_ctx;
-
-        if (!cparams.kv_unified) {
-            n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
-        }
-
         llama_memory_i::layer_reuse_cb reuse = nullptr;

         if (arch == LLM_ARCH_GEMMA3N) {

@@ -6824,7 +6818,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     cparams.offload_kqv,
                     params.swa_full,
                     cparams.kv_unified,
-                    n_ctx_per_stream,
+                    cparams.n_ctx_seq,
                     cparams.n_seq_max,
                     cparams.n_ubatch,
                     1,

@@ -6840,7 +6834,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     !cparams.flash_attn,
                     cparams.offload_kqv,
                     cparams.kv_unified,
-                    n_ctx_per_stream,
+                    cparams.n_ctx_seq,
                     cparams.n_seq_max,
                     1,
                     hparams.n_swa,
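
get_rope_factors() now keys the long/short YaRN frequency-factor choice off the centrally computed cparams.n_ctx_seq instead of re-deriving a per-stream value. A rough standalone illustration of the selection rule, with strings standing in for the ggml tensors:

    // Illustration only: the real code returns ggml tensors, not strings.
    #include <cstdint>
    #include <cstdio>

    static const char * pick_rope_factors(uint32_t n_ctx_seq, uint32_t n_ctx_orig_yarn) {
        // beyond the original YaRN context, prefer the "long" factors
        if (n_ctx_seq > n_ctx_orig_yarn) {
            return "rope_long";
        }
        return "rope_short";
    }

    int main() {
        printf("%s\n", pick_rope_factors(2730,   4096)); // rope_short
        printf("%s\n", pick_rope_factors(131072, 4096)); // rope_long
        return 0;
    }
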

tests/test-thread-safety.cpp

Lines changed: 8 additions & 1 deletion

@@ -131,7 +131,14 @@ int main(int argc, char ** argv) {
                 }

                 batch = llama_batch_get_one(&token, 1);
-                if (llama_decode(ctx.get(), batch)) {
+
+                int ret = llama_decode(ctx.get(), batch);
+                if (ret == 1 && i > 0) {
+                    LOG_INF("Context full, stopping generation.\n");
+                    break;
+                }
+
+                if (ret != 0) {
                     LOG_ERR("Model %d/%d, Context %d/%d: failed to decode\n", m + 1, num_models, c + 1, num_contexts);
                     failed.store(true);
                     return;
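
The test now inspects the llama_decode return value instead of treating any non-zero result as a failure: a return of 1 after at least one generated token is taken to mean the context is full, and generation stops cleanly. A hedged sketch of the same pattern as a helper; the helper itself is illustrative, and the return-code meanings follow the documentation in llama.h:

    // Sketch: distinguish "context full" from genuine decode failures.
    #include "llama.h"

    // Returns true if generation should continue, false if it should stop.
    static bool decode_one(llama_context * ctx, llama_token token, int n_generated) {
        llama_batch batch = llama_batch_get_one(&token, 1);

        const int ret = llama_decode(ctx, batch);

        if (ret == 1 && n_generated > 0) {
            // no KV slot available (context full) - stop gracefully, not an error
            return false;
        }
        if (ret != 0) {
            // genuine failure (invalid batch, abort, backend error, ...)
            return false;
        }
        return true;
    }
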

tools/server/server.cpp

Lines changed: 72 additions & 14 deletions

@@ -2407,7 +2407,7 @@ struct server_context {

             params_dft.devices      = params_base.speculative.devices;
             params_dft.model        = params_base.speculative.model;
-            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
+            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? llama_n_ctx_seq(ctx) : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
             params_dft.cache_type_k = params_base.speculative.cache_type_k;

@@ -2495,10 +2495,16 @@
     }

     void init() {
-        const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
-
         SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);

+        const int n_ctx_train = llama_model_n_ctx_train(model);
+
+        int n_ctx_slot = llama_n_ctx_seq(ctx);
+        if (n_ctx_slot > n_ctx_train) {
+            SRV_WRN("the slot context (%d) exceeds the training context of the model (%d) - capping\n", n_ctx_slot, n_ctx_train);
+            n_ctx_slot = n_ctx_train;
+        }
+
        for (int i = 0; i < params_base.n_parallel; i++) {
            server_slot slot;

@@ -2527,7 +2533,7 @@
                }
            }

-            SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
+            SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);

            slot.callback_on_release = [this](int) {
                queue_tasks.pop_deferred_task();

@@ -2699,6 +2705,39 @@
        return ret;
    }

+    // return true if at least one slot has been purged
+    // TODO: improve logic
+    //       - smarter decision which slot to purge (LRU or longest prompt?)
+    //       - move slot to level 2 cache instead of removing?
+    //       - instead of purging, try to store and resume later?
+    bool try_purge_idle_slots() {
+        bool res = false;
+
+        if (!params_base.kv_unified) {
+            return res;
+        }
+
+        for (auto & slot : slots) {
+            if (slot.is_processing()) {
+                continue;
+            }
+
+            if (slot.prompt.n_tokens() > 0) {
+                SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
+
+                llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
+                slot.prompt.tokens.clear();
+
+                res = true;
+
+                // purge slots one by one
+                break;
+            }
+        }
+
+        return res;
+    }
+
    bool launch_slot_with_task(server_slot & slot, server_task && task) {
        slot.reset();

@@ -3635,9 +3674,10 @@
        int32_t n_batch  = llama_n_batch(ctx);
        int32_t n_ubatch = llama_n_ubatch(ctx);

-        // next, batch any pending prompts without exceeding n_batch
-        float alora_scale = -1.0f;
+        float  alora_scale       = -1.0f;
        size_t alora_disabled_id = 0;
+
+        // next, batch any pending prompts without exceeding n_batch
        if (params_base.cont_batching || batch.n_tokens == 0) {
            for (auto & slot : slots) {
                // check if we can batch this slot with the previous one

@@ -3914,8 +3954,11 @@

                // truncate any tokens that are beyond n_past for this slot
                const llama_pos p0 = slot.prompt.tokens.pos_next();
+
+                SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
+
                if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
-                    SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", p0);
+                    SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
                    llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);

                    // there is no common part left

@@ -3924,8 +3967,6 @@
                    slot.prompt.tokens.clear();
                }

-                SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
-
                // check if we should process the image
                if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
                    // process the image

@@ -4126,6 +4167,8 @@
                std::string err;

                if (n_batch == 1 && ret == 1) {
+                    // TODO: try to terminate only the largest active slot/sequence and continue with the rest
+                    //       need to remove the tokens from the current batch too
                    err = "Context size has been exceeded.";
                }

@@ -4141,17 +4184,23 @@
                // TODO: handle ret == 2 (abort) when we start aborting

                if (!err.empty()) {
-                    SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret);
+                    SRV_ERR("%s i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret);
+
                    for (auto & slot : slots) {
-                        send_error(slot, err);
-                        slot.release();
+                        if (slot.is_processing()) {
+                            send_error(slot, err);
+                            slot.release();
+                        }
                    }
+
                    break;
                }
            }

            // retry with half the batch size to try to find a free slot in the KV cache
-            n_batch /= 2;
+            if (!try_purge_idle_slots()) {
+                n_batch /= 2;
+            }

            SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);

@@ -4391,6 +4440,15 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    // TODO: should we have a separate n_parallel parameter for the server?
+    //       https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
+    if (params.n_parallel == 1 && params.kv_unified == false) {
+        LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true\n", __func__);
+
+        params.n_parallel = 4;
+        params.kv_unified = true;
+    }
+
    common_init();

    // struct that contains llama context and inference

@@ -4944,7 +5002,7 @@
            // Everything else, including multimodal completions.
            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
        }
-        const size_t n_ctx_slot = ctx_server.n_ctx / ctx_server.params_base.n_parallel;
+        const size_t n_ctx_slot = ctx_server.slots.front().n_ctx;
        tasks.reserve(inputs.size());
        for (size_t i = 0; i < inputs.size(); i++) {
            auto n_prompt_tokens = inputs[i].size();
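
When decoding fails to find free KV-cache space, the server now first tries to purge an idle slot (only meaningful with a unified KV cache) and falls back to halving the batch size only if nothing could be purged. A standalone sketch of that retry policy; the RetryState struct and counters are illustrative stand-ins for the real slot bookkeeping:

    // Sketch of the KV-cache recovery policy above (not server.cpp itself).
    #include <cstdint>
    #include <cstdio>

    struct RetryState {
        int32_t n_batch;
        bool    kv_unified;
        int     idle_slots_with_tokens; // stand-in for idle slots holding a purgeable prompt
    };

    // Mirrors try_purge_idle_slots(): returns true if one idle slot was purged.
    static bool try_purge_idle_slot(RetryState & st) {
        if (!st.kv_unified || st.idle_slots_with_tokens == 0) {
            return false;
        }
        st.idle_slots_with_tokens--; // purge slots one by one
        return true;
    }

    static void on_decode_no_kv_slot(RetryState & st) {
        if (!try_purge_idle_slot(st)) {
            st.n_batch /= 2; // retry with a smaller batch to find free KV space
        }
    }

    int main() {
        RetryState st = { /* n_batch */ 2048, /* kv_unified */ true, /* idle slots */ 1 };

        on_decode_no_kv_slot(st); // purges the idle slot, n_batch stays 2048
        on_decode_no_kv_slot(st); // nothing left to purge, n_batch drops to 1024

        printf("n_batch = %d\n", st.n_batch);
        return 0;
    }
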
