@@ -386,7 +386,7 @@ struct server_task {
                 trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
                 trigger.value = word;
                 trigger.token = token;
-                params.sampling.grammar_triggers.push_back(trigger);
+                params.sampling.grammar_triggers.push_back(std::move(trigger));
             } else {
                 SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
                 params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
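
Why the std::move: common_grammar_trigger owns a std::string, so a plain push_back(trigger) would copy that buffer into the vector. A minimal self-contained sketch of the pattern, using a hypothetical stand-in struct (the real one lives in common/common.h):

    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    // Hypothetical stand-in for common_grammar_trigger.
    struct trigger_sketch {
        int         type  = 0;
        std::string value;      // owning string: the part worth moving
        int32_t     token = -1;
    };

    int main() {
        std::vector<trigger_sketch> triggers;
        trigger_sketch t;
        t.value = "<tool_call>";
        triggers.push_back(std::move(t)); // steals t.value's buffer; push_back(t) would copy it
        return 0;
    }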
@@ -751,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
                     {"name",      tc.name},
                     {"arguments", tc.arguments},
                 }},
-                {"id", tc.id},
+                // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+                // We only generate a random id for the ones that don't generate one by themselves
+                // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+                {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
             });
         }
         message["tool_calls"] = tool_calls;
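
The fallback only fires when the chat template did not emit an id itself. gen_tool_call_id() is defined elsewhere in the server sources; as a rough, hypothetical sketch of what such a helper can look like (the name suffix, alphabet, and length here are assumptions, not the actual implementation):

    #include <cstdio>
    #include <random>
    #include <string>

    // Hypothetical random-id helper; the real gen_tool_call_id() is not shown in this diff.
    static std::string gen_tool_call_id_sketch(size_t len = 9) {
        static const char alphabet[] =
            "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
        std::mt19937 rng{std::random_device{}()};
        std::uniform_int_distribution<size_t> pick(0, sizeof(alphabet) - 2); // -2 skips '\0'
        std::string id;
        for (size_t i = 0; i < len; i++) {
            id += alphabet[pick(rng)];
        }
        return id;
    }

    int main() {
        printf("%s\n", gen_tool_call_id_sketch().c_str());
        return 0;
    }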
@@ -2037,6 +2040,18 @@ struct server_context {
         return ret;
     }

+    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+        for (const auto & token : tokens) {
+            if (token < 0 || token >= n_vocab) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot.reset();
         slot.id_task = task.id;
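
can_be_detokenized() range-checks every client-supplied token id against the vocabulary size before the slot launches, so a bogus id can never reach detokenization. A self-contained restatement of the check (plain int32_t in place of llama_token; the values are hypothetical):

    #include <cstdint>
    #include <vector>

    // Every id must lie in [0, n_vocab).
    static bool tokens_in_range(const std::vector<int32_t> & tokens, int32_t n_vocab) {
        for (const auto token : tokens) {
            if (token < 0 || token >= n_vocab) {
                return false;
            }
        }
        return true;
    }

    int main() {
        const int32_t n_vocab = 32000;                          // e.g. a LLaMA-style vocab
        const std::vector<int32_t> good = { 1, 15043, 3186 };
        const std::vector<int32_t> bad  = { 1, 15043, 999999 }; // 999999 >= n_vocab
        return tokens_in_range(good, n_vocab) && !tokens_in_range(bad, n_vocab) ? 0 : 1;
    }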
@@ -2051,6 +2066,11 @@ struct server_context {
             slot.lora = task.params.lora;
         }

+        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
+        if (!can_detokenize) {
+            send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
+            return false;
+        }
         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -2093,7 +2113,7 @@ struct server_context {
         SRV_DBG("%s", "clearing KV cache\n");

         // clear the entire KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
         clean_kv_cache = false;
     }

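This hunk and the ones below are a mechanical rename tracking the upstream llama.cpp KV-cache API; arguments and behavior are unchanged. The mapping, as observed in this diff:

    // Old name                            -> new name (same arguments)
    // llama_kv_cache_clear(ctx)           -> llama_kv_self_clear(ctx)
    // llama_kv_cache_seq_rm(...)          -> llama_kv_self_seq_rm(...)
    // llama_kv_cache_seq_add(...)         -> llama_kv_self_seq_add(...)
    // llama_get_kv_cache_token_count(ctx) -> llama_kv_self_n_tokens(ctx)
    // llama_get_kv_cache_used_cells(ctx)  -> llama_kv_self_used_cells(ctx)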
@@ -2635,8 +2655,8 @@ struct server_context {
             res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
             res->t_start          = metrics.t_start;

-            res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
-            res->kv_cache_used_cells   = llama_get_kv_cache_used_cells(ctx);
+            res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
+            res->kv_cache_used_cells   = llama_kv_self_used_cells(ctx);

             res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
             res->t_prompt_processing_total       = metrics.t_prompt_processing_total;
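
Usage sketch for the two renamed calls, assuming an initialized llama_context * ctx (this mirrors how the metrics handler above snapshots KV-cache occupancy):

    const int32_t kv_tokens = llama_kv_self_n_tokens(ctx);   // tokens currently stored in the cache
    const int32_t kv_cells  = llama_kv_self_used_cells(ctx); // cache cells currently occupied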
@@ -2752,7 +2772,7 @@ struct server_context {

             // Erase token cache
             const size_t n_erased = slot->cache_tokens.size();
-            llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
+            llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
             slot->cache_tokens.clear();

             auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2820,8 +2840,8 @@ struct server_context {

             SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

-            llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-            llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
+            llama_kv_self_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+            llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

             if (slot.params.cache_prompt) {
                 for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
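
The context shift drops positions [n_keep, n_keep + n_discard) and slides everything after them left by n_discard. A self-contained simulation on plain position values (the real calls edit KV-cache metadata, not a vector):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_keep = 4, n_discard = 2, n_past = 10;
        std::vector<int> pos;
        for (int i = 0; i < n_past; i++) pos.push_back(i);

        // llama_kv_self_seq_rm(ctx, id, n_keep, n_keep + n_discard):
        pos.erase(pos.begin() + n_keep, pos.begin() + n_keep + n_discard);

        // llama_kv_self_seq_add(ctx, id, n_keep + n_discard, n_past, -n_discard):
        for (size_t i = n_keep; i < pos.size(); i++) pos[i] -= n_discard;

        for (int p : pos) printf("%d ", p); // prints: 0 1 2 3 4 5 6 7
        printf("\n");
        return 0;
    }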
@@ -3012,8 +3032,8 @@ struct server_context {

                     const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

-                    llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
-                    llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
+                    llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
+                    llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

                     for (size_t i = 0; i < n_match; i++) {
                         slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
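
Worked example with hypothetical values: a chunk of n_match cached tokens found at head_c is relocated to head_p, so kv_shift is negative whenever the chunk moves left:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t head_p = 10, head_c = 14, n_match = 3;
        const int64_t kv_shift = head_p - head_c; // -4: the chunk moves left
        // llama_kv_self_seq_rm (ctx, id, head_p, head_c)   drops stale cells [10, 14)
        // llama_kv_self_seq_add(ctx, id, head_c, head_c + n_match, kv_shift)
        //                                                  remaps cells [14, 17) to [10, 13)
        printf("kv_shift = %lld\n", (long long) kv_shift);
        (void) n_match;
        return 0;
    }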
@@ -3051,9 +3071,9 @@ struct server_context {
                 }

                 // keep only the common part
-                if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+                if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                     // could not partially delete (likely using a non-Transformer model)
-                    llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+                    llama_kv_self_seq_rm(ctx, slot.id, -1, -1);

                     // there is no common part left
                     slot.n_past = 0;
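
llama_kv_self_seq_rm can refuse a partial range when the model's state is not a per-token KV cache (e.g. recurrent architectures), which is why the code retries with the whole-sequence form. The defensive idiom, assuming ctx and slot as in the surrounding code:

    // Try a partial erase first; fall back to wiping the sequence when ranges are unsupported.
    if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
        llama_kv_self_seq_rm(ctx, slot.id, -1, -1); // whole-sequence erase
        slot.n_past = 0;                            // nothing cached anymore: reprocess the prompt
    }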
@@ -3293,7 +3313,7 @@ struct server_context {
                 slot.cache_tokens.push_back(id);
                 slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);

-                llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+                llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);

                 for (size_t i = 0; i < ids.size(); ++i) {
                     completion_token_output result;