@@ -2777,7 +2777,10 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_SAVE:
                 {
-                    if (!ensure_no_mtmd(task.id)) break;
+                    if (!ensure_no_mtmd(task.id)) {
+                        break;
+                    }
+
                     int id_slot = task.slot_action.slot_id;
                     server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
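This first hunk looks purely stylistic: the single-statement early `break` is wrapped in braces to match the surrounding code style, with no change in behavior.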
@@ -3269,7 +3272,7 @@ struct server_context {
 
                     // Process all prompt tokens through sampler system
                     for (size_t i = 0; i < slot.cache_tokens.size(); ++i) {
-                        llama_token id = slot.prompt_tokens[i];
+                        llama_token id = slot.cache_tokens[i];
                         if (id != LLAMA_TOKEN_NULL) {
                             common_sampler_accept(slot.smpl, id, false);
                         }
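This is the substantive fix in the hunk above: the loop walks `slot.cache_tokens` but previously indexed into `slot.prompt_tokens`. The two can differ, notably with multimodal inputs, where `cache_tokens` presumably holds `LLAMA_TOKEN_NULL` placeholders for non-text positions (which the existing null check skips), so the sampler was being reseeded with the wrong tokens. Reading from `cache_tokens` keeps `common_sampler_accept` consistent with what is actually in the cache.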
@@ -3491,7 +3494,7 @@ struct server_context {
34913494 slot.n_draft_accepted += ids.size () - 1 ;
34923495
34933496 slot.cache_tokens .push_back (id);
3494- slot.cache_tokens .insert (ids);
3497+ slot.cache_tokens .insert ({ ids. begin (), ids. end () - 1 } );
34953498
34963499 llama_kv_self_seq_rm (ctx, slot.id , slot.n_past , -1 );
34973500
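The old line appended the whole `ids` vector, including its final element, which appears to be the token sampled at the position after the accepted drafts and is therefore not yet decoded into the KV cache (it is carried over to the next iteration). A minimal standalone sketch of the bookkeeping this restores, using a plain std::vector in place of server_tokens and invented token values, assuming `n_past` counts decoded positions and `cache_tokens` must mirror them:

    // Sketch only: simplified stand-ins, not the server's actual types.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    using llama_token = int32_t;

    int main() {
        std::vector<llama_token> cache_tokens = {10, 11, 12}; // mirrors the KV cache
        int n_past = 3;                                       // decoded positions so far

        llama_token id = 13; // sampled on the previous step; decoded at position
                             // n_past by the verification batch, not yet cached

        // verification result: two accepted draft tokens plus one token sampled
        // at the position after them -- ids.back() is NOT in the KV cache yet
        std::vector<llama_token> ids = {14, 15, 16};

        n_past += ids.size(); // matches: slot.n_past += ids.size();

        cache_tokens.push_back(id);
        // the fix: append all of ids except its last element
        cache_tokens.insert(cache_tokens.end(), ids.begin(), ids.end() - 1);

        // invariant: cache_tokens tracks exactly the decoded positions;
        // inserting all of ids would leave it one token too long
        assert((int) cache_tokens.size() == n_past);
    }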
@@ -4105,8 +4108,9 @@ int main(int argc, char ** argv) {
             std::vector<server_tokens> inputs;
             if (oaicompat && !prompt.is_string()) {
                 throw std::runtime_error("prompt must be a string");
+            }
 
-            } else if (oaicompat && has_mtmd) {
+            if (oaicompat && has_mtmd) {
                 // multimodal
                 std::string prompt_str = prompt.get<std::string>();
                 mtmd_input_text inp_txt = {
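Because the first branch unconditionally throws, turning `} else if` into a closing brace plus a standalone `if` is behavior-preserving: the second condition is only ever reached when the first is false either way. The restructuring just lets the multimodal branch read as its own block.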
@@ -4124,9 +4128,9 @@ int main(int argc, char ** argv) {
                 if (tokenized != 0) {
                     throw std::runtime_error("Failed to tokenize prompt");
                 }
+
                 server_tokens tmp(chunks, true);
                 inputs.push_back(std::move(tmp));
-
             } else {
                 // non-multimodal version
                 auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
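This last hunk is whitespace-only: the blank line moves from after `inputs.push_back(std::move(tmp))` to just before the `server_tokens tmp(chunks, true)` declaration, keeping the tokenization call grouped with its error check.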