@@ -753,12 +753,7 @@ struct server_context {
         metrics.init();
     }
 
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
-        // TODO: currently, we tokenize using special tokens by default
-        // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
-        // but it's better compared to completely ignoring ChatML and other chat templates
-        const bool TMP_FORCE_SPECIAL = true;
-
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -771,10 +766,10 @@ struct server_context {
 
                     std::vector<llama_token> p;
                     if (first) {
-                        p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, add_special, parse_special);
                         first = false;
                     } else {
-                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, false, parse_special);
                     }
 
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@@ -788,7 +783,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, parse_special);
         }
 
         return prompt_tokens;
@@ -1220,7 +1215,7 @@ struct server_context {
                     slot.params.n_predict, n_ctx_train);
         }
 
-        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
+        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
 
         return slot.has_next_token; // continue
     }
@@ -1488,9 +1483,8 @@ struct server_context {
         if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
             data["index"] = 0;
             create_task(data, false, nullptr);
-        }
-        // otherwise, it's a multiple-prompt task, we break it into smaller tasks
-        else if (prompt.is_array()) {
+        } else if (prompt.is_array()) {
+            // otherwise, it's a multiple-prompt task, we break it into smaller tasks
             std::vector<json> prompts = prompt;
             if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
                 // prompts[0] is the question
@@ -1515,9 +1509,8 @@ struct server_context {
                     }
                 }
             }
-        }
-        // invalid case
-        else {
+        } else {
+            // invalid case
             throw std::runtime_error(error_msg);
         }
 
@@ -1988,31 +1981,23 @@ struct server_context {
 
                         if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) {
                             const bool add_bos = llama_add_bos_token(model);
-                            bool suff_rm_leading_spc = true;
-                            if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                                params.input_suffix.erase(0, 1);
-                                suff_rm_leading_spc = false;
-                            }
 
-                            auto prefix_tokens = tokenize(slot.params.input_prefix, false);
-                            auto suffix_tokens = tokenize(slot.params.input_suffix, false);
+                            auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
+                            auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
 
-                            const int space_token = 29871; // TODO: this should not be hardcoded
-                            if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
-                                suffix_tokens.erase(suffix_tokens.begin());
-                            }
-
-                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                            suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
+                            suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
 
                             auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
                             auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+
                             if (add_bos) {
                                 embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
                             }
+
                             embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
-                            const llama_token middle_token = llama_token_middle(model);
+                            const llama_token middle_token = llama_token_fim_mid(model);
                             if (middle_token >= 0) {
                                 embd_inp.push_back(middle_token);
                             }
@@ -2031,28 +2016,28 @@ struct server_context {
                             prompt_tokens.clear();
                             prompt_tokens.push_back(llama_token_bos(model));
                             {
-                                const auto part = tokenize(slot.prompt[0], false);
+                                const auto part = tokenize(slot.prompt[0], false, false);
                                 prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
                             }
                             prompt_tokens.push_back(llama_token_eos(model));
                             prompt_tokens.push_back(llama_token_sep(model));
                             {
-                                const auto part = tokenize(slot.prompt[1], false);
+                                const auto part = tokenize(slot.prompt[1], false, false);
                                 prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
                             }
                             prompt_tokens.push_back(llama_token_eos(model));
                         } else {
-                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
                         }
 
                         slot.n_past = 0;
                         slot.n_prompt_tokens = prompt_tokens.size();
 
                         SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
-                        // print tokens:
+                        // print prompt tokens:
                         for (int i = 0; i < (int) prompt_tokens.size(); i++) {
-                            SLT_INF(slot, "prompt token %3d: %6d (%s)\n", i, prompt_tokens[i], llama_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                            SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], llama_token_to_piece(ctx, prompt_tokens[i]).c_str());
                         }
 
                         // empty prompt passed -> release the slot and send empty response
@@ -2947,7 +2932,23 @@ int main(int argc, char ** argv) {
         return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res);
     };
 
-    const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+        std::string err;
+        if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "prefix token is missing. ";
+        }
+        if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "suffix token is missing. ";
+        }
+        if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "middle token is missing. ";
+        }
+
+        if (!err.empty()) {
+            res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
         json data = json::parse(req.body);
         return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res);
     };
@@ -3033,7 +3034,8 @@ int main(int argc, char ** argv) {
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
             const bool with_pieces = json_value(body, "with_pieces", false);
-            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true);
 
             if (with_pieces) {
                 for (const auto & token : tokens) {
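
Note (reviewer sketch, not part of the patch): with this change the infill prompt is assembled purely from the model's own FIM special tokens (llama_token_fim_pre/suf/mid) instead of the old hardcoded SentencePiece space token 29871, and the /infill handler above now rejects models whose FIM tokens are LLAMA_TOKEN_NULL. The self-contained helper below mirrors that layout outside of server_context; the free-function form and variable names are illustrative only, but the llama.h / common.h calls are the same ones the patch uses.

// Sketch of the fill-in-the-middle prompt layout produced by the patched infill branch.
// PSM order: <FIM_PRE> prefix <FIM_SUF> suffix <FIM_MID>
// SPM order: <FIM_SUF> suffix <FIM_PRE> prefix <FIM_MID>
#include <string>
#include <vector>

#include "common.h"
#include "llama.h"

static std::vector<llama_token> build_infill_prompt(
        llama_context * ctx,
        const llama_model * model,
        const std::string & input_prefix,
        const std::string & input_suffix,
        bool spm_infill) {
    // raw prefix/suffix text: no BOS/EOS and no special-token parsing,
    // matching tokenize(..., /*add_special=*/false, /*parse_special=*/false) above
    std::vector<llama_token> prefix_tokens = ::llama_tokenize(ctx, input_prefix, false, false);
    std::vector<llama_token> suffix_tokens = ::llama_tokenize(ctx, input_suffix, false, false);

    // the /infill endpoint checks that these are not LLAMA_TOKEN_NULL before getting here
    prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
    suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));

    std::vector<llama_token> embd_inp = spm_infill ? suffix_tokens : prefix_tokens;
    std::vector<llama_token> embd_end = spm_infill ? prefix_tokens : suffix_tokens;

    if (llama_add_bos_token(model)) {
        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
    }

    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

    const llama_token middle_token = llama_token_fim_mid(model);
    if (middle_token >= 0) {
        embd_inp.push_back(middle_token);
    }

    return embd_inp;
}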
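
Note (reviewer sketch, not part of the patch): the new parse_special flag is simply forwarded to ::llama_tokenize(). When it is true (the /tokenize endpoint and regular completion prompts), special-token text such as "<|im_start|>" is mapped to its single control-token id; when it is false (infill prefix/suffix and rerank inputs), the same text is tokenized as plain characters. The snippet below is a hypothetical standalone illustration of that difference; only the ::llama_tokenize() call itself comes from the patch.

#include <cstdio>
#include <string>
#include <vector>

#include "common.h"
#include "llama.h"

static void show_parse_special(llama_context * ctx) {
    const std::string text = "<|im_start|>user\nhello<|im_end|>";

    // parse_special = true: chat-template markers collapse to their special-token ids
    std::vector<llama_token> as_special = ::llama_tokenize(ctx, text, /*add_special=*/true,  /*parse_special=*/true);

    // parse_special = false: the same markers are treated as plain text and split into ordinary pieces
    std::vector<llama_token> as_plain   = ::llama_tokenize(ctx, text, /*add_special=*/false, /*parse_special=*/false);

    printf("parse_special=true  -> %zu tokens\n", as_special.size());
    printf("parse_special=false -> %zu tokens\n", as_plain.size());
}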