@@ -139,6 +139,7 @@ struct slot_params {
 
     json input_prefix;
     json input_suffix;
+    json extra_context;
 };
 
 struct server_slot {
@@ -170,6 +171,7 @@ struct server_slot {
 
     // when a task is submitted, we first tokenize the prompt and store it here
     std::vector<llama_token> prompt_tokens;
+    std::vector<llama_token> extra_tokens;
 
     std::string generated_text;
     std::vector<llama_token> cache_tokens;
@@ -906,8 +908,26 @@ struct server_context {
         }
 
         // infill
-        slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix);
-        slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
+        slot.params.input_prefix  = json_value(data, "input_prefix",  default_params.input_prefix);
+        slot.params.input_suffix  = json_value(data, "input_suffix",  default_params.input_suffix);
+        slot.params.extra_context = json_value(data, "extra_context", default_params.extra_context);
+
+        SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.params.extra_context.size());
+        for (const auto & chunk : slot.params.extra_context) {
+            // { "text": string, "filename": string }
+            if (!chunk.contains("text") || !chunk["text"].is_string()) {
+                send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
+                return false;
+            }
+
+            // filename is optional
+            if (chunk.contains("filename") && !chunk["filename"].is_string()) {
+                send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST);
+                return false;
+            }
+
+            SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str());
+        }
 
         // get prompt
         if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) {
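For reference, a request body that passes the validation above might look like the following sketch, built with nlohmann::json (the library behind the server's json alias) and intended for the server's /infill endpoint; the prefix, suffix, filenames and chunk texts are made up for illustration. Each extra_context entry must carry a string "text" field and may carry a string "filename".

    // Hypothetical client-side sketch (not part of the diff): builds an /infill
    // request body whose "extra_context" chunks satisfy the checks added above.
    #include <nlohmann/json.hpp>
    #include <iostream>

    int main() {
        nlohmann::json body = {
            {"input_prefix", "int add(int a, int b) {\n    return "},
            {"input_suffix", ";\n}\n"},
            {"extra_context", {
                {{"filename", "utils.h"}, {"text", "int add(int a, int b);\n"}},
                {{"text", "// a chunk without a filename is also accepted\n"}}
            }}
        };
        std::cout << body.dump(2) << std::endl; // POST this JSON to /infill
        return 0;
    }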
@@ -1934,13 +1954,66 @@ struct server_context {
                                 } break;
                             case SERVER_TASK_CMPL_TYPE_INFILL:
                                 {
+                                    // use FIM repo-level pattern:
+                                    // ref: https://arxiv.org/pdf/2409.12186
+                                    //
+                                    // [FIM_REP]myproject
+                                    // [FIM_SEP]filename0
+                                    // extra chunk 0
+                                    // [FIM_SEP]filename1
+                                    // extra chunk 1
+                                    // ...
+                                    // [FIM_SEP]filename
+                                    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]
+                                    //
                                     auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
                                     auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
 
-                                    // for now pick context to fit in a single batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-                                    const int n_suffix_take = std::min<int>(suffix_tokens.size(), n_batch/4);
+                                    slot.extra_tokens.clear();
+                                    if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
+                                        static const auto k_fim_repo = tokenize("myproject\n", false, false);
+
+                                        slot.extra_tokens.push_back(llama_token_fim_rep(model));
+                                        slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
+                                    }
+
+                                    for (const auto & chunk : slot.params.extra_context) {
+                                        // { "text": string, "filename": string }
+                                        const std::string text     = chunk.value("text", "");
+                                        const std::string filename = chunk.value("filename", "tmp");
+
+                                        if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+                                            const auto k_fim_file = tokenize(filename + "\n", false, false);
+
+                                            slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
+                                            slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+                                        } else {
+                                            // chunk separator in binary form to avoid confusing the AI
+                                            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+                                            static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false);
+
+                                            slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+                                        }
+
+                                        const auto chunk_tokens = tokenize(text, false, false);
+                                        slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+                                    }
+
+                                    if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+                                        // TODO: current filename
+                                        static const auto k_fim_file = tokenize("filename\n", false, false);
+
+                                        slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
+                                        slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+                                    }
+
+                                    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+                                    const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch)/4);
                                     const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
 
+                                    // fill the rest of the context with extra chunks
+                                    const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
+
                                     prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
                                     suffix_tokens.resize(n_suffix_take);
 
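To make the take-budget arithmetic above concrete, here is a standalone sketch using assumed sizes (n_batch = 2048, n_ctx = 8192, n_predict = 1024; none of these values come from the diff). The suffix is capped at n_batch/4 tokens, the prefix at (n_batch - 3) - n_suffix_take, and the extra chunks get whatever context remains after reserving one batch for prefix/suffix plus twice the generation budget.

    // Standalone sketch of the take-budget split above, with assumed sizes.
    #include <algorithm>
    #include <cstdio>

    int main() {
        const int n_batch   = 2048;   // assumed batch size
        const int n_ctx     = 8192;   // assumed slot context size
        const int n_predict = 1024;   // assumed generation budget
        const int n_suffix  = 900;    // assumed suffix_tokens.size()
        const int n_prefix  = 3000;   // assumed prefix_tokens.size()
        const int n_extra   = 10000;  // assumed slot.extra_tokens.size()

        const int n_suffix_take = std::min(n_suffix, n_batch/4);                                  // 512
        const int n_prefix_take = std::min(n_prefix, (n_batch - 3) - n_suffix_take);              // 1533
        const int n_extra_take  = std::min(std::max(0, n_ctx - n_batch - 2*n_predict), n_extra);  // 4096

        printf("take: suffix = %d, prefix = %d, extra = %d\n", n_suffix_take, n_prefix_take, n_extra_take);
        return 0;
    }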
@@ -1954,6 +2027,11 @@ struct server_context {
                                         embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
                                     }
 
+                                    SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size());
+
+                                    // put the extra context before the FIM prefix
+                                    embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end());
+
                                     embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
                                     embd_inp.push_back(llama_token_fim_mid(model));
 
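Assuming the model defines the FIM special tokens and all of slot.extra_tokens fits into n_extra_take, the assembled prompt then follows the repo-level layout from the comment at the top of this case, for example (rendered with the example chunk from the earlier sketch; bracketed names stand for the special tokens):

    [FIM_REP]myproject
    [FIM_SEP]utils.h
    int add(int a, int b);
    [FIM_SEP]filename
    [FIM_PRE]<prefix>[FIM_SUF]<suffix>[FIM_MID]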
@@ -2058,11 +2136,15 @@ struct server_context {
 
                                     while (head_c < slot.cache_tokens.size() &&
                                            head_p < prompt_tokens.size()) {
-                                        if (llama_token_is_control(model, slot.cache_tokens[head_c])) {
+                                        if (llama_token_is_control(model, slot.cache_tokens[head_c]) &&
+                                            slot.cache_tokens[head_c] != llama_token_fim_rep(model) &&
+                                            slot.cache_tokens[head_c] != llama_token_fim_sep(model)) {
                                             break;
                                         }
 
-                                        if (llama_token_is_control(model, prompt_tokens[head_p])) {
+                                        if (llama_token_is_control(model, prompt_tokens[head_p]) &&
+                                            prompt_tokens[head_p] != llama_token_fim_rep(model) &&
+                                            prompt_tokens[head_p] != llama_token_fim_sep(model)) {
                                             break;
                                         }
 
@@ -2071,11 +2153,15 @@ struct server_context {
                                         while (head_c + n_match < slot.cache_tokens.size() &&
                                                head_p + n_match < prompt_tokens.size()     &&
                                                slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
-                                            if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match])) {
+                                            if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match]) &&
+                                                slot.cache_tokens[head_c + n_match] != llama_token_fim_rep(model) &&
+                                                slot.cache_tokens[head_c + n_match] != llama_token_fim_sep(model)) {
                                                 break;
                                             }
 
-                                            if (llama_token_is_control(model, prompt_tokens[head_p + n_match])) {
+                                            if (llama_token_is_control(model, prompt_tokens[head_p + n_match]) &&
+                                                prompt_tokens[head_p + n_match] != llama_token_fim_rep(model) &&
+                                                prompt_tokens[head_p + n_match] != llama_token_fim_sep(model)) {
                                                 break;
                                             }
 
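The same three-clause condition is repeated four times across these two hunks; it amounts to the following predicate (a sketch only, not part of the diff): a token stops the cache/prompt matching only if it is a control token other than the FIM repository and separator markers.

    // Sketch of the repeated check above: control tokens block the reuse scan,
    // except for FIM_REP and FIM_SEP, which are now allowed to match.
    #include "llama.h"

    static bool is_blocking_control_token(const struct llama_model * model, llama_token tok) {
        return llama_token_is_control(model, tok) &&
               tok != llama_token_fim_rep(model)  &&
               tok != llama_token_fim_sep(model);
    }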