1717"
1818" start the llama.cpp server with a FIM-compatible model. for example:
1919"
20- " $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 512
20+ " $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 512
2121"
2222" --batch-size [512, model max context]
2323"
24- " adjust the batch size to control how much of the provided context will be used during the inference
24+ " adjust the batch size to control how much of the provided local context will be used during the inference
2525" lower values will use a smaller part of the context around the cursor, which will result in faster processing
2626"
2727" --ubatch-size [64, 2048]
@@ -58,11 +58,12 @@ highlight llama_hl_info guifg=#77ff2f
5858" - leaving a buffer
5959" - writing a file
6060"
61- " ring context parameters :
61+ " parameters for the ring-buffer with extra context :
6262"
6363" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable)
6464" ring_chunk_size: max size of the chunks (in number of lines)
6565" ring_scope: the range around the cursor position (in number of lines) for gathering chunks
66+ " ring_update_ms: how often (in milliseconds) to process queued chunks in normal mode
6667"
6768let s: default_config = {
6869 \ ' endpoint' : ' http://127.0.0.1:8012/infill' ,
@@ -77,6 +78,7 @@ let s:default_config = {
7778 \ ' ring_n_chunks' : 64 ,
7879 \ ' ring_chunk_size' : 64 ,
7980 \ ' ring_scope' : 1024 ,
81+ \ ' ring_update_ms' : 1000 ,
8082 \ }
8183
8284let g: llama_config = get (g: , ' llama_config' , s: default_config )
@@ -101,7 +103,8 @@ function! llama#init()
101103 let s: line_cur_prefix = ' '
102104 let s: line_cur_suffix = ' '
103105
104- let s: ring_chunks = []
106+ let s: ring_chunks = [] " current set of chunks used as extra context
107+ let s: ring_queued = [] " chunks that are queued to be sent for processing
105108 let s: ring_n_evict = 0
106109
107110 let s: hint_shown = v: false
@@ -112,6 +115,7 @@ function! llama#init()
112115
113116 let s: timer_fim = -1
114117 let s: t_fim_start = reltime () " used to measure total FIM time
118+ let s: t_last_move = reltime () " last time the cursor moved
115119
116120 let s: current_job = v: null
117121
@@ -120,15 +124,14 @@ function! llama#init()
120124 autocmd InsertEnter * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false, v:false)
121125 autocmd InsertLeavePre * call llama#fim_cancel ()
122126
123- autocmd CursorMoved * call llama#fim_cancel ()
127+ autocmd CursorMoved * call s: on_move ()
128+ autocmd CursorMovedI * call s: on_move ()
124129 autocmd CompleteChanged * call llama#fim_cancel ()
125130
126131 if g: llama_config .auto_fim
127132 autocmd InsertEnter * call llama#fim (v: true , v: false )
128133 autocmd CursorMovedI * call llama#fim (v: true , v: false )
129134 " autocmd CursorHoldI * call llama#fim(v:true, v:true)
130- else
131- autocmd CursorMovedI * call llama#fim_cancel ()
132135 endif
133136
134137 autocmd TextYankPost * if v: event .operator == # ' y' | call s: pick_chunk (v: event .regcontents, v: false , v: true ) | endif
@@ -142,6 +145,11 @@ function! llama#init()
142145 augroup END
143146
144147 silent ! call llama#fim_cancel ()
148+
149+ " init background update of the ring buffer
150+ if g: llama_config .ring_n_chunks > 0
151+ call s: ring_update ()
152+ endif
145153endfunction
146154
147155" TODO: figure out something better
@@ -163,6 +171,7 @@ function! s:chunk_sim(c0, c1)
163171 return 2.0 * l: common / (l: lines0 + l: lines1 )
164172endfunction
165173
174+ " pick a chunk from the provided text and queue it for processing
166175function ! s: pick_chunk (text, no_mod, do_evict)
167176 " do not pick chunks from buffers with pending changes or buffers that are not files
168177 if a: no_mod && (getbufvar (bufnr (' %' ), ' &modified' ) || ! buflisted (bufnr (' %' )) || ! filereadable (expand (' %' )))
@@ -190,18 +199,38 @@ function! s:pick_chunk(text, no_mod, do_evict)
190199
191200 " check if this chunk is already added
192201 let l: exist = v: false
202+
193203 for i in range (len (s: ring_chunks ))
194204 if s: ring_chunks [i ].data == l: chunk
195205 let l: exist = v: true
196206 break
197207 endif
198208 endfor
199209
210+ for i in range (len (s: ring_queued ))
211+ if s: ring_queued [i ].data == l: chunk
212+ let l: exist = v: true
213+ break
214+ endif
215+ endfor
216+
200217 if l: exist
201218 return
202219 endif
203220
204221 " evict chunks that are very similar to the new one
222+ for i in range (len (s: ring_queued ) - 1 , 0 , -1 )
223+ if s: chunk_sim (s: ring_queued [i ].data, l: chunk ) > 0.5
224+ if a: do_evict
225+ call remove (s: ring_queued , i )
226+ let s: ring_n_evict += 1
227+ else
228+ return
229+ endif
230+ endif
231+ endfor
232+
233+ " also evict similar chunks from s:ring_chunks
205234 for i in range (len (s: ring_chunks ) - 1 , 0 , -1 )
206235 if s: chunk_sim (s: ring_chunks [i ].data, l: chunk ) > 0.5
207236 if a: do_evict
@@ -213,11 +242,36 @@ function! s:pick_chunk(text, no_mod, do_evict)
213242 endif
214243 endfor
215244
245+ if len (s: ring_queued ) == 16
246+ call remove (s: ring_queued , 0 )
247+ endif
248+
249+ call add (s: ring_queued , {' data' : l: chunk , ' str' : l: chunk_str , ' time' : reltime (), ' filename' : expand (' %' )})
250+
251+ " let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
252+ endfunction
253+
254+ " called every g:llama_config.ring_update_ms milliseconds; each call moves at most one queued chunk to s:ring_chunks
255+ function ! s: ring_update ()
256+ call timer_start (g: llama_config .ring_update_ms, {- > s: ring_update ()})
257+
258+ " update only if in normal mode or if the cursor hasn't moved for a while
259+ if mode () !=# ' n' && reltimefloat (reltime (s: t_last_move )) < 3.0
260+ return
261+ endif
262+
263+ if len (s: ring_queued ) == 0
264+ return
265+ endif
266+
267+ " move the first queued chunk to the ring buffer
216268 if len (s: ring_chunks ) == g: llama_config .ring_n_chunks
217269 call remove (s: ring_chunks , 0 )
218270 endif
219271
220- call add (s: ring_chunks , {' data' : l: chunk , ' str' : l: chunk_str , ' time' : reltime (), ' filename' : expand (' %' )})
272+ call add (s: ring_chunks , remove (s: ring_queued , 0 ))
273+
274+ " let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
221275
222276 " send asynchronous job with the new extra context so that it is ready for the next FIM
223277 let l: extra_context = []
@@ -229,16 +283,16 @@ function! s:pick_chunk(text, no_mod, do_evict)
229283 \ })
230284 endfor
231285
286+ " no samplers needed here
232287 let l: request = json_encode ({
233288 \ ' prompt' : " " ,
234289 \ ' input_prefix' : " " ,
235290 \ ' input_suffix' : " " ,
236291 \ ' n_predict' : 1 ,
237292 \ ' penalty_last_n' : 0 ,
238- \ ' top_k' : 40 ,
239- \ ' top_p' : 0.99 ,
293+ \ ' temperature' : 0.0 ,
240294 \ ' stream' : v: false ,
241- \ ' samplers' : [" top_k " , " top_p " , " infill " ],
295+ \ ' samplers' : [" temperature " ],
242296 \ ' cache_prompt' : v: true ,
243297 \ ' extra_context' : l: extra_context ,
244298 \ ' t_max_prompt_ms' : 1 ,
@@ -409,6 +463,12 @@ function! llama#fim_cancel()
409463 silent ! iunmap <buffer> <Esc>
410464endfunction
411465
466+ function ! s: on_move ()
467+ let s: t_last_move = reltime ()
468+
469+ call llama#fim_cancel ()
470+ endfunction
471+
412472" callback that processes the result from the server
413473function ! s: fim_on_stdout (job_id, data, event ) dict
414474 let l: raw = join (a: data , " \n " )
@@ -511,9 +571,9 @@ function! s:fim_on_stdout(job_id, data, event) dict
511571 \ l: n_cached , l: n_ctx
512572 \ )
513573 else
514- let l: info = printf (" %s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms" ,
574+ let l: info = printf (" %s | context: %d / %d / r= %d / q=%d / e= %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms" ,
515575 \ g: llama_config .show_info == 2 ? l: prefix : ' llama.vim' ,
516- \ l: n_cached , l: n_ctx , len (s: ring_chunks ), s: ring_n_evict ,
576+ \ l: n_cached , l: n_ctx , len (s: ring_chunks ), len ( s: ring_queued ), s: ring_n_evict ,
517577 \ l: n_prompt , l: t_prompt_ms , l: s_prompt ,
518578 \ l: n_predict , l: t_predict_ms , l: s_predict ,
519579 \ 1000.0 * reltimefloat (reltime (s: t_fim_start ))
0 commit comments