
Commit 59c0756

llama.vim : async context processing
ggml-ci
1 parent 78dc4e9 commit 59c0756

File tree: 2 files changed, +72 -30 lines

  examples/llama.vim
  examples/server/server.cpp

examples/llama.vim

Lines changed: 68 additions & 26 deletions
@@ -17,7 +17,7 @@
 "
 " start the llama.cpp server with a FIM-compatible model. for example:
 "
-"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 --cache-reuse 512
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 512
 "
 "   --batch-size [512, model max context]
 "
@@ -54,7 +54,9 @@ highlight llama_hl_info guifg=#77ff2f
 "
 "   - completion request
 "   - yank
-"   - reading a file
+"   - entering a buffer
+"   - leaving a buffer
+"   - writing a file
 "
 " ring context parameters:
 "
@@ -208,6 +210,36 @@ function! s:pick_chunk(text, no_mod, do_evict)
     endif
 
     call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime()})
+
+    " send asynchronous job with the new extra context so that it is ready for the next FIM
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, l:chunk.str)
+    endfor
+
+    let l:request = json_encode({
+        \ 'prompt': "",
+        \ 'input_prefix': "",
+        \ 'input_suffix': "",
+        \ 'n_predict': 1,
+        \ 'penalty_last_n': 0,
+        \ 'top_k': 100,
+        \ 'stream': v:false,
+        \ 'samplers': ["top_k", "infill"],
+        \ 'cache_prompt': v:true,
+        \ 'extra_context': l:extra_context,
+        \ 't_max_prompt_ms': 1,
+        \ 't_max_predict_ms': 1
+        \ })
+
+    let l:curl_command = printf(
+        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
+        \ g:llama_config.endpoint, shellescape(l:request)
+        \ )
+
+    call jobstart(l:curl_command, {
+        \ 'on_exit': function('s:fim_on_exit')
+        \ })
 endfunction
 
 function! llama#fim(is_auto) abort
@@ -245,21 +277,6 @@ function! llama#fim(is_auto) abort
         \ . join(l:lines_suffix, "\n")
         \ . "\n"
 
-    " TODO: per-file location
-    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
-
-    " only gather chunks if the cursor has moved a lot
-    if a:is_auto && l:delta_y > 32
-        " randomly pick a prefix or a suffix chunk
-        if s:rand(0, 1)
-            call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
-        else
-            call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false)
-        endif
-
-        let s:pos_y_pick = s:pos_y
-    endif
-
     " array of strings
     let l:extra_context = []
     for l:chunk in s:ring_chunks
@@ -294,6 +311,21 @@ function! llama#fim(is_auto) abort
         \ 'is_auto': a:is_auto
         \ })
 
+    " TODO: per-file location
+    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
+
+    " only gather chunks if the cursor has moved a lot
+    if a:is_auto && l:delta_y > 32
+        " randomly pick a prefix or a suffix chunk
+        if s:rand(0, 1)
+            call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
+        else
+            call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false)
+        endif
+
+        let s:pos_y_pick = s:pos_y
+    endif
+
     " this trick is needed to avoid the cursor shifting upon C-O when at the end of the line
     if !a:is_auto
         augroup llama_insert
@@ -427,7 +459,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:generation_settings = get(l:response, 'generation_settings', {})
     let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)
 
-    let l:n_cached = get(l:response, 'tokens_cached', 0)
+    let l:n_cached  = get(l:response, 'tokens_cached', 0)
+    let l:truncated = get(l:response, 'truncated', v:false)
 
     " if response.timings is available
     if len(get(l:response, 'timings', {})) > 0
@@ -466,22 +499,31 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:id_vt_fim = nvim_create_namespace('vt_fim')
     let l:id_vt_info = nvim_create_namespace('vt_info')
 
-    " construct the info message and display it to the right of the current line
+    " construct the info message
     if g:llama_config.show_info > 0 && l:has_info
         " prefix the info string with whitespace in order to offset it to the right of the fim overlay
         let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3)
 
-        let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms",
-            \ g:llama_config.show_info == 2 ? l:prefix : '',
-            \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict,
-            \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
-            \ l:n_predict, l:t_predict_ms, l:s_predict,
-            \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
-            \ )
+        if l:truncated
+            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx
+                \ )
+        else
+            let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict,
+                \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
+                \ l:n_predict, l:t_predict_ms, l:s_predict,
+                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
+                \ )
+        endif
 
         if g:llama_config.show_info == 1
+            " display it in the statusline
             let &statusline = l:info
         elseif g:llama_config.show_info == 2
+            " display it to the right of the current line
             call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, {
                 \ 'virt_text': [[l:info, 'llama_hl_info']],
                 \ 'virt_text_pos': 'eol',
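
The llama.vim changes above make context gathering non-blocking: whenever s:pick_chunk adds a chunk to the ring buffer, the plugin now fires a fire-and-forget completion request whose only purpose is to have the server ingest and cache the new extra context before the next real FIM request arrives. The sketch below condenses that warm-up pattern; the helper name s:warm_up_cache is hypothetical (the plugin runs these steps inline in s:pick_chunk and reuses its existing s:fim_on_exit callback, which this diff does not show):

    " sketch: asynchronous cache warm-up, assuming the plugin's script scope
    " (s:ring_chunks, g:llama_config); s:warm_up_cache is a hypothetical name
    function! s:warm_up_cache() abort
        let l:extra_context = []
        for l:chunk in s:ring_chunks
            call add(l:extra_context, l:chunk.str)
        endfor

        " near-zero time budgets: the server should not spend time predicting;
        " the request exists only so the extra context gets tokenized and
        " cached (cache_prompt) for reuse by the next FIM request
        let l:request = json_encode({
            \ 'prompt': "",
            \ 'n_predict': 1,
            \ 'cache_prompt': v:true,
            \ 'extra_context': l:extra_context,
            \ 't_max_prompt_ms': 1,
            \ 't_max_predict_ms': 1
            \ })

        " jobstart() returns immediately, so the editor never waits on this request
        call jobstart(printf(
            \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
            \ g:llama_config.endpoint, shellescape(l:request)), {})
    endfunction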

examples/server/server.cpp

Lines changed: 4 additions & 4 deletions
@@ -1965,12 +1965,12 @@ struct server_context {
                    }
                }
 
-                // for now pick FIM context to fit in half batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-                const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch/2)/4);
-                const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch/2 - 3) - n_suffix_take);
+                // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+                const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch)/4);
+                const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
 
                 // fill the rest of the context with extra chunks
-                const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch/2) - 2*slot.n_predict), slot.extra_tokens.size());
+                const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
 
                 prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
                 suffix_tokens.resize(n_suffix_take);
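
The server.cpp change doubles the token budget for the local FIM context: the prefix/suffix around the cursor may now fill a whole batch instead of half of one, and the budget for extra chunks shrinks by the same amount. As a worked example, take the n_batch = 1024 suggested by the updated llama.vim header together with assumed, purely illustrative slot values n_ctx = 8192 and n_predict = 128:

    n_suffix_take <= (1024)/4                    = 256
    n_prefix_take <= (1024 - 3) - 256            = 765
    n_extra_take  <= max(0, 8192 - 1024 - 2*128) = 6912

Before this change the same expressions capped them at 128, 381 and 7424 respectively: the local window around the cursor was confined to half a batch, leaving correspondingly more room for ring-buffer chunks.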
