 "
 " start the llama.cpp server with a FIM-compatible model. for example:
 "
-"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 --cache-reuse 512
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 512
 "
 "   --batch-size [512, model max context]
 "
@@ -54,7 +54,9 @@ highlight llama_hl_info guifg=#77ff2f
 "
 "   - completion request
 "   - yank
-"   - reading a file
+"   - entering a buffer
+"   - leaving a buffer
+"   - writing a file
 "
 " ring context parameters:
 "
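
Note: the events above replace plain file reads as the triggers for gathering ring-buffer chunks. A sketch of how such triggers could be wired up with autocommands follows; the augroup name and the exact s:pick_chunk arguments are illustrative assumptions, not code from this commit:

    augroup llama_ring
        autocmd!
        " a yank marks text the user considers relevant - pick it as a chunk
        autocmd TextYankPost * call s:pick_chunk(v:event.regcontents, v:false, v:true)
        " when entering/leaving a buffer or writing a file, pick the lines around the cursor
        autocmd BufEnter,BufLeave,BufWritePost *
            \ call s:pick_chunk(getline(max([1, line('.') - 32]), min([line('$'), line('.') + 32])), v:false, v:true)
    augroup END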
@@ -208,6 +210,36 @@ function! s:pick_chunk(text, no_mod, do_evict)
     endif
 
     call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime()})
+
+    " send an asynchronous job with the new extra context so that it is ready for the next FIM
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, l:chunk.str)
+    endfor
+
+    let l:request = json_encode({
+        \ 'prompt':           "",
+        \ 'input_prefix':     "",
+        \ 'input_suffix':     "",
+        \ 'n_predict':        1,
+        \ 'penalty_last_n':   0,
+        \ 'top_k':            100,
+        \ 'stream':           v:false,
+        \ 'samplers':         ["top_k", "infill"],
+        \ 'cache_prompt':     v:true,
+        \ 'extra_context':    l:extra_context,
+        \ 't_max_prompt_ms':  1,
+        \ 't_max_predict_ms': 1
+        \ })
+
+    let l:curl_command = printf(
+        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
+        \ g:llama_config.endpoint, shellescape(l:request)
+        \ )
+
+    call jobstart(l:curl_command, {
+        \ 'on_exit': function('s:fim_on_exit')
+        \ })
 endfunction
 
 function! llama#fim(is_auto) abort
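
The block added to s:pick_chunk above is a fire-and-forget cache warm-up: n_predict is 1 and both t_max_prompt_ms and t_max_predict_ms are capped at 1 ms, so the server spends essentially no time generating; with cache_prompt enabled, the useful side effect is that the new extra_context chunks are already ingested into the server's prompt cache when the next real FIM request arrives. The equivalent request by hand, assuming the plugin's default endpoint of http://127.0.0.1:8012/infill, would look roughly like:

    $ curl -s -X POST http://127.0.0.1:8012/infill \
        -H "Content-Type: application/json" \
        -d '{"prompt": "", "input_prefix": "", "input_suffix": "", "n_predict": 1, "stream": false, "cache_prompt": true, "samplers": ["top_k", "infill"], "extra_context": ["..."], "t_max_prompt_ms": 1, "t_max_predict_ms": 1}'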
@@ -245,21 +277,6 @@ function! llama#fim(is_auto) abort
         \ . join(l:lines_suffix, "\n")
         \ . "\n"
 
-    " TODO: per-file location
-    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
-
-    " only gather chunks if the cursor has moved a lot
-    if a:is_auto && l:delta_y > 32
-        " randomly pick a prefix or a suffix chunk
-        if s:rand(0, 1)
-            call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
-        else
-            call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false)
-        endif
-
-        let s:pos_y_pick = s:pos_y
-    endif
-
     " array of strings
     let l:extra_context = []
     for l:chunk in s:ring_chunks
@@ -294,6 +311,21 @@ function! llama#fim(is_auto) abort
        \ 'is_auto': a:is_auto
        \ })
 
+    " TODO: per-file location
+    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
+
+    " only gather chunks if the cursor has moved a lot
+    if a:is_auto && l:delta_y > 32
+        " randomly pick a prefix or a suffix chunk
+        if s:rand(0, 1)
+            call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
+        else
+            call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false)
+        endif
+
+        let s:pos_y_pick = s:pos_y
+    endif
+
     " this trick is needed to avoid the cursor shifting upon C-O when at the end of the line
     if !a:is_auto
         augroup llama_insert
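
This chunk-gathering block is the same code that was removed earlier in llama#fim, moved from before the completion request to after it. A plausible reading of the change: scanning the buffer for new chunks no longer delays the request that was just dispatched, and the picked chunks still reach the server ahead of time through the background job that s:pick_chunk now starts.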
@@ -427,7 +459,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:generation_settings = get(l:response, 'generation_settings', {})
     let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)
 
-    let l:n_cached = get(l:response, 'tokens_cached', 0)
+    let l:n_cached  = get(l:response, 'tokens_cached', 0)
+    let l:truncated = get(l:response, 'truncated', v:false)
 
     " if response.timings is available
     if len(get(l:response, 'timings', {})) > 0
@@ -466,22 +499,31 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:id_vt_fim  = nvim_create_namespace('vt_fim')
     let l:id_vt_info = nvim_create_namespace('vt_info')
 
-    " construct the info message and display it to the right of the current line
+    " construct the info message
     if g:llama_config.show_info > 0 && l:has_info
         " prefix the info string with whitespace in order to offset it to the right of the fim overlay
         let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3)
 
-        let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms",
-            \ g:llama_config.show_info == 2 ? l:prefix : '',
-            \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict,
-            \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
-            \ l:n_predict, l:t_predict_ms, l:s_predict,
-            \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
-            \ )
+        if l:truncated
+            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx
+                \ )
+        else
+            let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict,
+                \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
+                \ l:n_predict, l:t_predict_ms, l:s_predict,
+                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
+                \ )
+        endif
 
         if g:llama_config.show_info == 1
+            " display it in the statusline
             let &statusline = l:info
         elseif g:llama_config.show_info == 2
+            " display it to the right of the current line
             call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, {
                 \ 'virt_text': [[l:info, 'llama_hl_info']],
                 \ 'virt_text_pos': 'eol',
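
Both display modes are visible in the branches above: show_info == 1 writes the info string to the statusline, show_info == 2 renders it as virtual text after the current line, and 0 presumably disables it given the > 0 guard. A minimal configuration sketch (only show_info is taken from this diff; the dict initialization is an assumption):

    " 0: off, 1: statusline, 2: virtual text to the right of the current line
    let g:llama_config = get(g:, 'llama_config', {})
    let g:llama_config.show_info = 2

When the new truncated warning fires, the remedies are the ones spelled out in the message: restart llama-server with a larger context size (its -c flag) or lower g:llama_config.ring_n_chunks so that less extra context is submitted.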