
Commit 59c0756

llama.vim : async context processing
ggml-ci
1 parent 78dc4e9 commit 59c0756

File tree: 2 files changed, +72 -30 lines

  examples/llama.vim
  examples/server/server.cpp

examples/llama.vim

Lines changed: 68 additions & 26 deletions
@@ -17,7 +17,7 @@
 "
 " start the llama.cpp server with a FIM-compatible model. for example:
 "
-"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 --cache-reuse 512
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 512
 "
 "   --batch-size [512, model max context]
 "
@@ -54,7 +54,9 @@ highlight llama_hl_info guifg=#77ff2f
 "
 "   - completion request
 "   - yank
-"   - reading a file
+"   - entering a buffer
+"   - leaving a buffer
+"   - writing a file
 "
 " ring context parameters:
 "
@@ -208,6 +210,36 @@ function! s:pick_chunk(text, no_mod, do_evict)
     endif
 
     call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime()})
+
+    " send asynchronous job with the new extra context so that it is ready for the next FIM
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, l:chunk.str)
+    endfor
+
+    let l:request = json_encode({
+        \ 'prompt': "",
+        \ 'input_prefix': "",
+        \ 'input_suffix': "",
+        \ 'n_predict': 1,
+        \ 'penalty_last_n': 0,
+        \ 'top_k': 100,
+        \ 'stream': v:false,
+        \ 'samplers': ["top_k", "infill"],
+        \ 'cache_prompt': v:true,
+        \ 'extra_context': l:extra_context,
+        \ 't_max_prompt_ms': 1,
+        \ 't_max_predict_ms': 1
+        \ })
+
+    let l:curl_command = printf(
+        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
+        \ g:llama_config.endpoint, shellescape(l:request)
+        \ )
+
+    call jobstart(l:curl_command, {
+        \ 'on_exit': function('s:fim_on_exit')
+        \ })
 endfunction
 
 function! llama#fim(is_auto) abort
@@ -245,21 +277,6 @@ function! llama#fim(is_auto) abort
         \ . join(l:lines_suffix, "\n")
         \ . "\n"
 
-    " TODO: per-file location
-    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
-
-    " only gather chunks if the cursor has moved a lot
-    if a:is_auto && l:delta_y > 32
-        " randomly pick a prefix or a suffix chunk
-        if s:rand(0, 1)
-            call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
-        else
-            call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false)
-        endif
-
-        let s:pos_y_pick = s:pos_y
-    endif
-
     " array of strings
     let l:extra_context = []
     for l:chunk in s:ring_chunks
@@ -294,6 +311,21 @@ function! llama#fim(is_auto) abort
         \ 'is_auto': a:is_auto
         \ })
 
+    " TODO: per-file location
+    let l:delta_y = abs(s:pos_y - s:pos_y_pick)
+
+    " only gather chunks if the cursor has moved a lot
+    if a:is_auto && l:delta_y > 32
+        " randomly pick a prefix or a suffix chunk
+        if s:rand(0, 1)
+            call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
+        else
+            call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false)
+        endif
+
+        let s:pos_y_pick = s:pos_y
+    endif
+
     " this trick is needed to avoid the cursor shifting upon C-O when at the end of the line
     if !a:is_auto
         augroup llama_insert
@@ -427,7 +459,8 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:generation_settings = get(l:response, 'generation_settings', {})
     let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)
 
-    let l:n_cached = get(l:response, 'tokens_cached', 0)
+    let l:n_cached  = get(l:response, 'tokens_cached', 0)
+    let l:truncated = get(l:response, 'truncated', v:false)
 
     " if response.timings is available
     if len(get(l:response, 'timings', {})) > 0
@@ -466,22 +499,31 @@ function! s:fim_on_stdout(job_id, data, event) dict
     let l:id_vt_fim = nvim_create_namespace('vt_fim')
     let l:id_vt_info = nvim_create_namespace('vt_info')
 
-    " construct the info message and display it to the right of the current line
+    " construct the info message
     if g:llama_config.show_info > 0 && l:has_info
         " prefix the info string with whitespace in order to offset it to the right of the fim overlay
         let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3)
 
-        let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms",
-            \ g:llama_config.show_info == 2 ? l:prefix : '',
-            \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict,
-            \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
-            \ l:n_predict, l:t_predict_ms, l:s_predict,
-            \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
-            \ )
+        if l:truncated
+            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx
+                \ )
+        else
+            let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms",
+                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
+                \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict,
+                \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
+                \ l:n_predict, l:t_predict_ms, l:s_predict,
+                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
+                \ )
+        endif
 
         if g:llama_config.show_info == 1
+            " display it in the statusline
             let &statusline = l:info
         elseif g:llama_config.show_info == 2
+            " display it to the right of the current line
             call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, {
                 \ 'virt_text': [[l:info, 'llama_hl_info']],
                 \ 'virt_text_pos': 'eol',
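
The llama.vim changes above make context gathering non-blocking: whenever s:pick_chunk adds a chunk to the ring buffer, the plugin now fires a fire-and-forget completion request whose only purpose is to have the server ingest and cache the new extra context before the next real FIM request arrives. The sketch below condenses that warm-up pattern; the helper name s:warm_up_cache is hypothetical (the plugin runs these steps inline in s:pick_chunk and reuses its existing s:fim_on_exit callback, which this diff does not show):

    " sketch: asynchronous cache warm-up, assuming the plugin's script scope
    " (s:ring_chunks, g:llama_config); s:warm_up_cache is a hypothetical name
    function! s:warm_up_cache() abort
        let l:extra_context = []
        for l:chunk in s:ring_chunks
            call add(l:extra_context, l:chunk.str)
        endfor

        " near-zero time budgets: the server should not spend time predicting;
        " the request exists only so the extra context gets tokenized and
        " cached (cache_prompt) for reuse by the next FIM request
        let l:request = json_encode({
            \ 'prompt': "",
            \ 'n_predict': 1,
            \ 'cache_prompt': v:true,
            \ 'extra_context': l:extra_context,
            \ 't_max_prompt_ms': 1,
            \ 't_max_predict_ms': 1
            \ })

        " jobstart() returns immediately, so the editor never waits on this request
        call jobstart(printf(
            \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
            \ g:llama_config.endpoint, shellescape(l:request)), {})
    endfunction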

examples/server/server.cpp

Lines changed: 4 additions & 4 deletions
@@ -1965,12 +1965,12 @@ struct server_context {
                    }
                }
 
-                // for now pick FIM context to fit in half batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-                const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch/2)/4);
-                const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch/2 - 3) - n_suffix_take);
+                // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+                const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch)/4);
+                const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
 
                 // fill the rest of the context with extra chunks
-                const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch/2) - 2*slot.n_predict), slot.extra_tokens.size());
+                const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
 
                 prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
                 suffix_tokens.resize(n_suffix_take);
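
The server.cpp change doubles the token budget for the local FIM context: the prefix/suffix around the cursor may now fill a whole batch instead of half of one, and the budget for extra chunks shrinks by the same amount. As a worked example, take the n_batch = 1024 suggested by the updated llama.vim header together with assumed, purely illustrative slot values n_ctx = 8192 and n_predict = 128:

    n_suffix_take <= (1024)/4                    = 256
    n_prefix_take <= (1024 - 3) - 256            = 765
    n_extra_take  <= max(0, 8192 - 1024 - 2*128) = 6912

Before this change the same expressions capped them at 128, 381 and 7424 respectively: the local window around the cursor was confined to half a batch, leaving correspondingly more room for ring-buffer chunks.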
