diff --git a/autoload/llama.vim b/autoload/llama.vim index 8f4c290..f1da138 100644 --- a/autoload/llama.vim +++ b/autoload/llama.vim @@ -307,9 +307,9 @@ function! s:ring_update() \ 'samplers': ["temperature"], \ 'cache_prompt': v:true, \ 't_max_prompt_ms': 1, - \ 't_max_predict_ms': 1 + \ 't_max_predict_ms': 1, + \ 'response_fields': [""] \ }) - let l:curl_command = [ \ "curl", \ "--silent", @@ -420,7 +420,20 @@ function! llama#fim(is_auto, cache) abort \ 'samplers': ["top_k", "top_p", "infill"], \ 'cache_prompt': v:true, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, - \ 't_max_predict_ms': g:llama_config.t_max_predict_ms + \ 't_max_predict_ms': g:llama_config.t_max_predict_ms, + \ 'response_fields': [ + \ "content", + \ "timings/prompt_n", + \ "timings/prompt_ms", + \ "timings/prompt_per_token_ms", + \ "timings/prompt_per_second", + \ "timings/predicted_n", + \ "timings/predicted_ms", + \ "timings/predicted_per_token_ms", + \ "timings/predicted_per_second", + \ "truncated", + \ "tokens_cached", + \ ], \ }) let l:curl_command = [ @@ -662,24 +675,21 @@ function! 
s:fim_on_stdout(hash, cache, pos_x, pos_y, is_auto, job_id, data, even call remove(s:content, -1) endwhile - let l:generation_settings = get(l:response, 'generation_settings', {}) - let l:n_ctx = get(l:generation_settings, 'n_ctx', 0) - - let l:n_cached = get(l:response, 'tokens_cached', 0) - let l:truncated = get(l:response, 'truncated', v:false) + let l:n_cached = get(l:response, 'tokens_cached', 0) + let l:truncated = get(l:response, 'truncated', v:false) " if response.timings is available - if len(get(l:response, 'timings', {})) > 0 + if has_key(l:response, 'timings/prompt_n') && has_key(l:response, 'timings/prompt_ms') && has_key(l:response, 'timings/prompt_per_second') + \ && has_key(l:response, 'timings/predicted_n') && has_key(l:response, 'timings/predicted_ms') && has_key(l:response, 'timings/predicted_per_second') let l:has_info = v:true - let l:timings = get(l:response, 'timings', {}) - let l:n_prompt = get(l:timings, 'prompt_n', 0) - let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) - let l:s_prompt = get(l:timings, 'prompt_per_second', 0) + let l:n_prompt = get(l:response, 'timings/prompt_n', 0) + let l:t_prompt_ms = get(l:response, 'timings/prompt_ms', 1) + let l:s_prompt = get(l:response, 'timings/prompt_per_second', 0) - let l:n_predict = get(l:timings, 'predicted_n', 0) - let l:t_predict_ms = get(l:timings, 'predicted_ms', 1) - let l:s_predict = get(l:timings, 'predicted_per_second', 0) + let l:n_predict = get(l:response, 'timings/predicted_n', 0) + let l:t_predict_ms = get(l:response, 'timings/predicted_ms', 1) + let l:s_predict = get(l:response, 'timings/predicted_per_second', 0) endif " if response was pulled from cache @@ -772,9 +782,9 @@ function! 
s:fim_on_stdout(hash, cache, pos_x, pos_y, is_auto, job_id, data, even let l:prefix = ' ' if l:truncated - let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks", + let l:info = printf("%s | WARNING: the context is full: %d, increase the server context size or reduce g:llama_config.ring_n_chunks", \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', - \ l:n_cached, l:n_ctx + \ l:n_cached \ ) elseif l:is_cached let l:info = printf("%s | C: %d / %d, | t: %.2f ms", @@ -783,9 +793,9 @@ function! s:fim_on_stdout(hash, cache, pos_x, pos_y, is_auto, job_id, data, even \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) \ ) else - let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", + let l:info = printf("%s | c: %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', - \ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued), + \ l:n_cached, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued), \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start))