|
1 | 1 | llama.vim *llama* |
2 | | -================================================================================ |
3 | 2 |
|
4 | 3 | LLM-based text completion using llama.cpp |
5 | 4 |
|
| 5 | +================================================================================ |
| 6 | +Requirements
| 7 | + |
6 | 8 | Requires: |
7 | 9 |
|
8 | 10 | - neovim or vim 9.1+ |
9 | 11 | - curl |
10 | 12 | - llama.cpp server instance |
11 | 13 | - FIM-compatible model |
12 | 14 |
|
13 | | -Sample config: |
| 15 | + |
| 16 | +================================================================================ |
| 17 | +Shortcuts
14 | 18 |
|
15 | 19 | - Tab - accept the current suggestion |
16 | 20 | - Shift+Tab - accept just the first line of the suggestion |
17 | 21 | - Ctrl+B - accept just the first word of the suggestion |
18 | 22 | - Ctrl+F - toggle FIM completion manually |
19 | 23 |
|
20 | | -Start the llama.cpp server with a FIM-compatible model. For example: |
| 24 | +================================================================================ |
| 25 | +How to start the server |
21 | 26 |
|
22 | | -$ llama-server \ |
23 | | - -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 \ |
24 | | - --ubatch-size 512 --batch-size 1024 \ |
25 | | - --ctx-size 0 --cache-reuse 256 |
| 27 | +Start the llama.cpp server with a FIM-compatible model. For example: |
26 | 28 |
|
27 | | ---batch-size [512, model max context] |
| 29 | +>sh |
| 30 | + $ llama-server \ |
| 31 | + -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 \ |
| 32 | + --ubatch-size 512 --batch-size 1024 \ |
| 33 | + --ctx-size 0 --cache-reuse 256 |
| 34 | +< |
| 35 | +`--batch-size` [512, model max context] |
28 | 36 |
|
29 | 37 | Adjust the batch size to control how much of the provided local context will
30 | 38 | be used during the inference. Lower values will use a smaller part of the
31 | 39 | context around the cursor, which will result in faster processing.
32 | 40 |
|
33 | | ---ubatch-size [64, 2048] |
| 41 | +`--ubatch-size` [64, 2048] |
34 | 42 |
|
35 | 43 | Chunks the batch into smaller chunks for faster processing, depending on the
36 | 44 | specific hardware. Use llama-bench to profile and determine the best size.
37 | 45 |
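| | +For example, a quick profile over a few candidate sizes could look like the
| | +sketch below (the value list is only an illustration and the exact flag
| | +spellings can differ between llama.cpp versions):
| | +>sh
| | +    $ llama-bench -m {model.gguf} -ngl 99 -p 2048 \
| | +        -ub 64,128,256,512,1024,2048
| | +<
| | +
| | +Pick the `-ub` value with the best prompt processing (pp) throughput and pass
| | +it to the server via `--ubatch-size`.
| | +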
|
38 | | ---ctx-size [1024, model max context], 0 - use model max context |
| 46 | +`--ctx-size` [1024, model max context], 0 - use model max context |
39 | 47 |
|
40 | 48 | The maximum amount of context that the server will use. Ideally, this should |
41 | 49 | be the same as the model's max context size, but if your device does not have |
42 | 50 | enough memory to handle the full context, you can try to reduce this value. |
43 | 51 |
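| | +For example, on a device that cannot hold the full context in memory, a
| | +reduced-memory invocation could look like this sketch (identical to the
| | +example above except for `--ctx-size`; 8192 is only an illustration):
| | +>sh
| | +    $ llama-server \
| | +        -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 \
| | +        --ubatch-size 512 --batch-size 1024 \
| | +        --ctx-size 8192 --cache-reuse 256
| | +<
| | +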
|
44 | | ---cache-reuse (ge:llama_config.n_predict, 1024] |
| 52 | +`--cache-reuse` (g:llama_config.n_predict, 1024]
45 | 53 |
|
46 | 54 | This should be either 0 (disabled) or strictly larger than
47 | 55 | g:llama_config.n_predict. Using a non-zero value enables context reuse on the
48 | 56 | server side, which dramatically improves the performance at large contexts. A
49 | 57 | value of 256 should be good for all cases.
50 | 58 |
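| | +As a trimmed-down sketch of the two modes (see the full example above for the
| | +remaining flags):
| | +>sh
| | +    # context reuse disabled
| | +    $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --cache-reuse 0
| | +
| | +    # context reuse enabled: 256 > the default g:llama_config.n_predict of 128
| | +    $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --cache-reuse 256
| | +<
| | +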
|
51 | | -More info: |
| 59 | +================================================================================ |
| 60 | +Configuration *llama_config* |
| 61 | + |
| 62 | +To customize the behaviour of `llama.vim`, you can set/modify the `g:llama_config` |
| 63 | +variable. |
| 64 | + |
| 65 | +Currently the default config is: |
| 66 | +>vim |
| 67 | + let s:default_config = { |
| 68 | + \ 'endpoint': 'http://127.0.0.1:8012/infill', |
| 69 | + \ 'api_key': '', |
| 70 | + \ 'n_prefix': 256, |
| 71 | + \ 'n_suffix': 64, |
| 72 | + \ 'n_predict': 128, |
| 73 | + \ 't_max_prompt_ms': 500, |
| 74 | + \ 't_max_predict_ms': 500, |
| 75 | + \ 'show_info': 2, |
| 76 | + \ 'auto_fim': v:true, |
| 77 | + \ 'max_line_suffix': 8, |
| 78 | + \ 'max_cache_keys': 250, |
| 79 | + \ 'ring_n_chunks': 16, |
| 80 | + \ 'ring_chunk_size': 64, |
| 81 | + \ 'ring_scope': 1024, |
| 82 | + \ 'ring_update_ms': 1000, |
| 83 | + \ } |
| 84 | +< |
| 85 | + |
| 86 | +- {endpoint} llama.cpp server endpoint (see the connectivity check below)
| 87 | + |
| 88 | +- {api_key} llama.cpp server api key (optional) |
| 89 | + |
| 90 | +- {n_prefix} number of lines before the cursor location to include |
| 91 | + in the local prefix |
| 92 | + |
| 93 | +- {n_suffix} number of lines after the cursor location to include |
| 94 | + in the local suffix |
| 95 | + |
| 96 | +- {n_predict} max number of tokens to predict |
| 97 | + |
| 99 | +- {t_max_prompt_ms} max allotted time for the prompt processing
| 99 | + (TODO: not yet supported) |
| 100 | + |
| 102 | +- {t_max_predict_ms} max allotted time for the prediction
| 102 | + |
| 103 | +- {show_info} show extra info about the inference |
| 104 | + (0 - disabled, 1 - statusline, 2 - inline) |
| 105 | + |
| 106 | +- {auto_fim} trigger FIM completion automatically on cursor movement |
| 107 | + |
| 108 | +- {max_line_suffix} do not auto-trigger FIM completion if there are |
| 109 | + more than this number of characters to the right |
| 110 | + of the cursor |
| 111 | + |
| 112 | +- {max_cache_keys} max number of cached completions to keep in result_cache |
| 113 | + |
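| | +To verify that the configured {endpoint} is reachable, a quick connectivity
| | +check along these lines can help (a sketch: the /health route and the Bearer
| | +token header are llama.cpp server conventions and may change between versions):
| | +>sh
| | +    $ curl http://127.0.0.1:8012/health
| | +
| | +    # with an api key configured on the server:
| | +    $ curl -H "Authorization: Bearer <your-api-key>" http://127.0.0.1:8012/health
| | +<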
| 114 | + |
| 115 | +Parameters for the ring buffer with extra context:
| 116 | + |
| 117 | +- {ring_n_chunks} max number of chunks to pass as extra context to the |
| 118 | + server (0 to disable) |
| 119 | + |
| 120 | +- {ring_chunk_size} max size of the chunks (in number of lines) |
| 121 | + Note: Adjust these numbers so that you don't overrun
| 122 | + your context. At `ring_n_chunks = 64` and
| 123 | + `ring_chunk_size = 64` you need ~32k context
| | + (see the sizing sketch after this list).
| 124 | + |
| 125 | +- {ring_scope} the range around the cursor position (in number of |
| 126 | + lines) for gathering chunks after FIM |
| 127 | + |
| 128 | +- {ring_update_ms} how often to process queued chunks in normal mode |
| 129 | + |
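| | +For instance, a sketch of shrinking the ring buffer for a model with a small
| | +context window (the values are illustrative, not recommendations; as in
| | +example 1 below, unspecified keys keep their defaults):
| | +>vim
| | +    " put before llama.vim loads
| | +    let g:llama_config = {
| | +        \ 'ring_n_chunks':   8,
| | +        \ 'ring_chunk_size': 32,
| | +        \ 'ring_scope':      512,
| | +        \ }
| | +<
| | +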
| 130 | +Example: |
| 131 | + |
| 132 | +1. Disable the inline info (vimscript): |
| 133 | +>vim |
| 134 | + " put before llama.vim loads |
| 135 | + let g:llama_config = { 'show_info': 0 } |
| 136 | +<
| | +
| 137 | +2. Same, but setting the specific variable directly:
| 138 | +>vim |
| 139 | + let g:llama_config.show_info = v:false |
| 140 | +< |
| 141 | + |
| 142 | +3. Disable auto FIM completion and increase the local context (lazy.nvim, lua):
| 143 | +>lua |
| 144 | + { |
| 145 | + 'ggml-org/llama.vim', |
| 146 | + init = function() |
| 147 | + vim.g.llama_config = { |
| 148 | + n_prefix = 1024, |
| 149 | + n_suffix = 1024,
| 150 | + auto_fim = false, |
| 151 | + } |
| 152 | + end, |
| 153 | + } |
| 154 | +< |
| 155 | + |
| 156 | +To adjust the colors of the hint and info texts, `llama.vim` provides the
| 157 | +highlight groups `llama_hl_hint` and `llama_hl_info`. You can modify these
| 158 | +groups in the usual way.
| 159 | + |
| 160 | +Example: |
| 161 | + |
| 162 | +vimscript: |
| 163 | +>vim |
| 164 | + highlight llama_hl_hint guifg=#f8732e ctermfg=209 |
| 165 | + highlight llama_hl_info guifg=#50fa7b ctermfg=119 |
| 166 | +< |
| 167 | +lua: |
| 168 | +>lua |
| 169 | + vim.api.nvim_set_hl(0, "llama_hl_hint", {fg = "#f8732e", ctermfg=209}) |
| 170 | + vim.api.nvim_set_hl(0, "llama_hl_info", {fg = "#50fa7b", ctermfg=119}) |
| 171 | +< |
| 172 | + |
| 173 | +================================================================================ |
| 174 | +More Info |
52 | 175 |
|
53 | 176 | - https://github.com/ggml-org/llama.vim |
54 | 177 | - https://github.com/ggerganov/llama.cpp/pull/9787 |
|