@@ -155,6 +155,7 @@ class llama_token_data_array(Structure):
 # int n_gpu_layers; // number of layers to store in VRAM
 # int main_gpu; // the GPU that is used for scratch and small tensors
 # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+# bool low_vram; // if true, reduce VRAM usage at the cost of performance
 # int seed; // RNG seed, -1 for random

 # bool f16_kv; // use fp16 for KV cache
@@ -177,6 +178,7 @@ class llama_context_params(Structure):
177178 ("n_gpu_layers" , c_int ),
178179 ("main_gpu" , c_int ),
179180 ("tensor_split" , c_float * LLAMA_MAX_DEVICES .value ),
181+ ("low_vram" , c_bool ),
180182 ("seed" , c_int ),
181183 ("f16_kv" , c_bool ),
182184 (
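For orientation, a minimal sketch of how a caller could opt into the new field from Python once this change lands; `llama_context_default_params` and `llama_init_from_file` are existing bindings in this module, while the model path and layer count below are placeholders, not part of this commit:

import llama_cpp

params = llama_cpp.llama_context_default_params()
params.low_vram = True  # reduce VRAM usage at the cost of performance
params.n_gpu_layers = 20  # placeholder value, not from this commit
ctx = llama_cpp.llama_init_from_file(b"models/7B/ggml-model.bin", params)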
@@ -555,6 +557,26 @@ def llama_n_embd(ctx: llama_context_p) -> int:
 _lib.llama_n_embd.restype = c_int


+# // Get the vocabulary as output parameters.
+# // Returns number of results.
+# LLAMA_API int llama_get_vocab(
+#     const struct llama_context * ctx,
+#     const char * * strings,
+#     float * scores,
+#     int capacity);
+def llama_get_vocab(
+    ctx: llama_context_p,
+    strings,  # type: Array[c_char_p] # type: ignore
+    scores,  # type: Array[c_float] # type: ignore
+    capacity: c_int,
+) -> int:
+    return _lib.llama_get_vocab(ctx, strings, scores, capacity)
+
+
+_lib.llama_get_vocab.argtypes = [llama_context_p, POINTER(c_char_p), POINTER(c_float), c_int]
+_lib.llama_get_vocab.restype = c_int
+
+
 # Token logits obtained from the last call to llama_eval()
 # The logits for the last token are stored in the last row
 # Can be mutated in order to change the probabilities of the next token
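To illustrate the new `llama_get_vocab` binding, a hedged usage sketch: the caller pre-allocates ctypes arrays sized to the vocabulary and passes them as output buffers. `llama_n_vocab` is an existing binding in this module; the `ctx` handle and the result loop are illustrative only:

from ctypes import c_char_p, c_float

import llama_cpp

n_vocab = llama_cpp.llama_n_vocab(ctx)  # ctx obtained earlier from llama_init_from_file
strings = (c_char_p * n_vocab)()  # output buffer for token strings
scores = (c_float * n_vocab)()  # output buffer for token scores
n = llama_cpp.llama_get_vocab(ctx, strings, scores, n_vocab)
for i in range(n):
    print(strings[i], scores[i])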