@@ -942,6 +942,7 @@ class llama_context_params(ctypes.Structure):
 #     void * imatrix;      // pointer to importance matrix data
 #     void * kv_overrides; // pointer to vector containing overrides
 #     void * tensor_types; // pointer to vector containing tensor types
+#     void * prune_layers; // pointer to vector containing layer indices to prune
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
     """Parameters for llama_model_quantize
@@ -959,6 +960,7 @@ class llama_model_quantize_params(ctypes.Structure):
         imatrix (ctypes.c_void_p): pointer to importance matrix data
         kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
         tensor_types (ctypes.c_void_p): pointer to vector containing tensor types
+        prune_layers (ctypes.c_void_p): pointer to vector containing layer indices to prune
     """

     if TYPE_CHECKING:
@@ -974,6 +976,7 @@ class llama_model_quantize_params(ctypes.Structure):
         imatrix: ctypes.c_void_p
         kv_overrides: ctypes.c_void_p
         tensor_types: ctypes.c_void_p
+        prune_layers: ctypes.c_void_p

     _fields_ = [
         ("nthread", ctypes.c_int32),
@@ -988,6 +991,7 @@ class llama_model_quantize_params(ctypes.Structure):
988991 ("imatrix" , ctypes .c_void_p ),
989992 ("kv_overrides" , ctypes .c_void_p ),
990993 ("tensor_types" , ctypes .c_void_p ),
994+ ("prune_layers" , ctypes .c_void_p ),
991995 ]
992996
993997
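For orientation, a minimal sketch of driving quantization through these bindings follows. The file names are placeholders, and `prune_layers` is assumed to behave like `kv_overrides` and `tensor_types`, i.e. to point at a vector managed on the native side, so it is left as NULL when the call is made from pure Python:

```python
import ctypes

import llama_cpp

# Hedged sketch: file names are placeholders, and prune_layers is assumed to
# expect a pointer to a natively managed vector (as kv_overrides does), so it
# is left as None (NULL) rather than populated from Python.
params = llama_cpp.llama_model_quantize_default_params()
params.nthread = 4
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M
params.prune_layers = None  # no layer pruning requested

ret = llama_cpp.llama_model_quantize(
    b"model-f16.gguf",
    b"model-q4_k_m.gguf",
    ctypes.byref(params),
)
if ret != 0:
    raise RuntimeError(f"llama_model_quantize failed with code {ret}")
```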
@@ -2473,23 +2477,26 @@ def llama_encode(ctx: llama_context_p, batch: llama_batch, /) -> int:
 # // Requires the context to have a memory.
 # // For encode-decoder contexts, processes the batch using the decoder.
 # // Positive return values does not mean a fatal error, but rather a warning.
-# // Upon non-zero return values, the memory state is restored to the state before this call
+# // Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
+# // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
+# // Upon other return values, the memory state is restored to the state before this call
 # //   0 - success
 # //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-# //   2 - aborted
+# //   2 - aborted (processed ubatches will remain in the context's memory)
 # //  -1 - invalid input batch
 # // < -1 - error
+# // < -1 - fatal error (processed ubatches will remain in the context's memory)
 # LLAMA_API int32_t llama_decode(
 #         struct llama_context * ctx,
-#         struct llama_batch batch);
+#         struct llama_batch batch);
 @ctypes_function("llama_decode", [llama_context_p_ctypes, llama_batch], ctypes.c_int32)
 def llama_decode(ctx: llama_context_p, batch: llama_batch, /) -> int:
     """Positive return values does not mean a fatal error, but rather a warning.
     0 - success
     1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    2 - aborted
+    2 - aborted (processed ubatches will remain in the context's memory)
     -1 - invalid input batch
-    < -1 - error"""
+    < -1 - fatal error (processed ubatches will remain in the context's memory)
+    """
     ...

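The practical consequence of the reworded return codes is in error handling: on abort (2) or a fatal error (< -1), the already-processed ubatches are no longer rolled back, so the caller should inspect the memory state rather than assume nothing happened. A hedged sketch, assuming the llama_get_memory / llama_memory_seq_pos_min / llama_memory_seq_pos_max bindings exposed elsewhere in this module:

```python
import llama_cpp

def decode_checked(ctx, batch, seq_id: int = 0) -> int:
    """Sketch of llama_decode error handling under the new semantics."""
    ret = llama_cpp.llama_decode(ctx, batch)
    if ret == 1:
        # No KV slot found: memory was restored, so retry with a smaller
        # batch or a larger context.
        return ret
    if ret == -1:
        raise ValueError("invalid input batch")
    if ret == 2 or ret < -1:
        # Aborted or fatal error: ubatches processed before the failure stay
        # in the context's memory, so query what actually landed.
        mem = llama_cpp.llama_get_memory(ctx)
        lo = llama_cpp.llama_memory_seq_pos_min(mem, seq_id)
        hi = llama_cpp.llama_memory_seq_pos_max(mem, seq_id)
        print(f"seq {seq_id} now spans positions [{lo}, {hi}] after failure {ret}")
    return ret
```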
@@ -3096,6 +3103,7 @@ def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token:
 # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
 # /// @return Returns the number of tokens on success, no more than n_tokens_max
 # /// @return Returns a negative number on failure - the number of tokens that would have been returned
+# /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
 # /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
 # /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
 # ///                      as plaintext. Does not insert a leading space.
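The added @return line only extends the failure contract: a plain negative value still means the buffer was too small (its magnitude is the required token count), while INT32_MIN signals that the count cannot be represented in an int32_t at all. A sketch of the usual grow-and-retry pattern under those rules, assuming a vocab handle obtained via llama_model_get_vocab():

```python
import llama_cpp

INT32_MIN = -(2**31)

def tokenize(vocab, text: bytes, add_special: bool = True) -> list:
    """Two-pass tokenization sketch; vocab is assumed to come from llama_model_get_vocab()."""
    n_max = 64
    buf = (llama_cpp.llama_token * n_max)()
    n = llama_cpp.llama_tokenize(vocab, text, len(text), buf, n_max, add_special, False)
    if n == INT32_MIN:
        # New overflow signal: the token count does not fit in an int32_t.
        raise OverflowError("tokenization result exceeds int32_t")
    if n < 0:
        # A negative return is the required token count; grow the buffer and retry.
        n_max = -n
        buf = (llama_cpp.llama_token * n_max)()
        n = llama_cpp.llama_tokenize(vocab, text, len(text), buf, n_max, add_special, False)
    return list(buf[:n])
```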