Skip to content

Commit 81e2ed6

Browse files
committed
Sync kv-cells : fix tracking of seq_pos
1 parent 67a952d commit 81e2ed6

File tree

1 file changed

+14
-6
lines changed

1 file changed

+14
-6
lines changed

llama_cpp/llama_cpp.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -942,6 +942,7 @@ class llama_context_params(ctypes.Structure):
942942
# void * imatrix; // pointer to importance matrix data
943943
# void * kv_overrides; // pointer to vector containing overrides
944944
# void * tensor_types; // pointer to vector containing tensor types
945+
# void * prune_layers; // pointer to vector containing layer indices to prune
945946
# } llama_model_quantize_params;
946947
class llama_model_quantize_params(ctypes.Structure):
947948
"""Parameters for llama_model_quantize
@@ -959,6 +960,7 @@ class llama_model_quantize_params(ctypes.Structure):
959960
imatrix (ctypes.c_void_p): pointer to importance matrix data
960961
kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
961962
tensor_types (ctypes.c_void_p): pointer to vector containing tensor types
963+
prune_layers (ctypes.c_void_p): pointer to vector containing layer indices to prune
962964
"""
963965

964966
if TYPE_CHECKING:
@@ -974,6 +976,7 @@ class llama_model_quantize_params(ctypes.Structure):
974976
imatrix: ctypes.c_void_p
975977
kv_overrides: ctypes.c_void_p
976978
tensor_types: ctypes.c_void_p
979+
prune_layers: ctypes.c_void_p
977980

978981
_fields_ = [
979982
("nthread", ctypes.c_int32),
@@ -988,6 +991,7 @@ class llama_model_quantize_params(ctypes.Structure):
988991
("imatrix", ctypes.c_void_p),
989992
("kv_overrides", ctypes.c_void_p),
990993
("tensor_types", ctypes.c_void_p),
994+
("prune_layers", ctypes.c_void_p),
991995
]
992996

993997

@@ -2473,23 +2477,26 @@ def llama_encode(ctx: llama_context_p, batch: llama_batch, /) -> int:
24732477
# // Requires the context to have a memory.
24742478
# // For encode-decoder contexts, processes the batch using the decoder.
24752479
# // Positive return values do not mean a fatal error, but rather a warning.
2476-
# // Upon non-zero return values, the memory state is restored to the state before this call
2480+
# // Upon fatal-error or abort, the ubatches that have been processed will remain in the memory state of the context
2481+
# // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
2482+
# // Upon other return values, the memory state is restored to the state before this call
24772483
# // 0 - success
24782484
# // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
2479-
# // 2 - aborted
2485+
# // 2 - aborted (processed ubatches will remain in the context's memory)
24802486
# // -1 - invalid input batch
2481-
# // < -1 - error
2487+
# // < -1 - fatal error (processed ubatches will remain in the context's memory)
24822488
# LLAMA_API int32_t llama_decode(
24832489
# struct llama_context * ctx,
2484-
# struct llama_batch batch);
2490+
# struct llama_batch batch);
24852491
@ctypes_function("llama_decode", [llama_context_p_ctypes, llama_batch], ctypes.c_int32)
24862492
def llama_decode(ctx: llama_context_p, batch: llama_batch, /) -> int:
24872493
"""Positive return values do not mean a fatal error, but rather a warning.
24882494
0 - success
24892495
1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
2490-
2 - aborted
2496+
2 - aborted (processed ubatches will remain in the context's memory)
24912497
-1 - invalid input batch
2492-
< -1 - error"""
2498+
< -1 - fatal error (processed ubatches will remain in the context's memory)
2499+
"""
24932500
...
24942501

24952502

@@ -3096,6 +3103,7 @@ def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token:
30963103
# /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
30973104
# /// @return Returns the number of tokens on success, no more than n_tokens_max
30983105
# /// @return Returns a negative number on failure - the number of tokens that would have been returned
3106+
# /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
30993107
# /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
31003108
# /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
31013109
# /// as plaintext. Does not insert a leading space.

0 commit comments

Comments
 (0)