@@ -751,7 +751,7 @@ class llama_model_params(ctypes.Structure):
 
 
 # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
-# // https://github.com/ggerganov/llama.cpp/pull/7544
+# // https://github.com/ggml-org/llama.cpp/pull/7544
 # struct llama_context_params {
 #     uint32_t n_ctx; // text context, 0 = from model
 #     uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
@@ -764,7 +764,7 @@ class llama_model_params(ctypes.Structure):
 #     enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
 #     enum llama_attention_type attention_type; // attention type to use for embeddings
 
-#     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+#     // ref: https://github.com/ggml-org/llama.cpp/pull/2054
 #     float rope_freq_base; // RoPE base frequency, 0 = from model
 #     float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
 #     float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
@@ -779,20 +779,17 @@ class llama_model_params(ctypes.Structure):
 
 #     enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
 #     enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
-
-#     // Keep the booleans together to avoid misalignment during copy-by-value.
-#     bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-#     bool embeddings; // if true, extract embeddings (together with logits)
-#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-#     bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
-#     bool no_perf; // whether to measure performance timings
-
-
 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted
 #     // currently works only with CPU execution
 #     ggml_abort_callback abort_callback;
 #     void * abort_callback_data;
+
+#     // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+#     bool embeddings; // if true, extract embeddings (together with logits)
+#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+#     bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
+#     bool no_perf; // whether to measure performance timings
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -819,13 +816,12 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
         type_v (int): data type for V cache
-        logits_all (bool): the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
+        abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
         embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
         flash_attn (bool): whether to use flash attention
         no_perf (bool): whether to measure performance timings
     """
 
     if TYPE_CHECKING:
@@ -850,13 +846,12 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data: ctypes.c_void_p
         type_k: int
         type_v: int
-        logits_all: bool
+        abort_callback: Callable[[ctypes.c_void_p], bool]
+        abort_callback_data: ctypes.c_void_p
         embeddings: bool
         offload_kqv: bool
         flash_attn: bool
         no_perf: bool
 
     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -880,13 +875,12 @@ class llama_context_params(ctypes.Structure):
         ("cb_eval_user_data", ctypes.c_void_p),
         ("type_k", ctypes.c_int),
         ("type_v", ctypes.c_int),
-        ("logits_all", ctypes.c_bool),
+        ("abort_callback", ggml_abort_callback),
+        ("abort_callback_data", ctypes.c_void_p),
         ("embeddings", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
         ("flash_attn", ctypes.c_bool),
         ("no_perf", ctypes.c_bool),
-        ("abort_callback", ggml_abort_callback),
-        ("abort_callback_data", ctypes.c_void_p),
     ]
 
 
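For orientation, here is a minimal sketch (not part of the commit) of how the reordered structure is used from Python. It assumes the llama_context_default_params() binding and the ggml_abort_callback CFUNCTYPE defined elsewhere in llama_cpp.py; treat it as an illustration of the field layout rather than a definitive recipe.

import llama_cpp

# Defaults are produced on the C side and copied by value, so the Python
# _fields_ order above has to mirror the C struct exactly.
params = llama_cpp.llama_context_default_params()
params.n_ctx = 4096
params.embeddings = True  # the booleans now sit at the end of the struct

# Abort callback: returning True aborts llama_decode() (CPU backend only).
@llama_cpp.ggml_abort_callback
def should_abort(user_data):
    return False

params.abort_callback = should_abort
params.abort_callback_data = None
# Keep a Python reference to should_abort alive for as long as the context
# exists, otherwise the ctypes callback thunk can be garbage-collected.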
@@ -2683,10 +2677,12 @@ def llama_batch_free(batch: llama_batch, /):
     ...
 
 
-# // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
-# // Stores the encoder output internally for later use by the decoder cross-attention layers.
+# // Process a batch of tokens.
+# // In contrast to llama_decode() - this call does not use KV cache.
+# // For encode-decoder contexts, processes the batch using the encoder.
+# // Can store the encoder output internally for later use by the decoder's cross-attention layers.
 # // 0 - success
-# // < 0 - error
+# // < 0 - error. the KV cache state is restored to the state before this call
 # LLAMA_API int32_t llama_encode(
 #         struct llama_context * ctx,
 #         struct llama_batch batch);
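A hedged sketch of the llama_encode() return-code contract described above; ctx and batch are assumed to already exist (a context for an encoder-decoder model and a populated llama_batch), so only the status handling is shown.

ret = llama_cpp.llama_encode(ctx, batch)
if ret == 0:
    pass  # success: the encoder output is kept for the decoder's cross-attention
elif ret < 0:
    # per the comment above, the KV cache state has been rolled back
    raise RuntimeError(f"llama_encode failed with status {ret}")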
@@ -2699,10 +2695,13 @@ def llama_encode(ctx: llama_context_p, batch: llama_batch, /) -> int:
     ...
 
 
+# // Process a batch of tokens.
+# // Requires KV cache.
+# // For encode-decoder contexts, processes the batch using the decoder.
 # // Positive return values does not mean a fatal error, but rather a warning.
 # // 0 - success
 # // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-# // < 0 - error
+# // < 0 - error. the KV cache state is restored to the state before this call
 # LLAMA_API int32_t llama_decode(
 #         struct llama_context * ctx,
 #         struct llama_batch batch);
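Similarly, a sketch of the three return-code cases listed above for llama_decode(), under the same assumptions about ctx and batch.

ret = llama_cpp.llama_decode(ctx, batch)
if ret == 0:
    pass  # success: logits/embeddings for the batch can now be read
elif ret == 1:
    # warning, not fatal: no KV slot was found; retry with a smaller batch
    # or create the context with a larger n_ctx
    ...
else:
    # error: the KV cache state is restored to what it was before the call
    raise RuntimeError(f"llama_decode failed with status {ret}")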