@@ -320,10 +320,12 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN

 # enum llama_pooling_type {
+#     LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
 #     LLAMA_POOLING_TYPE_NONE = 0,
 #     LLAMA_POOLING_TYPE_MEAN = 1,
 #     LLAMA_POOLING_TYPE_CLS = 2,
 # };
+LLAMA_POOLING_TYPE_UNSPECIFIED = -1
 LLAMA_POOLING_TYPE_NONE = 0
 LLAMA_POOLING_TYPE_MEAN = 1
 LLAMA_POOLING_TYPE_CLS = 2
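The hunk above adds `LLAMA_POOLING_TYPE_UNSPECIFIED` alongside the existing pooling constants. As a rough illustration of how downstream code might select one of these values (the helper name and its string keys are assumptions for this sketch, not part of the patch):

```python
# Illustrative only: map user-facing names onto the constants defined above.
_POOLING_TYPES = {
    "unspecified": LLAMA_POOLING_TYPE_UNSPECIFIED,  # -1: let the backend decide
    "none": LLAMA_POOLING_TYPE_NONE,                #  0: per-token embeddings
    "mean": LLAMA_POOLING_TYPE_MEAN,                #  1: mean-pool per sequence
    "cls": LLAMA_POOLING_TYPE_CLS,                  #  2: use the CLS token embedding
}

def resolve_pooling_type(name: str = "unspecified") -> int:
    """Return the llama_pooling_type constant for a human-readable name."""
    try:
        return _POOLING_TYPES[name.lower()]
    except KeyError:
        raise ValueError(f"unknown pooling type: {name!r}") from None
```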
@@ -547,7 +549,10 @@ class llama_model_params(ctypes.Structure):
 # uint32_t n_batch; // prompt processing maximum batch size
 # uint32_t n_threads; // number of threads to use for generation
 # uint32_t n_threads_batch; // number of threads to use for batch processing
-# int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+
+# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
+#                                       // (ignored if no pooling layer)

 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 # float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -569,7 +574,6 @@ class llama_model_params(ctypes.Structure):
 # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 # bool embedding; // embedding mode only
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)

 # // Abort callback
 # // if it returns true, execution of llama_decode() will be aborted
@@ -587,6 +591,7 @@ class llama_context_params(ctypes.Structure):
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
+        pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
         yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -602,7 +607,6 @@ class llama_context_params(ctypes.Structure):
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
-        do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
         abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
     """
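With `do_pooling` removed from both the docstring and the struct, callers that used to toggle the boolean now pick an explicit mode through `pooling_type`. A minimal before/after sketch, assuming the usual `llama_cpp` entry points:

```python
import llama_cpp

params = llama_cpp.llama_context_default_params()
params.embedding = True  # embedding mode, as documented above

# Before this change (field no longer exists):
#   params.do_pooling = True
# After this change, choose an explicit pooling mode instead:
params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_MEAN
```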
@@ -613,7 +617,8 @@ class llama_context_params(ctypes.Structure):
         ("n_batch", ctypes.c_uint32),
         ("n_threads", ctypes.c_uint32),
         ("n_threads_batch", ctypes.c_uint32),
-        ("rope_scaling_type", ctypes.c_int32),
+        ("rope_scaling_type", ctypes.c_int),
+        ("pooling_type", ctypes.c_int),
         ("rope_freq_base", ctypes.c_float),
         ("rope_freq_scale", ctypes.c_float),
         ("yarn_ext_factor", ctypes.c_float),
@@ -629,7 +634,6 @@ class llama_context_params(ctypes.Structure):
         ("logits_all", ctypes.c_bool),
         ("embedding", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
-        ("do_pooling", ctypes.c_bool),
         ("abort_callback", ggml_abort_callback),
         ("abort_callback_data", ctypes.c_void_p),
     ]
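Because `llama_context_params` is a plain `ctypes.Structure`, the new field is readable and writable like any other member, and the Python-side layout must stay in sync with the C header. A small sanity-check sketch (values are illustrative; real code should start from `llama_context_default_params()` so the C side fills sensible defaults):

```python
import ctypes
import llama_cpp

params = llama_cpp.llama_context_params()  # zero-initialized struct
params.rope_scaling_type = llama_cpp.LLAMA_ROPE_SCALING_TYPE_MAX_VALUE
params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_MEAN
assert params.pooling_type == llama_cpp.LLAMA_POOLING_TYPE_MEAN

# The size reported here should match sizeof(struct llama_context_params) on the
# C side; a mismatch usually means a field was added or removed on one side only.
print(ctypes.sizeof(llama_cpp.llama_context_params))
```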