Commit d5ce3fe

Sync LLAMA_API names with ggml-org/llama.cpp 20250309, support LLAMA_VOCAB_PRE_TYPE_GPT4O
1 parent 6f7cd45 commit d5ce3fe

File tree: 6 files changed (+123, -23 lines changed)


.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 2 additions & 2 deletions
@@ -66,8 +66,8 @@ Try the following:
 3. `rm -rf _skbuild/` # delete any old builds
 4. `python -m pip install .`
 5. `cd ./vendor/llama.cpp`
-6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp
-7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues)
+6. Follow [llama.cpp's instructions](https://github.com/ggml-org/llama.cpp#build) to `cmake` llama.cpp
+7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggml-org/llama.cpp/issues)
 
 # Failure Logs

examples/low_level_api/common.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 from dataclasses import dataclass, field
 from typing import List
 
-# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
+# Based on https://github.com/ggml-org/llama.cpp/blob/master/examples/common.cpp
 
 
 @dataclass

llama_cpp/_internals.py

Lines changed: 7 additions & 6 deletions
@@ -89,6 +89,12 @@ def n_ctx_train(self) -> int:
     def n_embd(self) -> int:
         return llama_cpp.llama_n_embd(self.model)
 
+    def n_head_kv(self) -> int:
+        return llama_cpp.llama_model_n_head_kv(self.model)
+
+    def n_params(self) -> int:
+        return llama_cpp.llama_model_n_params(self.model)
+
     def rope_freq_scale_train(self) -> float:
         return llama_cpp.llama_model_rope_freq_scale_train(self.model)
 
@@ -100,9 +106,6 @@ def desc(self) -> str:
     def size(self) -> int:
         return llama_cpp.llama_model_size(self.model)
 
-    def n_params(self) -> int:
-        return llama_cpp.llama_model_n_params(self.model)
-
     def get_tensor(self, name: str) -> ctypes.c_void_p:
         raise NotImplementedError("get_tensor is not implemented in llama.cpp")
 
@@ -747,9 +750,7 @@ def free_wrapper(sampler: llama_cpp.llama_sampler_p):
         sampler_i.clone = llama_cpp.llama_sampler_i_clone(0)
         sampler_i.free = llama_cpp.llama_sampler_i_free(0)
 
-        self.sampler = llama_cpp.llama_sampler()
-        self.sampler.iface = ctypes.pointer(sampler_i)
-        self.sampler.ctx = None
+        self.sampler = llama_cpp.llama_sampler_init(ctypes.pointer(sampler_i), None)
 
     def get_sampler(self) -> llama_cpp.llama_sampler_p:
         return ctypes.pointer(self.sampler)
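
For reference, the construction path changed in the last hunk can be exercised directly: instead of allocating a llama_sampler struct and filling iface/ctx by hand, the sampler is now obtained from llama_sampler_init. The following is a minimal sketch, not the library's code; it assumes the llama_sampler_i_* CFUNCTYPE aliases exported by llama_cpp.llama_cpp and uses a deliberately no-op apply callback:

import ctypes
import llama_cpp

# Illustrative no-op apply callback; a real sampler would modify the token data array.
# Parameter annotations are omitted on purpose to keep the sketch independent of exact type names.
def _apply(smpl, cur_p):
    pass

sampler_i = llama_cpp.llama_sampler_i()
sampler_i.apply = llama_cpp.llama_sampler_i_apply(_apply)  # assumed CFUNCTYPE alias for the apply slot
sampler_i.clone = llama_cpp.llama_sampler_i_clone(0)       # NULL callback, as in the diff above
sampler_i.free = llama_cpp.llama_sampler_i_free(0)         # NULL callback, as in the diff above
# Unset interface fields default to NULL pointers in a ctypes Structure.

# New in this commit: libllama allocates the sampler around the interface table.
sampler = llama_cpp.llama_sampler_init(ctypes.pointer(sampler_i), None)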

llama_cpp/llama.py

Lines changed: 4 additions & 4 deletions
@@ -161,7 +161,7 @@ def __init__(
             n_ubatch: Physical batch size
             n_threads: Number of threads to use for generation
             n_threads_batch: Number of threads to use for batch processing
-            rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
+            rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggml-org/llama.cpp/pull/2054
             pooling_type: Pooling type, from `enum llama_pooling_type`.
             rope_freq_base: RoPE base frequency, 0 = from model
             rope_freq_scale: RoPE frequency scaling factor, 0 = from model
@@ -1774,7 +1774,7 @@ def create_completion(
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
             typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
             echo: Whether to echo the prompt.
@@ -1871,7 +1871,7 @@ def __call__(
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
             typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
             echo: Whether to echo the prompt.
@@ -1971,7 +1971,7 @@ def create_chat_completion(
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
             top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
             typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
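
As a usage note for the min-p references above, the parameter is exposed directly on the high-level API. A brief sketch (model path, prompt, and sampling values are placeholders):

from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf", n_ctx=2048)  # placeholder GGUF path

out = llm.create_completion(
    "Q: Name the planets in the solar system. A:",
    max_tokens=64,
    temperature=0.8,
    top_p=0.95,
    min_p=0.05,  # minimum-p sampling, see ggml-org/llama.cpp#3841
)
print(out["choices"][0]["text"])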

llama_cpp/llama_cpp.py

Lines changed: 108 additions & 9 deletions
@@ -103,7 +103,10 @@
 GGML_TYPE_I64 = 27
 GGML_TYPE_F64 = 28
 GGML_TYPE_IQ1_M = 29
-GGML_TYPE_COUNT = 30
+GGML_TYPE_BF16 = 30
+GGML_TYPE_TQ1_0 = 34
+GGML_TYPE_TQ2_0 = 35
+GGML_TYPE_COUNT = 39
 
 # from ggml-backend.h
 # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
@@ -257,6 +260,7 @@
 LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26
 LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
+LLAMA_VOCAB_PRE_TYPE_GPT4O = 29
 
 
 # // note: these values should be synchronized with ggml_rope
@@ -708,7 +712,7 @@ class llama_model_params(ctypes.Structure):
 
 
 # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
-# // https://github.com/ggerganov/llama.cpp/pull/7544
+# // https://github.com/ggml-org/llama.cpp/pull/7544
 # struct llama_context_params {
 #     uint32_t n_ctx; // text context, 0 = from model
 #     uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
@@ -721,7 +725,7 @@ class llama_model_params(ctypes.Structure):
 #     enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
 #     enum llama_attention_type attention_type; // attention type to use for embeddings
 
-#     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+#     // ref: https://github.com/ggml-org/llama.cpp/pull/2054
 #     float rope_freq_base; // RoPE base frequency, 0 = from model
 #     float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
 #     float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
@@ -1276,7 +1280,10 @@ def llama_n_ubatch(ctx: llama_context_p, /) -> int:
 def llama_n_seq_max(ctx: llama_context_p, /) -> int:
     ...
 
-
+# LLAMA_API int32_t llama_model_n_head_kv(const struct llama_model * model);
+@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_uint32)
+def llama_model_n_head_kv(model: llama_model_p, /) -> int:
+    ...
 
 
 # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
@@ -1357,6 +1364,11 @@ def llama_model_n_head(model: llama_model_p, /) -> int:
     ...
 
 
+# LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
+@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_head_kv(model: llama_model_p, /) -> int:
+    ...
+
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
 @ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
@@ -3235,7 +3247,7 @@ def llama_detokenize(
 
 # /// Apply chat template. Inspired by hf apply_chat_template() on python.
 # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
 # /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
 # /// @param chat Pointer to a list of multiple llama_chat_message
 # /// @param n_msg Number of llama_chat_message in this chat
@@ -3375,8 +3387,8 @@ class llama_sampler_i(ctypes.Structure):
 
 
 # struct llama_sampler {
-#     struct llama_sampler_i * iface;
-#     llama_sampler_context_t ctx;
+#     const struct llama_sampler_i * iface;
+#     llama_sampler_context_t ctx;
 # };
 class llama_sampler(ctypes.Structure):
     _fields_ = [
@@ -3410,6 +3422,16 @@ class llama_sampler(ctypes.Structure):
 
 
 # // mirror of llama_sampler_i:
+
+# LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+@ctypes_function(
+    "llama_sampler_init",
+    [ctypes.POINTER(llama_sampler_i), llama_sampler_context_t],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init(smpl: llama_sampler_p, /) -> llama_sampler_p:
+    ...
+
 # LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
 @ctypes_function(
     "llama_sampler_name",
@@ -3549,7 +3571,7 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
 # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
 # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
-#     "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
+#     "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
 def llama_sampler_init_softmax() -> llama_sampler_p:
     ...
@@ -3573,7 +3595,7 @@ def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p:
     ...
 
 
-# /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+# /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
 # LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
 @ctypes_function(
     "llama_sampler_init_min_p",
@@ -3627,6 +3649,13 @@ def llama_sampler_init_xtc(
     ...
 
 
+# /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+# LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
+@ctypes_function("llama_sampler_init_top_n_sigma", [ctypes.c_float], llama_sampler_p_ctypes)
+def llama_sampler_init_top_n_sigma(n: float) -> llama_sampler_p:
+    ...
+
+
 # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 # /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -3684,6 +3713,76 @@ def llama_sampler_init_grammar(
 ) -> llama_sampler_p:
     ...
 
+# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+#     const struct llama_vocab * vocab,
+#     const char * grammar_str,
+#     const char * grammar_root,
+#     const char ** trigger_words,
+#     size_t num_trigger_words,
+#     const llama_token * trigger_tokens,
+#     size_t num_trigger_tokens),
+#     "use llama_sampler_init_grammar_lazy_patterns instead");
+@ctypes_function(
+    "llama_sampler_init_grammar_lazy",
+    [
+        llama_vocab_p_ctypes,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.POINTER(ctypes.c_char_p),
+        ctypes.c_size_t,
+        llama_token_p,
+        ctypes.c_size_t
+    ],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_grammar_lazy(
+    vocab: llama_vocab_p,
+    grammar_str: bytes,
+    grammar_root: bytes,
+    trigger_words: CtypesArray[bytes],
+    num_trigger_words: int,
+    trigger_tokens: CtypesArray[llama_token],
+    num_trigger_tokens: int,
+    /
+) -> llama_sampler_p:
+    ...
+
+# /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+# /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+# /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+# LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+#     const struct llama_vocab * vocab,
+#     const char * grammar_str,
+#     const char * grammar_root,
+#     const char ** trigger_patterns,
+#     size_t num_trigger_patterns,
+#     const llama_token * trigger_tokens,
+#     size_t num_trigger_tokens);
+@ctypes_function(
+    "llama_sampler_init_grammar_lazy_patterns",
+    [
+        llama_vocab_p_ctypes,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.POINTER(ctypes.c_char_p),
+        ctypes.c_size_t,
+        llama_token_p,
+        ctypes.c_size_t
+    ],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_grammar_lazy_patterns(
+    vocab: llama_vocab_p,
+    grammar_str: bytes,
+    grammar_root: bytes,
+    trigger_patterns: CtypesArray[bytes],
+    num_trigger_patterns: int,
+    trigger_tokens: CtypesArray[llama_token],
+    num_trigger_tokens: int,
+    /
+) -> llama_sampler_p:
+    ...
+
 
 # /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
 # LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
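
To show how the newly bound samplers slot into the existing low-level API, here is a hedged sketch that chains top-n-sigma with samplers already exposed by these bindings. The llama_sampler_chain_*, llama_sampler_sample, and llama_sampler_free bindings are assumed to be present in llama_cpp.py (they are not part of this diff), and the parameter values are arbitrary:

import llama_cpp

# Build a sampler chain (assumed pre-existing bindings).
chain_params = llama_cpp.llama_sampler_chain_default_params()
chain = llama_cpp.llama_sampler_chain_init(chain_params)

# New binding from this commit: top-n-sigma (https://arxiv.org/pdf/2411.07641).
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_n_sigma(1.0))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_min_p(0.05, 1))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))

# ... sample against a live context with llama_sampler_sample(chain, ctx, -1), then clean up.
llama_cpp.llama_sampler_free(chain)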

llama_cpp/llama_grammar.py

Lines changed: 1 addition & 1 deletion
@@ -502,7 +502,7 @@ def _visit_pattern(self, pattern, name):
         Transforms a regular expression pattern into a GBNF rule.
 
         Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
-        Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
+        Output: https://github.com/ggml-org/llama.cpp/blob/master/grammars/README.md
 
         Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
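
For context, the GBNF output format referenced above is what the high-level grammar helpers consume. A small illustration follows; the grammar, prompt, and model path are made up:

from llama_cpp import Llama, LlamaGrammar

# A tiny GBNF grammar that only allows the answers "yes" or "no".
grammar = LlamaGrammar.from_string('root ::= "yes" | "no"')

llm = Llama(model_path="./models/model.gguf")  # placeholder path
out = llm.create_completion("Is water wet? Answer:", grammar=grammar, max_tokens=4)
print(out["choices"][0]["text"])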
