Commit d5ce3fe

Sync LLAMA_API names with ggml-org/llama.cpp 20250309, support LLAMA_VOCAB_PRE_TYPE_GPT4O
1 parent 6f7cd45 commit d5ce3fe

File tree: 6 files changed (+123, -23 lines changed)


.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 2 additions & 2 deletions
@@ -66,8 +66,8 @@ Try the following:
 3. `rm -rf _skbuild/` # delete any old builds
 4. `python -m pip install .`
 5. `cd ./vendor/llama.cpp`
-6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp
-7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues)
+6. Follow [llama.cpp's instructions](https://github.com/ggml-org/llama.cpp#build) to `cmake` llama.cpp
+7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggml-org/llama.cpp/issues)
 
 # Failure Logs

examples/low_level_api/common.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 from dataclasses import dataclass, field
 from typing import List
 
-# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
+# Based on https://github.com/ggml-org/llama.cpp/blob/master/examples/common.cpp
 
 
 @dataclass

llama_cpp/_internals.py

Lines changed: 7 additions & 6 deletions
@@ -89,6 +89,12 @@ def n_ctx_train(self) -> int:
     def n_embd(self) -> int:
         return llama_cpp.llama_n_embd(self.model)
 
+    def n_head_kv(self) -> int:
+        return llama_cpp.llama_model_n_head_kv(self.model)
+
+    def n_params(self) -> int:
+        return llama_cpp.llama_model_n_params(self.model)
+
     def rope_freq_scale_train(self) -> float:
         return llama_cpp.llama_model_rope_freq_scale_train(self.model)
 
@@ -100,9 +106,6 @@ def desc(self) -> str:
     def size(self) -> int:
         return llama_cpp.llama_model_size(self.model)
 
-    def n_params(self) -> int:
-        return llama_cpp.llama_model_n_params(self.model)
-
     def get_tensor(self, name: str) -> ctypes.c_void_p:
         raise NotImplementedError("get_tensor is not implemented in llama.cpp")
 
@@ -747,9 +750,7 @@ def free_wrapper(sampler: llama_cpp.llama_sampler_p):
         sampler_i.clone = llama_cpp.llama_sampler_i_clone(0)
         sampler_i.free = llama_cpp.llama_sampler_i_free(0)
 
-        self.sampler = llama_cpp.llama_sampler()
-        self.sampler.iface = ctypes.pointer(sampler_i)
-        self.sampler.ctx = None
+        self.sampler = llama_cpp.llama_sampler_init(ctypes.pointer(sampler_i), None)
 
     def get_sampler(self) -> llama_cpp.llama_sampler_p:
         return ctypes.pointer(self.sampler)
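
For reference, the construction path changed in the last hunk can be exercised directly: instead of allocating a llama_sampler struct and filling iface/ctx by hand, the sampler is now obtained from llama_sampler_init. The following is a minimal sketch, not the library's code; it assumes the llama_sampler_i_* CFUNCTYPE aliases exported by llama_cpp.llama_cpp and uses a deliberately no-op apply callback:

import ctypes
import llama_cpp

# Illustrative no-op apply callback; a real sampler would modify the token data array.
# Parameter annotations are omitted on purpose to keep the sketch independent of exact type names.
def _apply(smpl, cur_p):
    pass

sampler_i = llama_cpp.llama_sampler_i()
sampler_i.apply = llama_cpp.llama_sampler_i_apply(_apply)  # assumed CFUNCTYPE alias for the apply slot
sampler_i.clone = llama_cpp.llama_sampler_i_clone(0)       # NULL callback, as in the diff above
sampler_i.free = llama_cpp.llama_sampler_i_free(0)         # NULL callback, as in the diff above
# Unset interface fields default to NULL pointers in a ctypes Structure.

# New in this commit: libllama allocates the sampler around the interface table.
sampler = llama_cpp.llama_sampler_init(ctypes.pointer(sampler_i), None)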

llama_cpp/llama.py

Lines changed: 4 additions & 4 deletions
@@ -161,7 +161,7 @@ def __init__(
             n_ubatch: Physical batch size
             n_threads: Number of threads to use for generation
             n_threads_batch: Number of threads to use for batch processing
-            rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
+            rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggml-org/llama.cpp/pull/2054
             pooling_type: Pooling type, from `enum llama_pooling_type`.
             rope_freq_base: RoPE base frequency, 0 = from model
             rope_freq_scale: RoPE frequency scaling factor, 0 = from model
@@ -1774,7 +1774,7 @@ def create_completion(
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
             typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
             echo: Whether to echo the prompt.
@@ -1871,7 +1871,7 @@ def __call__(
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
             typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
             echo: Whether to echo the prompt.
@@ -1971,7 +1971,7 @@ def create_chat_completion(
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
             top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
             typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
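
As a usage note for the min-p references above, the parameter is exposed directly on the high-level API. A brief sketch (model path, prompt, and sampling values are placeholders):

from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf", n_ctx=2048)  # placeholder GGUF path

out = llm.create_completion(
    "Q: Name the planets in the solar system. A:",
    max_tokens=64,
    temperature=0.8,
    top_p=0.95,
    min_p=0.05,  # minimum-p sampling, see ggml-org/llama.cpp#3841
)
print(out["choices"][0]["text"])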

llama_cpp/llama_cpp.py

Lines changed: 108 additions & 9 deletions
@@ -103,7 +103,10 @@
 GGML_TYPE_I64 = 27
 GGML_TYPE_F64 = 28
 GGML_TYPE_IQ1_M = 29
-GGML_TYPE_COUNT = 30
+GGML_TYPE_BF16 = 30
+GGML_TYPE_TQ1_0 = 34
+GGML_TYPE_TQ2_0 = 35
+GGML_TYPE_COUNT = 39
 
 # from ggml-backend.h
 # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
@@ -257,6 +260,7 @@
 LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26
 LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
+LLAMA_VOCAB_PRE_TYPE_GPT4O = 29
 
 
 # // note: these values should be synchronized with ggml_rope
@@ -708,7 +712,7 @@ class llama_model_params(ctypes.Structure):
 
 
 # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
-# // https://github.com/ggerganov/llama.cpp/pull/7544
+# // https://github.com/ggml-org/llama.cpp/pull/7544
 # struct llama_context_params {
 #     uint32_t n_ctx; // text context, 0 = from model
 #     uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
@@ -721,7 +725,7 @@ class llama_model_params(ctypes.Structure):
 #     enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
 #     enum llama_attention_type attention_type; // attention type to use for embeddings
 
-#     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+#     // ref: https://github.com/ggml-org/llama.cpp/pull/2054
 #     float rope_freq_base; // RoPE base frequency, 0 = from model
 #     float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
 #     float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
@@ -1276,7 +1280,10 @@ def llama_n_ubatch(ctx: llama_context_p, /) -> int:
 def llama_n_seq_max(ctx: llama_context_p, /) -> int:
     ...
 
-
+# LLAMA_API int32_t llama_model_n_head_kv(const struct llama_model * model);
+@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_uint32)
+def llama_model_n_head_kv(model: llama_model_p, /) -> int:
+    ...
 
 
 # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
@@ -1357,6 +1364,11 @@ def llama_model_n_head(model: llama_model_p, /) -> int:
     ...
 
 
+# LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
+@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_head_kv(model: llama_model_p, /) -> int:
+    ...
+
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
 @ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
@@ -3235,7 +3247,7 @@ def llama_detokenize(
 
 # /// Apply chat template. Inspired by hf apply_chat_template() on python.
 # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+# /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
 # /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
 # /// @param chat Pointer to a list of multiple llama_chat_message
 # /// @param n_msg Number of llama_chat_message in this chat
@@ -3375,8 +3387,8 @@ class llama_sampler_i(ctypes.Structure):
 
 
 # struct llama_sampler {
-#     struct llama_sampler_i * iface;
-#     llama_sampler_context_t ctx;
+#     const struct llama_sampler_i * iface;
+#     llama_sampler_context_t ctx;
 # };
 class llama_sampler(ctypes.Structure):
     _fields_ = [
@@ -3410,6 +3422,16 @@ class llama_sampler(ctypes.Structure):
 
 
 # // mirror of llama_sampler_i:
+
+# LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+@ctypes_function(
+    "llama_sampler_init",
+    [ctypes.POINTER(llama_sampler_i), llama_sampler_context_t],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init(smpl: llama_sampler_p, /) -> llama_sampler_p:
+    ...
+
 # LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
 @ctypes_function(
     "llama_sampler_name",
@@ -3549,7 +3571,7 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
 # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
 # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
-#     "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
+#     "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
 def llama_sampler_init_softmax() -> llama_sampler_p:
     ...
@@ -3573,7 +3595,7 @@ def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p:
     ...
 
 
-# /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+# /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
 # LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
 @ctypes_function(
     "llama_sampler_init_min_p",
@@ -3627,6 +3649,13 @@ def llama_sampler_init_xtc(
     ...
 
 
+# /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+# LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
+@ctypes_function("llama_sampler_init_top_n_sigma", [ctypes.c_float], llama_sampler_p_ctypes)
+def llama_sampler_init_top_n_sigma(n: float) -> llama_sampler_p:
+    ...
+
+
 # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 # /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -3684,6 +3713,76 @@ def llama_sampler_init_grammar(
 ) -> llama_sampler_p:
     ...
 
+# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+#     const struct llama_vocab * vocab,
+#     const char * grammar_str,
+#     const char * grammar_root,
+#     const char ** trigger_words,
+#     size_t num_trigger_words,
+#     const llama_token * trigger_tokens,
+#     size_t num_trigger_tokens),
+#     "use llama_sampler_init_grammar_lazy_patterns instead");
+@ctypes_function(
+    "llama_sampler_init_grammar_lazy",
+    [
+        llama_vocab_p_ctypes,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.POINTER(ctypes.c_char_p),
+        ctypes.c_size_t,
+        llama_token_p,
+        ctypes.c_size_t
+    ],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_grammar_lazy(
+    vocab: llama_vocab_p,
+    grammar_str: bytes,
+    grammar_root: bytes,
+    trigger_words: CtypesArray[bytes],
+    num_trigger_words: int,
+    trigger_tokens: CtypesArray[llama_token],
+    num_trigger_tokens: int,
+    /
+) -> llama_sampler_p:
+    ...
+
+# /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+# /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+# /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+# LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+#     const struct llama_vocab * vocab,
+#     const char * grammar_str,
+#     const char * grammar_root,
+#     const char ** trigger_patterns,
+#     size_t num_trigger_patterns,
+#     const llama_token * trigger_tokens,
+#     size_t num_trigger_tokens);
+@ctypes_function(
+    "llama_sampler_init_grammar_lazy_patterns",
+    [
+        llama_vocab_p_ctypes,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.POINTER(ctypes.c_char_p),
+        ctypes.c_size_t,
+        llama_token_p,
+        ctypes.c_size_t
+    ],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_grammar_lazy_patterns(
+    vocab: llama_vocab_p,
+    grammar_str: bytes,
+    grammar_root: bytes,
+    trigger_patterns: CtypesArray[bytes],
+    num_trigger_patterns: int,
+    trigger_tokens: CtypesArray[llama_token],
+    num_trigger_tokens: int,
+    /
+) -> llama_sampler_p:
+    ...
+
 
 # /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
 # LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
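
To show how the newly bound samplers slot into the existing low-level API, here is a hedged sketch that chains top-n-sigma with samplers already exposed by these bindings. The llama_sampler_chain_*, llama_sampler_sample, and llama_sampler_free bindings are assumed to be present in llama_cpp.py (they are not part of this diff), and the parameter values are arbitrary:

import llama_cpp

# Build a sampler chain (assumed pre-existing bindings).
chain_params = llama_cpp.llama_sampler_chain_default_params()
chain = llama_cpp.llama_sampler_chain_init(chain_params)

# New binding from this commit: top-n-sigma (https://arxiv.org/pdf/2411.07641).
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_n_sigma(1.0))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_min_p(0.05, 1))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))

# ... sample against a live context with llama_sampler_sample(chain, ctx, -1), then clean up.
llama_cpp.llama_sampler_free(chain)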

llama_cpp/llama_grammar.py

Lines changed: 1 addition & 1 deletion
@@ -502,7 +502,7 @@ def _visit_pattern(self, pattern, name):
         Transforms a regular expression pattern into a GBNF rule.
 
         Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
-        Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
+        Output: https://github.com/ggml-org/llama.cpp/blob/master/grammars/README.md
 
         Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
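
For context, the GBNF output format referenced above is what the high-level grammar helpers consume. A small illustration follows; the grammar, prompt, and model path are made up:

from llama_cpp import Llama, LlamaGrammar

# A tiny GBNF grammar that only allows the answers "yes" or "no".
grammar = LlamaGrammar.from_string('root ::= "yes" | "no"')

llm = Llama(model_path="./models/model.gguf")  # placeholder path
out = llm.create_completion("Is water wet? Answer:", grammar=grammar, max_tokens=4)
print(out["choices"][0]["text"])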
