
Commit dbb30a8

Sync abetlen/llama-cpp-python 0.3.10 code
1 parent 737c0ce commit dbb30a8

File tree: 8 files changed, +275 -629 lines changed


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.10]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@28657a8229b5adc6028cf1c4ed62191792d2fdb0
+- feat: Add support for llama.cpp multimodal, add Qwen2.5-VL chat handler by @abetlen in cd548bd0f14210627798237d5c2ea78acfb88ccb
+
 ## [0.3.9]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@8733e0cf6eefc7c7752297cc22d0836706f4222c

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -97,6 +97,13 @@ if (LLAMA_BUILD)
         endif()
 
         add_subdirectory(vendor/llama.cpp)
+
+        if (WIN32)
+            if (TARGET llama)
+                set_target_properties(llama PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+            endif()
+        endif()
+
         llama_cpp_python_install_target(llama)
         llama_cpp_python_install_target(ggml)
 

README.md

Lines changed: 1 addition & 0 deletions
@@ -505,6 +505,7 @@ Below are the supported multi-modal models and their respective chat handlers (P
 | [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` |
 | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
 | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` |
+| [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |
 
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
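Usage follows the same pattern the README shows for the other LLaVA-style handlers: load the CLIP/mmproj model through the chat handler and pass it to `Llama`. Below is a minimal sketch, not part of this commit, assuming `Qwen25VLChatHandler` takes the same `clip_model_path` constructor as the existing handlers; the model paths and image URL are placeholders.

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Qwen25VLChatHandler

# Load the vision projector (mmproj) for Qwen2.5-VL; path is a placeholder.
chat_handler = Qwen25VLChatHandler(clip_model_path="path/to/mmproj.gguf")

llm = Llama(
    model_path="path/to/Qwen2.5-VL-3B-Instruct.gguf",  # placeholder path
    chat_handler=chat_handler,
    n_ctx=4096,  # larger context leaves room for the image embeddings
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
)
print(response["choices"][0]["message"]["content"])
```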

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.9"
+__version__ = "0.3.10"

llama_cpp/_internals.py

Lines changed: 4 additions & 2 deletions
@@ -537,17 +537,18 @@ def n_tokens(self) -> int:
     def reset(self):
         self.batch.n_tokens = 0
 
-    def set_batch(self, batch: Sequence[int], n_past: int):
+    def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
         n_tokens = len(batch)
         self.batch.n_tokens = n_tokens
         for i in range(n_tokens):
             self.batch.token[i] = batch[i]
             self.batch.pos[i] = n_past + i
             self.batch.seq_id[i][0] = 0
             self.batch.n_seq_id[i] = 1
+            self.batch.logits[i] = logits_all
         self.batch.logits[n_tokens - 1] = True
 
-    def add_sequence(self, batch: Sequence[int], seq_id: int):
+    def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
         n_tokens = len(batch)
         n_tokens0 = self.batch.n_tokens
         self.batch.n_tokens += n_tokens
@@ -557,6 +558,7 @@ def add_sequence(self, batch: Sequence[int], seq_id: int):
             self.batch.pos[j] = i
             self.batch.seq_id[j][0] = seq_id
             self.batch.n_seq_id[j] = 1
+            self.batch.logits[j] = logits_all
         self.batch.logits[n_tokens - 1] = True
 

llama_cpp/llama.py

Lines changed: 34 additions & 16 deletions
@@ -66,7 +66,6 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
-        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -90,11 +89,12 @@ def __init__(
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
         defrag_thold: float = -1.0,
+        logits_all: bool = False,
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
-        op_offload: bool = True,
-        swa_full: bool = True,
+        op_offload: Optional[bool] = None,
+        swa_full: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,
@@ -152,7 +152,6 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
-            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
@@ -173,6 +172,7 @@ def __init__(
             yarn_beta_slow: YaRN high correction dim
             yarn_orig_ctx: YaRN original context size
             defrag_thold: Defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+            logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
@@ -230,11 +230,6 @@ def __init__(
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
-        if rpc_servers is not None:
-            self.model_params.rpc_servers = rpc_servers.encode("utf-8")
-            self._rpc_servers = rpc_servers
-        else:
-            self._rpc_servers = None
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:
@@ -346,11 +341,17 @@ def __init__(
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
         self.context_params.defrag_thold = defrag_thold
+        self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
-        self.context_params.op_offload = op_offload
-        self.context_params.swa_full = swa_full
+
+        if op_offload is not None:
+            self.context_params.op_offload = op_offload
+
+        if swa_full is not None:
+            self.context_params.swa_full = swa_full
+
         # KV cache quantization
         if type_k is not None:
             self.context_params.type_k = type_k
@@ -570,7 +571,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen = 1
+            maxlen=self._n_ctx if self._logits_all else 1,
         )
 
     def tokenize(
@@ -643,12 +644,28 @@ def eval(self, tokens: Sequence[int]):
             n_past = self.n_tokens
             n_tokens = len(batch)
             self._batch.set_batch(
-                batch=batch, n_past=n_past
+                batch=batch, n_past=n_past, logits_all=self._logits_all
             )
            self._ctx.decode(self._batch)
             # Save tokens
             self.input_ids[n_past : n_past + n_tokens] = batch
-
+            # Save logits
+            if self._logits_all:
+                rows = n_tokens
+                cols = self._n_vocab
+                logits = np.ctypeslib.as_array(
+                    self._ctx.get_logits(), shape=(rows * cols,)
+                )
+                self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits
+            else:
+                # rows = 1
+                # cols = self._n_vocab
+                # logits = np.ctypeslib.as_array(
+                #     self._ctx.get_logits(), shape=(rows * cols,)
+                # )
+                # self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits
+                # NOTE: Now that sampling is done inside the sampler, logits are only needed for logprobs which requires logits_all
+                pass
             # Update n_tokens
             self.n_tokens += n_tokens
@@ -1325,9 +1342,9 @@ def logit_bias_processor(
         else:
             stop_sequences = []
 
-        if logprobs is not None:
+        if logprobs is not None and self._logits_all is False:
             raise ValueError(
-                "logprobs is not supported for models"
+                "logprobs is not supported for models created with logits_all=False"
             )
 
         if self.cache:
@@ -2199,6 +2216,7 @@ def __getstate__(self):
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             defrag_thold=self.context_params.defrag_thold,
+            logits_all=self._logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
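Taken together, these changes restore the logits_all path: per-token logits are stored only when the model is constructed with logits_all=True, and create_completion now rejects logprobs requests otherwise. A minimal usage sketch, not part of this commit; the model path is a placeholder.

```python
from llama_cpp import Llama

# logits_all=True keeps logits for every evaluated token so that
# completion requests can return logprobs (at the cost of extra memory).
llm = Llama(model_path="path/to/model.gguf", logits_all=True)

out = llm.create_completion(
    "The capital of France is",
    max_tokens=8,
    logprobs=5,  # top-5 logprobs per token; requires logits_all=True
)
print(out["choices"][0]["logprobs"]["top_logprobs"][0])

# With the default logits_all=False, the same call now raises:
# ValueError: logprobs is not supported for models created with logits_all=False
```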
