
Commit dbb30a8

Sync abetlen/llama-cpp-python 0.3.10 code
1 parent 737c0ce commit dbb30a8

File tree: 8 files changed, +275 -629 lines changed


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.10]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@28657a8229b5adc6028cf1c4ed62191792d2fdb0
+- feat: Add support for llama.cpp multimodal, add Qwen2.5-VL chat handler by @abetlen in cd548bd0f14210627798237d5c2ea78acfb88ccb
+
 ## [0.3.9]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@8733e0cf6eefc7c7752297cc22d0836706f4222c

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -97,6 +97,13 @@ if (LLAMA_BUILD)
         endif()
 
         add_subdirectory(vendor/llama.cpp)
+
+        if (WIN32)
+            if (TARGET llama)
+                set_target_properties(llama PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+            endif()
+        endif()
+
         llama_cpp_python_install_target(llama)
         llama_cpp_python_install_target(ggml)
 

README.md

Lines changed: 1 addition & 0 deletions
@@ -505,6 +505,7 @@ Below are the supported multi-modal models and their respective chat handlers (P
 | [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` |
 | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
 | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` |
+| [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |
 
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
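Usage follows the same pattern the README shows for the other LLaVA-style handlers: load the CLIP/mmproj model through the chat handler and pass it to `Llama`. Below is a minimal sketch, not part of this commit, assuming `Qwen25VLChatHandler` takes the same `clip_model_path` constructor as the existing handlers; the model paths and image URL are placeholders.

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Qwen25VLChatHandler

# Load the vision projector (mmproj) for Qwen2.5-VL; path is a placeholder.
chat_handler = Qwen25VLChatHandler(clip_model_path="path/to/mmproj.gguf")

llm = Llama(
    model_path="path/to/Qwen2.5-VL-3B-Instruct.gguf",  # placeholder path
    chat_handler=chat_handler,
    n_ctx=4096,  # larger context leaves room for the image embeddings
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
)
print(response["choices"][0]["message"]["content"])
```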

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.9"
+__version__ = "0.3.10"

llama_cpp/_internals.py

Lines changed: 4 additions & 2 deletions
@@ -537,17 +537,18 @@ def n_tokens(self) -> int:
     def reset(self):
         self.batch.n_tokens = 0
 
-    def set_batch(self, batch: Sequence[int], n_past: int):
+    def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
         n_tokens = len(batch)
         self.batch.n_tokens = n_tokens
         for i in range(n_tokens):
             self.batch.token[i] = batch[i]
             self.batch.pos[i] = n_past + i
             self.batch.seq_id[i][0] = 0
             self.batch.n_seq_id[i] = 1
+            self.batch.logits[i] = logits_all
         self.batch.logits[n_tokens - 1] = True
 
-    def add_sequence(self, batch: Sequence[int], seq_id: int):
+    def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
         n_tokens = len(batch)
         n_tokens0 = self.batch.n_tokens
         self.batch.n_tokens += n_tokens
@@ -557,6 +558,7 @@ def add_sequence(self, batch: Sequence[int], seq_id: int):
             self.batch.pos[j] = i
             self.batch.seq_id[j][0] = seq_id
             self.batch.n_seq_id[j] = 1
+            self.batch.logits[j] = logits_all
         self.batch.logits[n_tokens - 1] = True
 

llama_cpp/llama.py

Lines changed: 34 additions & 16 deletions
@@ -66,7 +66,6 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
-        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -90,11 +89,12 @@ def __init__(
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
         defrag_thold: float = -1.0,
+        logits_all: bool = False,
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
-        op_offload: bool = True,
-        swa_full: bool = True,
+        op_offload: Optional[bool] = None,
+        swa_full: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,
@@ -152,7 +152,6 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
-            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
@@ -173,6 +172,7 @@ def __init__(
             yarn_beta_slow: YaRN high correction dim
             yarn_orig_ctx: YaRN original context size
             defrag_thold: Defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+            logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
@@ -230,11 +230,6 @@ def __init__(
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
-        if rpc_servers is not None:
-            self.model_params.rpc_servers = rpc_servers.encode("utf-8")
-            self._rpc_servers = rpc_servers
-        else:
-            self._rpc_servers = None
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:
@@ -346,11 +341,17 @@ def __init__(
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
         self.context_params.defrag_thold = defrag_thold
+        self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
-        self.context_params.op_offload = op_offload
-        self.context_params.swa_full = swa_full
+
+        if op_offload is not None:
+            self.context_params.op_offload = op_offload
+
+        if swa_full is not None:
+            self.context_params.swa_full = swa_full
+
         # KV cache quantization
         if type_k is not None:
             self.context_params.type_k = type_k
@@ -570,7 +571,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen = 1
+            maxlen=self._n_ctx if self._logits_all else 1,
         )
 
     def tokenize(
@@ -643,12 +644,28 @@ def eval(self, tokens: Sequence[int]):
             n_past = self.n_tokens
             n_tokens = len(batch)
             self._batch.set_batch(
-                batch=batch, n_past=n_past
+                batch=batch, n_past=n_past, logits_all=self._logits_all
             )
            self._ctx.decode(self._batch)
             # Save tokens
             self.input_ids[n_past : n_past + n_tokens] = batch
-
+            # Save logits
+            if self._logits_all:
+                rows = n_tokens
+                cols = self._n_vocab
+                logits = np.ctypeslib.as_array(
+                    self._ctx.get_logits(), shape=(rows * cols,)
+                )
+                self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits
+            else:
+                # rows = 1
+                # cols = self._n_vocab
+                # logits = np.ctypeslib.as_array(
+                #     self._ctx.get_logits(), shape=(rows * cols,)
+                # )
+                # self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits
+                # NOTE: Now that sampling is done inside the sampler, logits are only needed for logprobs which requires logits_all
+                pass
             # Update n_tokens
             self.n_tokens += n_tokens
@@ -1325,9 +1342,9 @@ def logit_bias_processor(
         else:
             stop_sequences = []
 
-        if logprobs is not None:
+        if logprobs is not None and self._logits_all is False:
             raise ValueError(
-                "logprobs is not supported for models"
+                "logprobs is not supported for models created with logits_all=False"
             )
 
         if self.cache:
@@ -2199,6 +2216,7 @@ def __getstate__(self):
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             defrag_thold=self.context_params.defrag_thold,
+            logits_all=self._logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
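Taken together, these changes restore the logits_all path: per-token logits are stored only when the model is constructed with logits_all=True, and create_completion now rejects logprobs requests otherwise. A minimal usage sketch, not part of this commit; the model path is a placeholder.

```python
from llama_cpp import Llama

# logits_all=True keeps logits for every evaluated token so that
# completion requests can return logprobs (at the cost of extra memory).
llm = Llama(model_path="path/to/model.gguf", logits_all=True)

out = llm.create_completion(
    "The capital of France is",
    max_tokens=8,
    logprobs=5,  # top-5 logprobs per token; requires logits_all=True
)
print(out["choices"][0]["logprobs"]["top_logprobs"][0])

# With the default logits_all=False, the same call now raises:
# ValueError: logprobs is not supported for models created with logits_all=False
```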
