
Commit 8d981f0

fix memory_seq_rm crash bug

Parent: 7ee7577

File tree

4 files changed: +8 -5 lines


llama_cpp/_internals.py (4 additions, 1 deletion)

@@ -312,7 +312,10 @@ def memory_clear(self, data: bool):
         llama_cpp.llama_memory_clear(self.get_memory(), data)
 
     def memory_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool:
-        return llama_cpp.llama_memory_seq_rm(self.get_memory(), seq_id, p0, p1)
+        if self.ctx is not None and seq_id >= 0:
+            return llama_cpp.llama_memory_seq_rm(self.get_memory(), seq_id, p0, p1)
+        else:
+            return False
 
     def memory_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
         llama_cpp.llama_memory_seq_cp(self.get_memory(), seq_id_src, seq_id_dst, p0, p1)
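
For context, a minimal sketch of how calling code can react to the new boolean-only failure mode; the helper name truncate_sequence and the fallback to memory_clear are assumptions for illustration, not part of this commit:

# Sketch only: with this fix, memory_seq_rm returns False (instead of
# crashing) when the context is gone or seq_id is negative, so a caller
# can fall back to clearing the whole cache.
def truncate_sequence(ctx, n_keep: int) -> None:
    # Drop everything in sequence 0 from position n_keep onward;
    # p1 = -1 means "to the end" in the underlying llama.cpp API.
    if not ctx.memory_seq_rm(0, n_keep, -1):
        # Removal refused: wipe the memory and let the caller re-evaluate.
        ctx.memory_clear(True)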

llama_cpp/llama.py (2 additions, 2 deletions)

@@ -645,7 +645,7 @@ def eval(self, tokens: Sequence[int]):
         Args:
             tokens: The list of tokens to evaluate.
         """
-        self._ctx.memory_seq_rm(-1, self.n_tokens, -1)
+        self._ctx.memory_seq_rm(0, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
             n_past = self.n_tokens

@@ -985,7 +985,7 @@ def generate(
 
         if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
             self.n_tokens = sample_idx
-            self._ctx.memory_seq_rm(-1, self.n_tokens, -1)
+            self._ctx.memory_seq_rm(0, self.n_tokens, -1)
             break
 
         if self.draft_model is not None:
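
These call sites switch from seq_id -1 (all sequences) to 0 because the new guard in memory_seq_rm rejects negative sequence ids; the high-level Llama object only ever populates sequence 0, so the two are equivalent here. A hedged sketch of the rewind pattern these lines implement (the helper name rewind_to is illustrative):

# Sketch of the cache-rewind pattern used in eval()/generate():
# forget everything past n_keep in sequence 0, then let the next
# batch evaluation re-fill the cache from that position onward.
def rewind_to(llm, n_keep: int) -> None:
    llm.n_tokens = n_keep
    llm._ctx.memory_seq_rm(0, llm.n_tokens, -1)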

llama_cpp/server/settings.py (1 addition, 1 deletion)

@@ -114,7 +114,7 @@ class ModelSettings(BaseSettings):
         default=True, description="Whether to use full-size SWA cache"
     )
     kv_unified: bool = Field(
-        default=True, description="use single unified KV buffer for the KV cache of all sequences"
+        default=True, description="enable single unified KV buffer for the KV cache of all sequences"
     )
     # Sampling Params
     last_n_tokens_size: int = Field(
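
For reference, a small usage sketch of the setting documented above; the model path is a placeholder, and constructing ModelSettings directly (rather than via the server CLI or environment variables) is just one way to set it:

from llama_cpp.server.settings import ModelSettings

# Placeholder path; kv_unified toggles the single unified KV buffer
# shared by all sequences, as described in the field above.
settings = ModelSettings(model="model.gguf", kv_unified=True)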

vendor/llama.cpp (submodule pointer updated: 1 addition, 1 deletion)
