
Commit 8d981f0

fix memory_seq_rm crash bug

Parent: 7ee7577

File tree

4 files changed: +8 -5 lines


llama_cpp/_internals.py (4 additions, 1 deletion)

@@ -312,7 +312,10 @@ def memory_clear(self, data: bool):
         llama_cpp.llama_memory_clear(self.get_memory(), data)
 
     def memory_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool:
-        return llama_cpp.llama_memory_seq_rm(self.get_memory(), seq_id, p0, p1)
+        if self.ctx is not None and seq_id >= 0:
+            return llama_cpp.llama_memory_seq_rm(self.get_memory(), seq_id, p0, p1)
+        else:
+            return False
 
     def memory_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
         llama_cpp.llama_memory_seq_cp(self.get_memory(), seq_id_src, seq_id_dst, p0, p1)
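
For context, a minimal sketch of how calling code can react to the new boolean-only failure mode; the helper name truncate_sequence and the fallback to memory_clear are assumptions for illustration, not part of this commit:

# Sketch only: with this fix, memory_seq_rm returns False (instead of
# crashing) when the context is gone or seq_id is negative, so a caller
# can fall back to clearing the whole cache.
def truncate_sequence(ctx, n_keep: int) -> None:
    # Drop everything in sequence 0 from position n_keep onward;
    # p1 = -1 means "to the end" in the underlying llama.cpp API.
    if not ctx.memory_seq_rm(0, n_keep, -1):
        # Removal refused: wipe the memory and let the caller re-evaluate.
        ctx.memory_clear(True)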

llama_cpp/llama.py (2 additions, 2 deletions)

@@ -645,7 +645,7 @@ def eval(self, tokens: Sequence[int]):
         Args:
             tokens: The list of tokens to evaluate.
         """
-        self._ctx.memory_seq_rm(-1, self.n_tokens, -1)
+        self._ctx.memory_seq_rm(0, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
             n_past = self.n_tokens

@@ -985,7 +985,7 @@ def generate(
 
         if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
             self.n_tokens = sample_idx
-            self._ctx.memory_seq_rm(-1, self.n_tokens, -1)
+            self._ctx.memory_seq_rm(0, self.n_tokens, -1)
             break
 
         if self.draft_model is not None:
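
These call sites switch from seq_id -1 (all sequences) to 0 because the new guard in memory_seq_rm rejects negative sequence ids; the high-level Llama object only ever populates sequence 0, so the two are equivalent here. A hedged sketch of the rewind pattern these lines implement (the helper name rewind_to is illustrative):

# Sketch of the cache-rewind pattern used in eval()/generate():
# forget everything past n_keep in sequence 0, then let the next
# batch evaluation re-fill the cache from that position onward.
def rewind_to(llm, n_keep: int) -> None:
    llm.n_tokens = n_keep
    llm._ctx.memory_seq_rm(0, llm.n_tokens, -1)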

llama_cpp/server/settings.py (1 addition, 1 deletion)

@@ -114,7 +114,7 @@ class ModelSettings(BaseSettings):
         default=True, description="Whether to use full-size SWA cache"
     )
     kv_unified: bool = Field(
-        default=True, description="use single unified KV buffer for the KV cache of all sequences"
+        default=True, description="enable single unified KV buffer for the KV cache of all sequences"
     )
     # Sampling Params
     last_n_tokens_size: int = Field(
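
For reference, a small usage sketch of the setting documented above; the model path is a placeholder, and constructing ModelSettings directly (rather than via the server CLI or environment variables) is just one way to set it:

from llama_cpp.server.settings import ModelSettings

# Placeholder path; kv_unified toggles the single unified KV buffer
# shared by all sequences, as described in the field above.
settings = ModelSettings(model="model.gguf", kv_unified=True)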

vendor/llama.cpp (submodule pointer updated: 1 addition, 1 deletion)
