We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent fe331ec · commit 7fc7bc3 — Copy full SHA for 7fc7bc3
llama_cpp/llama.py
@@ -735,10 +735,10 @@ def _create_completion(
735
try:
736
cache_item = self.cache[prompt_tokens]
737
cache_prefix_len = Llama.longest_token_prefix(
738
- cache_item.eval_tokens, prompt_tokens
+ cache_item.input_ids.tolist(), prompt_tokens
739
)
740
eval_prefix_len = Llama.longest_token_prefix(
741
- self.eval_tokens, prompt_tokens
+ self._input_ids.tolist(), prompt_tokens
742
743
if cache_prefix_len > eval_prefix_len:
744
self.load_state(cache_item)
0 commit comments