feat: Update llama.cpp

abetlen · abetlen · commit 159cc4e5d924 · 2024-04-21T20:46:40.000-04:00
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
@@ -181,20 +181,20 @@ def tokenize(self, text: bytes, add_bos: bool, special: bool):
                 )
         return list(tokens[:n_tokens])
 
-    def token_to_piece(self, token: int) -> bytes:
+    def token_to_piece(self, token: int, special: bool = False) -> bytes:
         assert self.model is not None
         buf = ctypes.create_string_buffer(32)
-        llama_cpp.llama_token_to_piece(self.model, token, buf, 32)
+        llama_cpp.llama_token_to_piece(self.model, token, buf, 32, special)
         return bytes(buf)
 
-    def detokenize(self, tokens: List[int]) -> bytes:
+    def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         assert self.model is not None
         output = b""
         size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
             n = llama_cpp.llama_token_to_piece(
-                self.model, llama_cpp.llama_token(token), buffer, size
+                self.model, llama_cpp.llama_token(token), buffer, size, special
             )
             assert n <= size
             output += bytes(buffer[:n])
@@ -597,13 +597,13 @@ def _tokenize(model: _LlamaModel, text: str, add_bos: bool, special: bool) -> li
     return list(result)
 
 
-def _token_to_piece(model: _LlamaModel, token: int) -> str:
+def _token_to_piece(model: _LlamaModel, token: int, special: bool = False) -> str:
     assert model.model is not None
     result = (ctypes.c_char * 8)(0)
-    n_tokens = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+    n_tokens = llama_cpp.llama_token_to_piece(model.model, token, result, len(result), special)
     if n_tokens < 0:
         result = (ctypes.c_char * -n_tokens)(0)
-        check = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+        check = llama_cpp.llama_token_to_piece(model.model, token, result, len(result), special)
         if check != -n_tokens:
             raise RuntimeError(f"Failed to get piece: token={token}")
     else:
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
@@ -2380,6 +2380,18 @@ def llama_token_get_type(
     ...
 
 
+# // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+# LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
+@ctypes_function(
+    "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool
+)
+def llama_token_is_eog(
+    model: llama_model_p, token: Union[llama_token, int], /
+) -> bool:
+    """Check if the token is supposed to end generation (end-of-generation, e.g. EOS, EOT, etc.)"""
+    ...
+
+
 # // Special tokens
 
 
@@ -2434,7 +2446,7 @@ def llama_add_eos_token(model: llama_model_p, /) -> int:
     ...
 
 
-# // codellama infill tokens
+# // Codellama infill tokens
 # LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
 @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token)
 def llama_token_prefix(model: llama_model_p) -> int:
@@ -2524,18 +2536,21 @@ def llama_tokenize(
 # // Uses the vocabulary in the provided context.
 # // Does not write null terminator to the buffer.
 # // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+# // @param special If true, special tokens are rendered in the output.
 # LLAMA_API int32_t llama_token_to_piece(
 #           const struct llama_model * model,
 #                        llama_token   token,
 #                               char * buf,
-#                            int32_t   length);
+#                            int32_t   length,
+#                               bool   special);
 @ctypes_function(
     "llama_token_to_piece",
     [
         llama_model_p_ctypes,
         llama_token,
         ctypes.c_char_p,
         ctypes.c_int32,
+        ctypes.c_bool,
     ],
     ctypes.c_int32,
 )
@@ -2544,13 +2559,20 @@ def llama_token_to_piece(
     token: Union[llama_token, int],
     buf: Union[ctypes.c_char_p, bytes, CtypesArray[ctypes.c_char]],
     length: Union[ctypes.c_int, int],
+    special: Union[ctypes.c_bool, bool],
     /,
 ) -> int:
     """Token Id -> Piece.
     Uses the vocabulary in the provided context.
     Does not write null terminator to the buffer.
     User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    """
+
+    Args:
+        model: The model whose vocabulary is used for the conversion.
+        token: The token to convert.
+        buf: The buffer to write the piece to.
+        length: The length of the buffer.
+        special: If true, special tokens are rendered in the output."""
     ...
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 3b8f1ec4b18770531d0b1d792f3edf08254e4f0c
+Subproject commit 5cf5e7d490dfdd2e70bface2d35dfd14aa44b4fb