Commit 6d6be16

Merge pull request #267 from utilityai/update-llama-cpp-2024-04-21
Updated llama-cpp (bot)
2 parents 6dd0d12 + 34054a1

7 files changed: 56 additions, 28 deletions

embeddings/src/main.rs

Lines changed: 2 additions & 2 deletions

@@ -20,7 +20,7 @@ use llama_cpp_2::ggml_time_us;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::LlamaModelParams;
-use llama_cpp_2::model::AddBos;
+use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::model::LlamaModel;

 #[derive(clap::Parser, Debug, Clone)]
@@ -138,7 +138,7 @@ fn main() -> Result<()> {
         eprintln!("Prompt {i}");
         for token in token_line {
             // Attempt to convert token to string and print it; if it fails, print the token instead
-            match model.token_to_str(*token) {
+            match model.token_to_str(*token, Special::Tokenize) {
                 Ok(token_str) => eprintln!(" {} --> {}", token, token_str),
                 Err(e) => {
                     eprintln!("Failed to convert token to string, error: {}", e);

llama-cpp-2/src/context/params.rs

Lines changed: 0 additions & 2 deletions

@@ -2,8 +2,6 @@
 use std::fmt::Debug;
 use std::num::NonZeroU32;

-use llama_cpp_sys_2;
-
 /// A rusty wrapper around `rope_scaling_type`.
 #[repr(i8)]
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]

llama-cpp-2/src/model.rs

Lines changed: 27 additions & 9 deletions

@@ -51,6 +51,15 @@ pub enum AddBos {
     Never,
 }

+/// How to determine if we should tokenize special tokens
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Special {
+    /// Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.
+    Tokenize,
+    /// Treat special and/or control tokens as plaintext.
+    Plaintext,
+}
+
 unsafe impl Send for LlamaModel {}

 unsafe impl Sync for LlamaModel {}
@@ -71,10 +80,11 @@ impl LlamaModel {
     /// Get all tokens in the model.
     pub fn tokens(
         &self,
+        special: Special,
     ) -> impl Iterator<Item = (LlamaToken, Result<String, TokenToStringError>)> + '_ {
         (0..self.n_vocab())
             .map(LlamaToken::new)
-            .map(|llama_token| (llama_token, self.token_to_str(llama_token)))
+            .map(move |llama_token| (llama_token, self.token_to_str(llama_token, special)))
     }

     /// Get the beginning of stream token.
@@ -103,27 +113,27 @@ impl LlamaModel {
     /// # Errors
     ///
     /// See [`TokenToStringError`] for more information.
-    pub fn token_to_str(&self, token: LlamaToken) -> Result<String, TokenToStringError> {
-        self.token_to_str_with_size(token, 32)
+    pub fn token_to_str(&self, token: LlamaToken, special: Special) -> Result<String, TokenToStringError> {
+        self.token_to_str_with_size(token, 32, special)
     }

     /// Convert single token to bytes.
     ///
     /// # Errors
     ///
     /// See [`TokenToStringError`] for more information.
-    pub fn token_to_bytes(&self, token: LlamaToken) -> Result<Vec<u8>, TokenToStringError> {
-        self.token_to_bytes_with_size(token, 32)
+    pub fn token_to_bytes(&self, token: LlamaToken, special: Special) -> Result<Vec<u8>, TokenToStringError> {
+        self.token_to_bytes_with_size(token, 32, special)
     }

     /// Convert a vector of tokens to a single string.
     ///
     /// # Errors
     ///
     /// See [`TokenToStringError`] for more information.
-    pub fn tokens_to_str(&self, tokens: &[LlamaToken]) -> Result<String, TokenToStringError> {
+    pub fn tokens_to_str(&self, tokens: &[LlamaToken], special: Special) -> Result<String, TokenToStringError> {
         let mut builder = String::with_capacity(tokens.len() * 4);
-        for str in tokens.iter().copied().map(|t| self.token_to_str(t)) {
+        for str in tokens.iter().copied().map(|t| self.token_to_str(t, special)) {
             builder += &str?;
         }
         Ok(builder)
@@ -236,8 +246,9 @@ impl LlamaModel {
         &self,
         token: LlamaToken,
         buffer_size: usize,
+        special: Special,
     ) -> Result<String, TokenToStringError> {
-        let bytes = self.token_to_bytes_with_size(token, buffer_size)?;
+        let bytes = self.token_to_bytes_with_size(token, buffer_size, special)?;
         Ok(String::from_utf8(bytes)?)
     }

@@ -259,11 +270,13 @@ impl LlamaModel {
         &self,
         token: LlamaToken,
         buffer_size: usize,
+        special: Special,
     ) -> Result<Vec<u8>, TokenToStringError> {
         if token == self.token_nl() {
             return Ok(String::from("\n").into_bytes());
         }

+        // unsure what to do with this in the face of the 'special' arg
         match self.token_type(token) {
             LlamaTokenType::Normal | LlamaTokenType::UserDefined => {}
             LlamaTokenType::Control => {
@@ -279,12 +292,17 @@ impl LlamaModel {
             }
         }

+        let special = match special {
+            Special::Tokenize => true,
+            Special::Plaintext => false,
+        };
+
         let string = CString::new(vec![b'*'; buffer_size]).expect("no null");
         let len = string.as_bytes().len();
         let len = c_int::try_from(len).expect("length fits into c_int");
         let buf = string.into_raw();
         let size = unsafe {
-            llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len)
+            llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len, special)
        };

         match size {
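
For callers, the practical effect of this change is that every token-to-text conversion now takes a `Special` argument: `Special::Tokenize` renders special/control tokens as their text form, while `Special::Plaintext` keeps treating them as ordinary text. A minimal sketch of the updated call sites (the helper function and its error handling are illustrative, not part of this commit; `LlamaToken` is assumed to live at `llama_cpp_2::token::LlamaToken`):

use llama_cpp_2::model::{LlamaModel, Special};
use llama_cpp_2::token::LlamaToken;

/// Print each token alongside its text, then the whole sequence.
/// Illustrative helper showing the post-change signatures.
fn dump_tokens(model: &LlamaModel, tokens: &[LlamaToken]) -> Result<(), Box<dyn std::error::Error>> {
    for token in tokens {
        // Per-token conversion now requires the extra `Special` argument.
        let piece = model.token_to_str(*token, Special::Tokenize)?;
        eprintln!("{} --> {}", token, piece);
    }

    // Whole-sequence conversion follows the same pattern.
    let text = model.tokens_to_str(tokens, Special::Plaintext)?;
    eprintln!("{}", text);
    Ok(())
}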

llama-cpp-sys-2/Cargo.toml

Lines changed: 5 additions & 1 deletion

@@ -33,7 +33,11 @@ include = [
     "/llama.cpp/llama.h",
     "/llama.cpp/unicode.h",
     "/llama.cpp/unicode.cpp",
-    "/llama.cpp/ggml-common.h"
+    "/llama.cpp/unicode-data.h",
+    "/llama.cpp/unicode-data.h",
+    "/llama.cpp/unicode-data.cpp",
+    "/llama.cpp/ggml-common.h",
+    "/llama.cpp/ggml-cuda"
 ]

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

llama-cpp-sys-2/build.rs

Lines changed: 18 additions & 10 deletions

@@ -1,4 +1,5 @@
 use std::env;
+use std::ffi::OsStr;
 use std::path::Path;
 use std::path::PathBuf;

@@ -67,6 +68,12 @@ fn main() {
             .cuda(true)
             .flag("-arch=all")
             .file("llama.cpp/ggml-cuda.cu")
+            .files(std::fs::read_dir("llama.cpp/ggml-cuda")
+                .expect("failed to read 'llama.cpp/ggml-cuda'")
+                .map(|e| e.expect("failed to ready entry").path())
+                .filter(|p| p.extension().is_some_and(|it| it == OsStr::new("cu")))
+            )
+            .include("llama.cpp/ggml-cuda")
             .include("llama.cpp");

         if ggml_cuda.get_compiler().is_like_msvc() {
@@ -75,9 +82,9 @@ fn main() {
             ggml_cuda.flag("-std=c++11").std("c++11");
         }

-        ggml.define("GGML_USE_CUBLAS", None);
-        ggml_cuda.define("GGML_USE_CUBLAS", None);
-        llama_cpp.define("GGML_USE_CUBLAS", None);
+        ggml.define("GGML_USE_CUDA", None);
+        ggml_cuda.define("GGML_USE_CUDA", None);
+        llama_cpp.define("GGML_USE_CUDA", None);
     }

     for build in [&mut ggml, &mut llama_cpp] {
@@ -177,7 +184,8 @@ fn main() {
         .include("llama.cpp")
         .std("c++11")
         .file("llama.cpp/llama.cpp")
-        .file("llama.cpp/unicode.cpp");
+        .file("llama.cpp/unicode.cpp")
+        .file("llama.cpp/unicode-data.cpp");

     // Remove debug log output from `llama.cpp`
     let is_release = env::var("PROFILE").unwrap() == "release";
@@ -193,18 +201,18 @@ fn main() {
     }

     if let Some(ggml_cuda) = ggml_cuda {
-        println!("compiling ggml-cuda");
+        eprintln!("compiling ggml-cuda");
         ggml_cuda.compile("ggml-cuda");
-        println!("compiled ggml-cuda");
+        eprintln!("compiled ggml-cuda");
     }

-    println!("compiling ggml");
+    eprintln!("compiling ggml");
     ggml.compile("ggml");
-    println!("compiled ggml");
+    eprintln!("compiled ggml");

-    println!("compiling llama");
+    eprintln!("compiling llama");
     llama_cpp.compile("llama");
-    println!("compiled llama");
+    eprintln!("compiled llama");

     let header = "llama.cpp/llama.h";
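
Two things are happening in the build script: the CUDA kernels that upstream llama.cpp split out into per-kernel .cu files under llama.cpp/ggml-cuda are now compiled in addition to ggml-cuda.cu, and the preprocessor define follows llama.cpp's rename from GGML_USE_CUBLAS to GGML_USE_CUDA. Logging also moves from println! to eprintln!, since a build script's stdout is reserved for cargo: directives while stderr is visible with cargo build -vv. A standalone sketch of the directory-scanning pattern (the helper name is illustrative, not from the commit):

use std::ffi::OsStr;
use std::path::PathBuf;

/// Collect every `.cu` file in a directory, mirroring the pattern the build
/// script feeds into `cc::Build::files`.
fn cuda_sources(dir: &str) -> Vec<PathBuf> {
    std::fs::read_dir(dir)
        .expect("failed to read cuda source directory")
        .map(|entry| entry.expect("failed to read directory entry").path())
        .filter(|path| path.extension().is_some_and(|ext| ext == OsStr::new("cu")))
        .collect()
}

fn main() {
    // In the real build script these paths go into the `cc::Build` for ggml-cuda.
    for source in cuda_sources("llama.cpp/ggml-cuda") {
        eprintln!("cuda source: {}", source.display());
    }
}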

llama-cpp-sys-2/llama.cpp (submodule pointer updated)

simple/src/main.rs

Lines changed: 3 additions & 3 deletions

@@ -15,7 +15,7 @@ use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
 use llama_cpp_2::model::params::LlamaModelParams;
-use llama_cpp_2::model::AddBos;
+use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::token::data_array::LlamaTokenDataArray;
 use std::ffi::CString;
@@ -214,7 +214,7 @@ either reduce n_len or increase n_ctx"
     eprintln!();

     for token in &tokens_list {
-        eprint!("{}", model.token_to_str(*token)?);
+        eprint!("{}", model.token_to_str(*token, Special::Tokenize)?);
     }

     std::io::stderr().flush()?;
@@ -259,7 +259,7 @@ either reduce n_len or increase n_ctx"
            break;
        }

-        let output_bytes = model.token_to_bytes(new_token_id)?;
+        let output_bytes = model.token_to_bytes(new_token_id, Special::Tokenize)?;
         // use `Decoder.decode_to_string()` to avoid the intermediate buffer
         let mut output_string = String::with_capacity(32);
         let _decode_result = decoder.decode_to_string(&output_bytes, &mut output_string, false);
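
The generation loop keeps converting tokens with token_to_bytes and feeding a streaming UTF-8 decoder rather than calling token_to_str per token, because a single token can end in the middle of a multi-byte character; a stateful decoder buffers the partial bytes across tokens. A minimal sketch of that pattern with the new signature (the helper is illustrative and assumes the decoder comes from the encoding_rs crate):

use llama_cpp_2::model::{LlamaModel, Special};
use llama_cpp_2::token::LlamaToken;

/// Stream tokens to stdout as UTF-8 text. The stateful decoder handles
/// multi-byte characters split across token boundaries, which a per-token
/// `token_to_str` call would report as invalid UTF-8.
fn print_stream(model: &LlamaModel, tokens: &[LlamaToken]) -> Result<(), Box<dyn std::error::Error>> {
    let mut decoder = encoding_rs::UTF_8.new_decoder();
    for &token in tokens {
        let bytes = model.token_to_bytes(token, Special::Tokenize)?;
        let mut piece = String::with_capacity(32);
        // `false` means more input follows, so partial sequences stay buffered.
        let _ = decoder.decode_to_string(&bytes, &mut piece, false);
        print!("{}", piece);
    }
    Ok(())
}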
