Commit 6d6be16

Merge pull request #267 from utilityai/update-llama-cpp-2024-04-21
Updated llama-cpp (bot)
2 parents 6dd0d12 + 34054a1

7 files changed: 56 additions, 28 deletions

embeddings/src/main.rs

Lines changed: 2 additions & 2 deletions

@@ -20,7 +20,7 @@ use llama_cpp_2::ggml_time_us;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::LlamaModelParams;
-use llama_cpp_2::model::AddBos;
+use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::model::LlamaModel;

 #[derive(clap::Parser, Debug, Clone)]
@@ -138,7 +138,7 @@ fn main() -> Result<()> {
         eprintln!("Prompt {i}");
         for token in token_line {
             // Attempt to convert token to string and print it; if it fails, print the token instead
-            match model.token_to_str(*token) {
+            match model.token_to_str(*token, Special::Tokenize) {
                 Ok(token_str) => eprintln!(" {} --> {}", token, token_str),
                 Err(e) => {
                     eprintln!("Failed to convert token to string, error: {}", e);

llama-cpp-2/src/context/params.rs

Lines changed: 0 additions & 2 deletions

@@ -2,8 +2,6 @@
 use std::fmt::Debug;
 use std::num::NonZeroU32;

-use llama_cpp_sys_2;
-
 /// A rusty wrapper around `rope_scaling_type`.
 #[repr(i8)]
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]

llama-cpp-2/src/model.rs

Lines changed: 27 additions & 9 deletions

@@ -51,6 +51,15 @@ pub enum AddBos {
     Never,
 }

+/// How to determine if we should tokenize special tokens
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Special {
+    /// Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.
+    Tokenize,
+    /// Treat special and/or control tokens as plaintext.
+    Plaintext,
+}
+
 unsafe impl Send for LlamaModel {}

 unsafe impl Sync for LlamaModel {}
@@ -71,10 +80,11 @@ impl LlamaModel {
     /// Get all tokens in the model.
     pub fn tokens(
         &self,
+        special: Special,
     ) -> impl Iterator<Item = (LlamaToken, Result<String, TokenToStringError>)> + '_ {
         (0..self.n_vocab())
             .map(LlamaToken::new)
-            .map(|llama_token| (llama_token, self.token_to_str(llama_token)))
+            .map(move |llama_token| (llama_token, self.token_to_str(llama_token, special)))
     }

     /// Get the beginning of stream token.
@@ -103,27 +113,27 @@ impl LlamaModel {
     /// # Errors
     ///
     /// See [`TokenToStringError`] for more information.
-    pub fn token_to_str(&self, token: LlamaToken) -> Result<String, TokenToStringError> {
-        self.token_to_str_with_size(token, 32)
+    pub fn token_to_str(&self, token: LlamaToken, special: Special) -> Result<String, TokenToStringError> {
+        self.token_to_str_with_size(token, 32, special)
     }

     /// Convert single token to bytes.
     ///
     /// # Errors
     ///
     /// See [`TokenToStringError`] for more information.
-    pub fn token_to_bytes(&self, token: LlamaToken) -> Result<Vec<u8>, TokenToStringError> {
-        self.token_to_bytes_with_size(token, 32)
+    pub fn token_to_bytes(&self, token: LlamaToken, special: Special) -> Result<Vec<u8>, TokenToStringError> {
+        self.token_to_bytes_with_size(token, 32, special)
     }

     /// Convert a vector of tokens to a single string.
     ///
     /// # Errors
     ///
     /// See [`TokenToStringError`] for more information.
-    pub fn tokens_to_str(&self, tokens: &[LlamaToken]) -> Result<String, TokenToStringError> {
+    pub fn tokens_to_str(&self, tokens: &[LlamaToken], special: Special) -> Result<String, TokenToStringError> {
         let mut builder = String::with_capacity(tokens.len() * 4);
-        for str in tokens.iter().copied().map(|t| self.token_to_str(t)) {
+        for str in tokens.iter().copied().map(|t| self.token_to_str(t, special)) {
             builder += &str?;
         }
         Ok(builder)
@@ -236,8 +246,9 @@ impl LlamaModel {
         &self,
         token: LlamaToken,
         buffer_size: usize,
+        special: Special,
     ) -> Result<String, TokenToStringError> {
-        let bytes = self.token_to_bytes_with_size(token, buffer_size)?;
+        let bytes = self.token_to_bytes_with_size(token, buffer_size, special)?;
         Ok(String::from_utf8(bytes)?)
     }

@@ -259,11 +270,13 @@ impl LlamaModel {
         &self,
         token: LlamaToken,
         buffer_size: usize,
+        special: Special,
     ) -> Result<Vec<u8>, TokenToStringError> {
         if token == self.token_nl() {
             return Ok(String::from("\n").into_bytes());
         }

+        // unsure what to do with this in the face of the 'special' arg
         match self.token_type(token) {
             LlamaTokenType::Normal | LlamaTokenType::UserDefined => {}
             LlamaTokenType::Control => {
@@ -279,12 +292,17 @@ impl LlamaModel {
             }
         }

+        let special = match special {
+            Special::Tokenize => true,
+            Special::Plaintext => false,
+        };
+
         let string = CString::new(vec![b'*'; buffer_size]).expect("no null");
         let len = string.as_bytes().len();
         let len = c_int::try_from(len).expect("length fits into c_int");
         let buf = string.into_raw();
         let size = unsafe {
-            llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len)
+            llama_cpp_sys_2::llama_token_to_piece(self.model.as_ptr(), token.0, buf, len, special)
        };

         match size {
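
For callers, the practical effect of this change is that every token-to-text conversion now takes a `Special` argument: `Special::Tokenize` renders special/control tokens as their text form, while `Special::Plaintext` keeps treating them as ordinary text. A minimal sketch of the updated call sites (the helper function and its error handling are illustrative, not part of this commit; `LlamaToken` is assumed to live at `llama_cpp_2::token::LlamaToken`):

use llama_cpp_2::model::{LlamaModel, Special};
use llama_cpp_2::token::LlamaToken;

/// Print each token alongside its text, then the whole sequence.
/// Illustrative helper showing the post-change signatures.
fn dump_tokens(model: &LlamaModel, tokens: &[LlamaToken]) -> Result<(), Box<dyn std::error::Error>> {
    for token in tokens {
        // Per-token conversion now requires the extra `Special` argument.
        let piece = model.token_to_str(*token, Special::Tokenize)?;
        eprintln!("{} --> {}", token, piece);
    }

    // Whole-sequence conversion follows the same pattern.
    let text = model.tokens_to_str(tokens, Special::Plaintext)?;
    eprintln!("{}", text);
    Ok(())
}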

llama-cpp-sys-2/Cargo.toml

Lines changed: 5 additions & 1 deletion

@@ -33,7 +33,11 @@ include = [
     "/llama.cpp/llama.h",
     "/llama.cpp/unicode.h",
     "/llama.cpp/unicode.cpp",
-    "/llama.cpp/ggml-common.h"
+    "/llama.cpp/unicode-data.h",
+    "/llama.cpp/unicode-data.h",
+    "/llama.cpp/unicode-data.cpp",
+    "/llama.cpp/ggml-common.h",
+    "/llama.cpp/ggml-cuda"
 ]

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

llama-cpp-sys-2/build.rs

Lines changed: 18 additions & 10 deletions

@@ -1,4 +1,5 @@
 use std::env;
+use std::ffi::OsStr;
 use std::path::Path;
 use std::path::PathBuf;

@@ -67,6 +68,12 @@ fn main() {
             .cuda(true)
             .flag("-arch=all")
             .file("llama.cpp/ggml-cuda.cu")
+            .files(std::fs::read_dir("llama.cpp/ggml-cuda")
+                .expect("failed to read 'llama.cpp/ggml-cuda'")
+                .map(|e| e.expect("failed to ready entry").path())
+                .filter(|p| p.extension().is_some_and(|it| it == OsStr::new("cu")))
+            )
+            .include("llama.cpp/ggml-cuda")
             .include("llama.cpp");

         if ggml_cuda.get_compiler().is_like_msvc() {
@@ -75,9 +82,9 @@ fn main() {
             ggml_cuda.flag("-std=c++11").std("c++11");
         }

-        ggml.define("GGML_USE_CUBLAS", None);
-        ggml_cuda.define("GGML_USE_CUBLAS", None);
-        llama_cpp.define("GGML_USE_CUBLAS", None);
+        ggml.define("GGML_USE_CUDA", None);
+        ggml_cuda.define("GGML_USE_CUDA", None);
+        llama_cpp.define("GGML_USE_CUDA", None);
     }

     for build in [&mut ggml, &mut llama_cpp] {
@@ -177,7 +184,8 @@ fn main() {
         .include("llama.cpp")
         .std("c++11")
         .file("llama.cpp/llama.cpp")
-        .file("llama.cpp/unicode.cpp");
+        .file("llama.cpp/unicode.cpp")
+        .file("llama.cpp/unicode-data.cpp");

     // Remove debug log output from `llama.cpp`
     let is_release = env::var("PROFILE").unwrap() == "release";
@@ -193,18 +201,18 @@ fn main() {
     }

     if let Some(ggml_cuda) = ggml_cuda {
-        println!("compiling ggml-cuda");
+        eprintln!("compiling ggml-cuda");
         ggml_cuda.compile("ggml-cuda");
-        println!("compiled ggml-cuda");
+        eprintln!("compiled ggml-cuda");
     }

-    println!("compiling ggml");
+    eprintln!("compiling ggml");
     ggml.compile("ggml");
-    println!("compiled ggml");
+    eprintln!("compiled ggml");

-    println!("compiling llama");
+    eprintln!("compiling llama");
     llama_cpp.compile("llama");
-    println!("compiled llama");
+    eprintln!("compiled llama");

     let header = "llama.cpp/llama.h";
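
Two things are happening in the build script: the CUDA kernels that upstream llama.cpp split out into per-kernel .cu files under llama.cpp/ggml-cuda are now compiled in addition to ggml-cuda.cu, and the preprocessor define follows llama.cpp's rename from GGML_USE_CUBLAS to GGML_USE_CUDA. Logging also moves from println! to eprintln!, since a build script's stdout is reserved for cargo: directives while stderr is visible with cargo build -vv. A standalone sketch of the directory-scanning pattern (the helper name is illustrative, not from the commit):

use std::ffi::OsStr;
use std::path::PathBuf;

/// Collect every `.cu` file in a directory, mirroring the pattern the build
/// script feeds into `cc::Build::files`.
fn cuda_sources(dir: &str) -> Vec<PathBuf> {
    std::fs::read_dir(dir)
        .expect("failed to read cuda source directory")
        .map(|entry| entry.expect("failed to read directory entry").path())
        .filter(|path| path.extension().is_some_and(|ext| ext == OsStr::new("cu")))
        .collect()
}

fn main() {
    // In the real build script these paths go into the `cc::Build` for ggml-cuda.
    for source in cuda_sources("llama.cpp/ggml-cuda") {
        eprintln!("cuda source: {}", source.display());
    }
}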

llama-cpp-sys-2/llama.cpp (submodule pointer updated)

simple/src/main.rs

Lines changed: 3 additions & 3 deletions

@@ -15,7 +15,7 @@ use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
 use llama_cpp_2::model::params::LlamaModelParams;
-use llama_cpp_2::model::AddBos;
+use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::token::data_array::LlamaTokenDataArray;
 use std::ffi::CString;
@@ -214,7 +214,7 @@ either reduce n_len or increase n_ctx"
     eprintln!();

     for token in &tokens_list {
-        eprint!("{}", model.token_to_str(*token)?);
+        eprint!("{}", model.token_to_str(*token, Special::Tokenize)?);
     }

     std::io::stderr().flush()?;
@@ -259,7 +259,7 @@ either reduce n_len or increase n_ctx"
            break;
        }

-        let output_bytes = model.token_to_bytes(new_token_id)?;
+        let output_bytes = model.token_to_bytes(new_token_id, Special::Tokenize)?;
         // use `Decoder.decode_to_string()` to avoid the intermediate buffer
         let mut output_string = String::with_capacity(32);
         let _decode_result = decoder.decode_to_string(&output_bytes, &mut output_string, false);
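
The generation loop keeps converting tokens with token_to_bytes and feeding a streaming UTF-8 decoder rather than calling token_to_str per token, because a single token can end in the middle of a multi-byte character; a stateful decoder buffers the partial bytes across tokens. A minimal sketch of that pattern with the new signature (the helper is illustrative and assumes the decoder comes from the encoding_rs crate):

use llama_cpp_2::model::{LlamaModel, Special};
use llama_cpp_2::token::LlamaToken;

/// Stream tokens to stdout as UTF-8 text. The stateful decoder handles
/// multi-byte characters split across token boundaries, which a per-token
/// `token_to_str` call would report as invalid UTF-8.
fn print_stream(model: &LlamaModel, tokens: &[LlamaToken]) -> Result<(), Box<dyn std::error::Error>> {
    let mut decoder = encoding_rs::UTF_8.new_decoder();
    for &token in tokens {
        let bytes = model.token_to_bytes(token, Special::Tokenize)?;
        let mut piece = String::with_capacity(32);
        // `false` means more input follows, so partial sequences stay buffered.
        let _ = decoder.decode_to_string(&bytes, &mut piece, false);
        print!("{}", piece);
    }
    Ok(())
}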
