Commit b6523a6

Merge pull request #68 from utilityai/update-llama-cpp-2024-02-05
updated llama.cpp
2 parents 7c95eab + aec18f1 commit b6523a6

File tree: 8 files changed (+78 / -59 lines changed)


llama-cpp-2/benches/grammar_bias.rs

Lines changed: 2 additions & 2 deletions
@@ -30,9 +30,9 @@ fn criterion_benchmark(c: &mut Criterion) {
         .unwrap();
     let backend = LlamaBackend::init().unwrap();
     let model_params = LlamaModelParams::default();
-    let model = LlamaModel::load_from_file(&backend, &file, &model_params).unwrap();
+    let model = LlamaModel::load_from_file(&backend, file, &model_params).unwrap();
     let mut ctx = model
-        .new_context(&backend, &LlamaContextParams::default())
+        .new_context(&backend, LlamaContextParams::default())
        .unwrap();
     let grammar = LlamaGrammar::from_str(include_str!("../src/grammar/json.gbnf")).unwrap();
 
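
The two hunks above reflect a small call-convention change: the model path and the context parameters are now passed by value rather than by reference. Pulled out of the bench into a standalone sketch (the path comes from the caller; everything else mirrors the diff):

    use llama_cpp_2::context::params::LlamaContextParams;
    use llama_cpp_2::llama_backend::LlamaBackend;
    use llama_cpp_2::model::params::LlamaModelParams;
    use llama_cpp_2::model::LlamaModel;
    use std::path::PathBuf;

    fn load_model(file: PathBuf) {
        let backend = LlamaBackend::init().unwrap();
        let model_params = LlamaModelParams::default();
        // `file` and the context params are moved into the calls, as in the bench.
        let model = LlamaModel::load_from_file(&backend, file, &model_params).unwrap();
        let _ctx = model
            .new_context(&backend, LlamaContextParams::default())
            .unwrap();
    }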

llama-cpp-2/examples/simple.rs

Lines changed: 25 additions & 20 deletions
@@ -1,21 +1,20 @@
 //! This is an translation of simple.cpp in llama.cpp using llama-cpp-2.
-#![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
+#![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation, clippy::cast_precision_loss, clippy::cast_sign_loss)]
 
-use std::io::Write;
-use std::num::NonZeroU32;
-use std::path::PathBuf;
-use std::time::Duration;
+use anyhow::{bail, Context, Result};
 use clap::Parser;
 use llama_cpp_2::context::params::LlamaContextParams;
-use llama_cpp_2::llama_backend::LlamaBackend;
-use llama_cpp_2::model::LlamaModel;
-use llama_cpp_2::model::params::LlamaModelParams;
-use anyhow::{bail, Context, Result};
 use llama_cpp_2::ggml_time_us;
+use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
-use llama_cpp_2::token::data_array::LlamaTokenDataArray;
+use llama_cpp_2::model::params::LlamaModelParams;
 use llama_cpp_2::model::AddBos;
-
+use llama_cpp_2::model::LlamaModel;
+use llama_cpp_2::token::data_array::LlamaTokenDataArray;
+use std::io::Write;
+use std::num::NonZeroU32;
+use std::path::PathBuf;
+use std::time::Duration;
 
 #[derive(clap::Parser)]
 struct Args {
@@ -30,7 +29,6 @@ struct Args {
     disable_gpu: bool,
 }
 
-
 fn main() -> Result<()> {
     let params = Args::parse();
 
@@ -60,12 +58,14 @@ fn main() -> Result<()> {
         .with_n_ctx(NonZeroU32::new(2048))
         .with_seed(1234);
 
-    let mut ctx = model.new_context(&backend, ctx_params)
+    let mut ctx = model
+        .new_context(&backend, ctx_params)
         .with_context(|| "unable to create the llama_context")?;
 
     // tokenize the prompt
 
-    let tokens_list = model.str_to_token(&params.prompt, AddBos::Always)
+    let tokens_list = model
+        .str_to_token(&params.prompt, AddBos::Always)
         .with_context(|| format!("failed to tokenize {}", params.prompt))?;
 
     let n_cxt = ctx.n_ctx() as i32;
@@ -75,8 +75,10 @@ fn main() -> Result<()> {
 
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if n_kv_req > n_cxt {
-        bail!("n_kv_req > n_ctx, the required kv cache size is not big enough
-either reduce n_len or increase n_ctx")
+        bail!(
+            "n_kv_req > n_ctx, the required kv cache size is not big enough
+either reduce n_len or increase n_ctx"
+        )
     }
 
     // print the prompt token-by-token
@@ -137,7 +139,6 @@ either reduce n_len or increase n_ctx")
         ctx.decode(&mut batch).with_context(|| "failed to eval")?;
 
         n_decode += 1;
-
     }
 
     eprintln!("\n");
@@ -146,10 +147,14 @@ either reduce n_len or increase n_ctx")
 
     let duration = Duration::from_micros((t_main_end - t_main_start) as u64);
 
-    eprintln!("decoded {} tokens in {:.2} s, speed {:.2} t/s\n", n_decode, duration.as_secs_f32(), n_decode as f32 / duration.as_secs_f32());
+    eprintln!(
+        "decoded {} tokens in {:.2} s, speed {:.2} t/s\n",
+        n_decode,
+        duration.as_secs_f32(),
+        n_decode as f32 / duration.as_secs_f32()
+    );
 
     println!("{}", ctx.timings());
 
     Ok(())
-
-}
+}
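
The multi-line eprintln! above only reports decode throughput; the arithmetic it performs is shown below as a self-contained sketch with made-up timestamps (ggml_time_us returns microseconds as i64 in the example; the concrete numbers here are illustrative only):

    use std::time::Duration;

    fn main() {
        // Stand-ins for the two ggml_time_us() readings taken around the decode loop.
        let t_main_start: i64 = 0;
        let t_main_end: i64 = 2_500_000; // pretend decoding took 2.5 seconds
        let n_decode = 100; // tokens produced in that window

        let duration = Duration::from_micros((t_main_end - t_main_start) as u64);
        eprintln!(
            "decoded {} tokens in {:.2} s, speed {:.2} t/s",
            n_decode,
            duration.as_secs_f32(),
            n_decode as f32 / duration.as_secs_f32()
        );
    }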

llama-cpp-2/src/context/params.rs

Lines changed: 21 additions & 21 deletions
@@ -19,8 +19,8 @@ pub enum RopeScalingType {
 
 /// Create a `RopeScalingType` from a `c_int` - returns `RopeScalingType::ScalingUnspecified` if
 /// the value is not recognized.
-impl From<i8> for RopeScalingType {
-    fn from(value: i8) -> Self {
+impl From<i32> for RopeScalingType {
+    fn from(value: i32) -> Self {
         match value {
             0 => Self::None,
             1 => Self::Linear,
@@ -31,7 +31,7 @@ impl From<i8> for RopeScalingType {
 }
 
 /// Create a `c_int` from a `RopeScalingType`.
-impl From<RopeScalingType> for i8 {
+impl From<RopeScalingType> for i32 {
     fn from(value: RopeScalingType) -> Self {
         match value {
             RopeScalingType::None => 0,
@@ -84,7 +84,7 @@ impl LlamaContextParams {
     /// let params = params.with_seed(1234);
     /// assert_eq!(params.seed(), 1234);
     /// ```
-    pub fn with_seed(mut self, seed: u32) -> Self {
+    #[must_use] pub fn with_seed(mut self, seed: u32) -> Self {
         self.context_params.seed = seed;
         self
     }
@@ -99,7 +99,7 @@ impl LlamaContextParams {
     /// .with_seed(1234);
     /// assert_eq!(params.seed(), 1234);
     /// ```
-    pub fn seed(&self) -> u32 {
+    #[must_use] pub fn seed(&self) -> u32 {
         self.context_params.seed
     }
 
@@ -114,8 +114,8 @@ impl LlamaContextParams {
     /// let params = params.with_n_ctx(NonZeroU32::new(2048));
     /// assert_eq!(params.n_ctx(), NonZeroU32::new(2048));
     /// ```
-    pub fn with_n_ctx(mut self, n_ctx: Option<NonZeroU32>) -> Self {
-        self.context_params.n_ctx = n_ctx.map_or(0, |n_ctx| n_ctx.get());
+    #[must_use] pub fn with_n_ctx(mut self, n_ctx: Option<NonZeroU32>) -> Self {
+        self.context_params.n_ctx = n_ctx.map_or(0, std::num::NonZeroU32::get);
         self
     }
 
@@ -128,11 +128,11 @@ impl LlamaContextParams {
     /// ```rust
     /// let params = llama_cpp_2::context::params::LlamaContextParams::default();
     /// assert_eq!(params.n_ctx(), std::num::NonZeroU32::new(512));
-    pub fn n_ctx(&self) -> Option<NonZeroU32> {
+    #[must_use] pub fn n_ctx(&self) -> Option<NonZeroU32> {
         NonZeroU32::new(self.context_params.n_ctx)
     }
 
-    /// Set the n_batch
+    /// Set the `n_batch`
     ///
     /// # Examples
     ///
@@ -143,12 +143,12 @@ impl LlamaContextParams {
     /// .with_n_batch(2048);
     /// assert_eq!(params.n_batch(), 2048);
     /// ```
-    pub fn with_n_batch(mut self, n_batch: u32) -> Self {
+    #[must_use] pub fn with_n_batch(mut self, n_batch: u32) -> Self {
         self.context_params.n_batch = n_batch;
         self
     }
 
-    /// Get the n_batch
+    /// Get the `n_batch`
     ///
     /// # Examples
     ///
@@ -157,7 +157,7 @@ impl LlamaContextParams {
     /// let params = LlamaContextParams::default();
     /// assert_eq!(params.n_batch(), 512);
     /// ```
-    pub fn n_batch(&self) -> u32 {
+    #[must_use] pub fn n_batch(&self) -> u32 {
         self.context_params.n_batch
     }
 
@@ -171,8 +171,8 @@ impl LlamaContextParams {
     /// .with_rope_scaling_type(RopeScalingType::Linear);
     /// assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear);
     /// ```
-    pub fn with_rope_scaling_type(mut self, rope_scaling_type: RopeScalingType) -> Self {
-        self.context_params.rope_scaling_type = i8::from(rope_scaling_type);
+    #[must_use] pub fn with_rope_scaling_type(mut self, rope_scaling_type: RopeScalingType) -> Self {
+        self.context_params.rope_scaling_type = i32::from(rope_scaling_type);
         self
     }
 
@@ -184,7 +184,7 @@ impl LlamaContextParams {
     /// let params = llama_cpp_2::context::params::LlamaContextParams::default();
     /// assert_eq!(params.rope_scaling_type(), llama_cpp_2::context::params::RopeScalingType::Unspecified);
     /// ```
-    pub fn rope_scaling_type(&self) -> RopeScalingType {
+    #[must_use] pub fn rope_scaling_type(&self) -> RopeScalingType {
         RopeScalingType::from(self.context_params.rope_scaling_type)
     }
 
@@ -198,7 +198,7 @@ impl LlamaContextParams {
     /// .with_rope_freq_base(0.5);
     /// assert_eq!(params.rope_freq_base(), 0.5);
     /// ```
-    pub fn with_rope_freq_base(mut self, rope_freq_base: f32) -> Self {
+    #[must_use] pub fn with_rope_freq_base(mut self, rope_freq_base: f32) -> Self {
         self.context_params.rope_freq_base = rope_freq_base;
         self
     }
@@ -211,7 +211,7 @@ impl LlamaContextParams {
     /// let params = llama_cpp_2::context::params::LlamaContextParams::default();
     /// assert_eq!(params.rope_freq_base(), 0.0);
     /// ```
-    pub fn rope_freq_base(&self) -> f32 {
+    #[must_use] pub fn rope_freq_base(&self) -> f32 {
         self.context_params.rope_freq_base
     }
 
@@ -225,7 +225,7 @@ impl LlamaContextParams {
     /// .with_rope_freq_scale(0.5);
     /// assert_eq!(params.rope_freq_scale(), 0.5);
     /// ```
-    pub fn with_rope_freq_scale(mut self, rope_freq_scale: f32) -> Self {
+    #[must_use] pub fn with_rope_freq_scale(mut self, rope_freq_scale: f32) -> Self {
         self.context_params.rope_freq_scale = rope_freq_scale;
         self
     }
@@ -238,7 +238,7 @@ impl LlamaContextParams {
     /// let params = llama_cpp_2::context::params::LlamaContextParams::default();
     /// assert_eq!(params.rope_freq_scale(), 0.0);
     /// ```
-    pub fn rope_freq_scale(&self) -> f32 {
+    #[must_use] pub fn rope_freq_scale(&self) -> f32 {
         self.context_params.rope_freq_scale
     }
 
@@ -250,7 +250,7 @@ impl LlamaContextParams {
     /// let params = llama_cpp_2::context::params::LlamaContextParams::default();
     /// assert_eq!(params.n_threads(), 4);
     /// ```
-    pub fn n_threads(&self) -> u32 {
+    #[must_use] pub fn n_threads(&self) -> u32 {
         self.context_params.n_threads
     }
 
@@ -264,7 +264,7 @@ impl LlamaContextParams {
     /// .with_n_threads(8);
     /// assert_eq!(params.n_threads(), 8);
     /// ```
-    pub fn with_n_threads(mut self, n_threads: u32) -> Self {
+    #[must_use] pub fn with_n_threads(mut self, n_threads: u32) -> Self {
         self.context_params.n_threads = n_threads;
         self
     }
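
Every getter and builder method in this file now carries #[must_use], so the value returned by each with_* call has to be captured; chaining them is the intended pattern. A short sketch using the same crate paths and values as the doc examples above:

    use llama_cpp_2::context::params::{LlamaContextParams, RopeScalingType};
    use std::num::NonZeroU32;

    fn main() {
        // Each with_* method consumes `self` and returns the updated params,
        // so dropping the return value would silently discard the configuration.
        let params = LlamaContextParams::default()
            .with_seed(1234)
            .with_n_ctx(NonZeroU32::new(2048))
            .with_n_batch(2048)
            .with_n_threads(8)
            .with_rope_scaling_type(RopeScalingType::Linear);

        assert_eq!(params.seed(), 1234);
        assert_eq!(params.n_batch(), 2048);
        assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear);
    }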

llama-cpp-2/src/llama_batch.rs

Lines changed: 16 additions & 5 deletions
@@ -37,15 +37,21 @@ impl LlamaBatch {
     ///
     /// - [`self.llama_batch.n_tokens`] does not fit into a usize
     /// - [`seq_ids.len()`] does not fit into a [`llama_seq_id`]
+    ///
+    /// # Errors
+    ///
+    /// returns a error if there is insufficient space in the buffer
     pub fn add(
         &mut self,
         LlamaToken(id): LlamaToken,
         pos: llama_pos,
         seq_ids: &[i32],
         logits: bool,
     ) -> Result<(), BatchAddError> {
-        if self.allocated < usize::try_from(self.n_tokens() + 1).expect("cannot fit n_tokens into a usize") {
-            return Err(BatchAddError::InsufficientSpace(self.allocated))
+        if self.allocated
+            < usize::try_from(self.n_tokens() + 1).expect("cannot fit n_tokens into a usize")
+        {
+            return Err(BatchAddError::InsufficientSpace(self.allocated));
         }
         let offset = self.llama_batch.n_tokens;
         let offset_usize = usize::try_from(offset).expect("cannot fit n_tokens into a usize");
@@ -55,8 +61,10 @@ impl LlamaBatch {
             // batch.pos [batch.n_tokens] = pos,
             self.llama_batch.pos.add(offset_usize).write(pos);
             // batch.n_seq_id[batch.n_tokens] = seq_ids.size();
-            self.llama_batch.n_seq_id.add(offset_usize).write(llama_seq_id::try_from(seq_ids.len())
-                .expect("cannot fit seq_ids.len() into a llama_seq_id"));
+            self.llama_batch.n_seq_id.add(offset_usize).write(
+                llama_seq_id::try_from(seq_ids.len())
+                    .expect("cannot fit seq_ids.len() into a llama_seq_id"),
+            );
             // for (size_t i = 0; i < seq_ids.size(); ++i) {
             //     batch.seq_id[batch.n_tokens][i] = seq_ids[i];
             // }
@@ -65,7 +73,10 @@ impl LlamaBatch {
                 tmp.add(i).write(*seq_id);
             }
             // batch.logits [batch.n_tokens] = logits;
-            self.llama_batch.logits.add(offset_usize).write(i8::from(logits));
+            self.llama_batch
+                .logits
+                .add(offset_usize)
+                .write(i8::from(logits));
         }
 
         if logits {
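
For orientation, here is a sketch of how a caller typically drives add when laying out a prompt for decoding. The LlamaBatch::new(n_tokens, n_seq_max) constructor and the single-sequence layout follow the crate's simple example and are assumptions, not part of this diff:

    use llama_cpp_2::llama_batch::LlamaBatch;
    use llama_cpp_2::token::LlamaToken;

    fn fill_batch(prompt_tokens: &[LlamaToken]) -> LlamaBatch {
        // Room for 512 tokens in a single sequence (seq id 0); sizes are illustrative.
        let mut batch = LlamaBatch::new(512, 1);
        let last_index = prompt_tokens.len() as i32 - 1;
        for (i, token) in prompt_tokens.iter().enumerate() {
            // Request logits only for the last prompt token.
            let is_last = i as i32 == last_index;
            batch
                .add(*token, i as i32, &[0], is_last)
                .expect("batch is large enough for the prompt");
        }
        batch
    }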

llama-cpp-2/src/model.rs

Lines changed: 1 addition & 3 deletions
@@ -126,7 +126,7 @@ impl LlamaModel {
     ) -> Result<Vec<LlamaToken>, StringToTokenError> {
         let add_bos = match add_bos {
             AddBos::Always => true,
-            AddBos::Never => false
+            AddBos::Never => false,
         };
 
         let tokens_estimation = std::cmp::max(8, (str.len() / 2) + usize::from(add_bos));
@@ -136,8 +136,6 @@ impl LlamaModel {
         let buffer_capacity =
             c_int::try_from(buffer.capacity()).expect("buffer capacity should fit into a c_int");
 
-
-
         let size = unsafe {
            llama_cpp_sys_2::llama_tokenize(
                self.model.as_ptr(),
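
The trailing-comma and blank-line cleanups above sit inside str_to_token, the entry point callers use for tokenization. A small usage sketch (the prompt handling is illustrative and error handling is reduced to an expect):

    use llama_cpp_2::model::{AddBos, LlamaModel};

    // Tokenize a prompt with a leading BOS token and report how many tokens it produced.
    fn count_prompt_tokens(model: &LlamaModel, prompt: &str) -> usize {
        model
            .str_to_token(prompt, AddBos::Always)
            .expect("failed to tokenize prompt")
            .len()
    }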

llama-cpp-2/src/token.rs

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ pub mod data_array
 #[repr(transparent)]
 #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
 #[allow(clippy::module_name_repetitions)]
-pub struct LlamaToken( pub llama_cpp_sys_2::llama_token);
+pub struct LlamaToken(pub llama_cpp_sys_2::llama_token);
 
 impl Display for LlamaToken {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {

llama-cpp-sys-2/build.rs

Lines changed: 11 additions & 6 deletions
@@ -1,26 +1,32 @@
 use std::env;
-use std::path::PathBuf;
 use std::path::Path;
+use std::path::PathBuf;
 
 fn main() {
     println!("cargo:rerun-if-changed=llama.cpp");
 
     let cublas_enabled = env::var("CARGO_FEATURE_CUBLAS").is_ok();
 
     if !Path::new("llama.cpp/ggml.c").exists() {
-        panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.")
+        panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.")
     }
 
     let mut ggml = cc::Build::new();
-    let mut ggml_cuda = if cublas_enabled { Some(cc::Build::new()) } else { None };
+    let mut ggml_cuda = if cublas_enabled {
+        Some(cc::Build::new())
+    } else {
+        None
+    };
     let mut llama_cpp = cc::Build::new();
 
     ggml.cpp(false);
     llama_cpp.cpp(true);
 
     // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368
     if let Some(ggml_cuda) = &mut ggml_cuda {
-        for lib in ["cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt"] {
+        for lib in [
+            "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt",
+        ] {
             println!("cargo:rustc-link-lib={}", lib);
         }
 
@@ -66,8 +72,7 @@ fn main() {
         ggml.define("_GNU_SOURCE", None);
     }
 
-    ggml
-        .std("c17")
+    ggml.std("c17")
         .file("llama.cpp/ggml.c")
         .file("llama.cpp/ggml-alloc.c")
         .file("llama.cpp/ggml-backend.c")

llama-cpp-sys-2/llama.cpp (git submodule pointer updated; no text diff shown)
