Skip to content

Commit b888e98

Browse files
committed
update to b3750, implementing new llama_perf api
1 parent c530eda commit b888e98

File tree

4 files changed

+144
-1
lines changed

4 files changed

+144
-1
lines changed

llama-cpp-2/src/context.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use std::slice;
77

88
use crate::llama_batch::LlamaBatch;
99
use crate::model::{LlamaLoraAdapter, LlamaModel};
10+
use crate::timing::LlamaTimings;
1011
use crate::token::data::LlamaTokenData;
1112
use crate::token::LlamaToken;
1213
use crate::{
@@ -263,6 +264,17 @@ impl<'model> LlamaContext<'model> {
263264
unsafe { slice::from_raw_parts(data, len) }
264265
}
265266

267+
/// Reset the timings for the context.
///
/// Delegates to `llama_perf_context_reset`, zeroing the performance counters
/// that llama.cpp accumulates for this context.
pub fn reset_timings(&mut self) {
    // SAFETY: `self.context` is a valid, live llama context pointer for the
    // lifetime of `self` (owned by this wrapper).
    unsafe { llama_cpp_sys_2::llama_perf_context_reset(self.context.as_ptr()) }
}
271+
272+
/// Returns the timings for the context.
273+
pub fn timings(&mut self) -> LlamaTimings {
274+
let timings = unsafe { llama_cpp_sys_2::llama_perf_context(self.context.as_ptr()) };
275+
LlamaTimings { timings }
276+
}
277+
266278
/// Sets a lora adapter.
267279
///
268280
/// # Errors

llama-cpp-2/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ pub mod context;
2626
pub mod llama_backend;
2727
pub mod llama_batch;
2828
pub mod model;
29+
pub mod timing;
2930
pub mod token;
3031
pub mod token_type;
3132

llama-cpp-2/src/timing.rs

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
//! Safe wrapper around `llama_timings`.
use std::fmt::{Debug, Display, Formatter};

/// A wrapper around `llama_timings`.
///
/// Owns a raw `llama_perf_context_data` value as produced by llama.cpp's
/// `llama_perf` API; the methods on this type expose its individual fields.
#[derive(Clone, Copy, Debug)]
pub struct LlamaTimings {
    // Raw FFI data. `pub(crate)` so sibling modules (e.g. the context wrapper)
    // can construct `LlamaTimings` directly from the result of an FFI call.
    pub(crate) timings: llama_cpp_sys_2::llama_perf_context_data,
}
9+
10+
impl LlamaTimings {
11+
/// Create a new `LlamaTimings`.
12+
/// ```
13+
/// # use llama_cpp_2::timing::LlamaTimings;
14+
/// let timings = LlamaTimings::new(1.0, 2.0, 3.0, 4.0, 5, 6);
15+
/// let timings_str = "load time = 2.00 ms
16+
/// prompt eval time = 3.00 ms / 5 tokens (0.60 ms per token, 1666.67 tokens per second)
17+
/// eval time = 4.00 ms / 6 runs (0.67 ms per token, 1500.00 tokens per second)\n";
18+
/// assert_eq!(timings_str, format!("{}", timings));
19+
/// ```
20+
#[allow(clippy::too_many_arguments)]
21+
#[must_use]
22+
pub fn new(
23+
t_start_ms: f64,
24+
t_load_ms: f64,
25+
t_p_eval_ms: f64,
26+
t_eval_ms: f64,
27+
n_p_eval: i32,
28+
n_eval: i32,
29+
) -> Self {
30+
Self {
31+
timings: llama_cpp_sys_2::llama_perf_context_data {
32+
t_start_ms,
33+
t_load_ms,
34+
t_p_eval_ms,
35+
t_eval_ms,
36+
n_p_eval,
37+
n_eval,
38+
},
39+
}
40+
}
41+
42+
/// Get the start time in milliseconds.
43+
#[must_use]
44+
pub fn t_start_ms(&self) -> f64 {
45+
self.timings.t_start_ms
46+
}
47+
48+
/// Get the load time in milliseconds.
49+
#[must_use]
50+
pub fn t_load_ms(&self) -> f64 {
51+
self.timings.t_load_ms
52+
}
53+
54+
/// Get the prompt evaluation time in milliseconds.
55+
#[must_use]
56+
pub fn t_p_eval_ms(&self) -> f64 {
57+
self.timings.t_p_eval_ms
58+
}
59+
60+
/// Get the evaluation time in milliseconds.
61+
#[must_use]
62+
pub fn t_eval_ms(&self) -> f64 {
63+
self.timings.t_eval_ms
64+
}
65+
66+
/// Get the number of prompt evaluations.
67+
#[must_use]
68+
pub fn n_p_eval(&self) -> i32 {
69+
self.timings.n_p_eval
70+
}
71+
72+
/// Get the number of evaluations.
73+
#[must_use]
74+
pub fn n_eval(&self) -> i32 {
75+
self.timings.n_eval
76+
}
77+
78+
/// Set the start time in milliseconds.
79+
pub fn set_t_start_ms(&mut self, t_start_ms: f64) {
80+
self.timings.t_start_ms = t_start_ms;
81+
}
82+
83+
/// Set the load time in milliseconds.
84+
pub fn set_t_load_ms(&mut self, t_load_ms: f64) {
85+
self.timings.t_load_ms = t_load_ms;
86+
}
87+
88+
/// Set the prompt evaluation time in milliseconds.
89+
pub fn set_t_p_eval_ms(&mut self, t_p_eval_ms: f64) {
90+
self.timings.t_p_eval_ms = t_p_eval_ms;
91+
}
92+
93+
/// Set the evaluation time in milliseconds.
94+
pub fn set_t_eval_ms(&mut self, t_eval_ms: f64) {
95+
self.timings.t_eval_ms = t_eval_ms;
96+
}
97+
98+
/// Set the number of prompt evaluations.
99+
pub fn set_n_p_eval(&mut self, n_p_eval: i32) {
100+
self.timings.n_p_eval = n_p_eval;
101+
}
102+
103+
/// Set the number of evaluations.
104+
pub fn set_n_eval(&mut self, n_eval: i32) {
105+
self.timings.n_eval = n_eval;
106+
}
107+
}
108+
109+
impl Display for LlamaTimings {
110+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
111+
writeln!(f, "load time = {:.2} ms", self.t_load_ms())?;
112+
writeln!(
113+
f,
114+
"prompt eval time = {:.2} ms / {} tokens ({:.2} ms per token, {:.2} tokens per second)",
115+
self.t_p_eval_ms(),
116+
self.n_p_eval(),
117+
self.t_p_eval_ms() / f64::from(self.n_p_eval()),
118+
1e3 / self.t_p_eval_ms() * f64::from(self.n_p_eval())
119+
)?;
120+
writeln!(
121+
f,
122+
"eval time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)",
123+
self.t_eval_ms(),
124+
self.n_eval(),
125+
self.t_eval_ms() / f64::from(self.n_eval()),
126+
1e3 / self.t_eval_ms() * f64::from(self.n_eval())
127+
)?;
128+
Ok(())
129+
}
130+
}

llama-cpp-sys-2/llama.cpp

0 commit comments

Comments
 (0)