133 changes: 133 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -11,6 +11,7 @@ members = [
# core library deps
thiserror = "1"
tracing = "0.1"
tracing-core = "0.1"

# examples and benchmarks
hf-hub = { version = "0.3.2" }
@@ -21,6 +22,7 @@ cc = "1.2.11"
anyhow = "1.0.95"
clap = "4.5.27"
encoding_rs = "0.8.35"
tracing-subscriber = { version = "0.3", features = ["json"] }

[workspace.lints.rust]
missing_docs = { level = "warn" }
1 change: 1 addition & 0 deletions examples/simple/Cargo.toml
@@ -11,6 +11,7 @@ hf-hub = { workspace = true }
clap = { workspace = true , features = ["derive"] }
anyhow = { workspace = true }
encoding_rs = { workspace = true }
tracing-subscriber = { workspace = true }

[features]
cuda = ["llama-cpp-2/cuda"]
14 changes: 13 additions & 1 deletion examples/simple/src/main.rs
@@ -10,7 +10,7 @@ use anyhow::{anyhow, bail, Context, Result};
use clap::Parser;
use hf_hub::api::sync::ApiBuilder;
use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::ggml_time_us;
use llama_cpp_2::{ggml_time_us, send_logs_to_tracing, LogOptions};
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
@@ -67,6 +67,12 @@ struct Args {
help = "size of the prompt context (default: loaded from themodel)"
)]
ctx_size: Option<NonZeroU32>,
#[arg(
short = 'v',
long,
help = "enable verbose llama.cpp logs",
)]
verbose: bool,
}

/// Parse a single key-value pair
@@ -132,8 +138,14 @@ fn main() -> Result<()> {
threads,
threads_batch,
ctx_size,
verbose,
} = Args::parse();

if verbose {
tracing_subscriber::fmt().init();
}
send_logs_to_tracing(LogOptions::default().with_logs_enabled(verbose));

// init LLM
let backend = LlamaBackend::init()?;

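Taken together, the example's changes wire things up in this order: install a tracing subscriber, then route llama.cpp's logs into it. Below is a minimal standalone sketch of that wiring; it is not the example's exact code, and the .json() formatter is an assumption based on the workspace enabling tracing-subscriber's "json" feature.

use llama_cpp_2::{send_logs_to_tracing, LogOptions};

fn main() {
    // Install a subscriber first so redirected llama.cpp logs have somewhere to go.
    // .json() requires the "json" feature on tracing-subscriber (assumed to be why
    // the workspace enables it).
    tracing_subscriber::fmt().json().init();

    // Route llama.cpp / ggml logs through the subscriber installed above.
    send_logs_to_tracing(LogOptions::default().with_logs_enabled(true));

    // ... backend init, model load, inference ...
}
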
1 change: 1 addition & 0 deletions llama-cpp-2/Cargo.toml
@@ -13,6 +13,7 @@ enumflags2 = "0.7.11"
llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", version = "0.1.69" }
thiserror = { workspace = true }
tracing = { workspace = true }
tracing-core = { workspace = true }

[dev-dependencies]
encoding_rs = { workspace = true }
74 changes: 74 additions & 0 deletions llama-cpp-2/src/lib.rs
@@ -25,6 +25,7 @@ use std::string::FromUtf8Error;
pub mod context;
pub mod llama_backend;
pub mod llama_batch;
mod log;
pub mod model;
pub mod sampling;
pub mod timing;
@@ -323,3 +324,76 @@ pub fn ggml_time_us() -> i64 {
pub fn llama_supports_mlock() -> bool {
unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
disabled: bool,
}

impl LogOptions {
/// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
/// logs to be sent to tracing.
pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
self.disabled = !enabled;
self
}
}
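
// Usage sketch (not part of this file): a caller that wants llama.cpp fully
// silent, e.g. in tests, can pass a disabled LogOptions:
//
//     send_logs_to_tracing(LogOptions::default().with_logs_enabled(false));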

extern "C" fn logs_to_trace(
level: llama_cpp_sys_2::ggml_log_level,
text: *const ::std::os::raw::c_char,
data: *mut ::std::os::raw::c_void,
) {
// In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
// lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
// by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
// newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
use std::borrow::Borrow;

let log_state = unsafe { &*(data as *const log::State) };

if log_state.options.disabled {
return;
}

let text = unsafe { std::ffi::CStr::from_ptr(text) };
let text = text.to_string_lossy();
let text: &str = text.borrow();

// As best I can tell, llama.cpp / ggml expect every log format string at the call site to end with
// '\n'. If the newline is missing, either more text will follow via a CONT log or there is a typo
// at the call site. To tell a typo apart from intentional CONT usage, we buffer the message until
// the next one arrives and only then know how to flush it.

if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
log_state.cont_buffered_log(text);
} else if text.ends_with('\n') {
log_state.emit_non_cont_line(level, text);
} else {
log_state.buffer_non_cont(level, text);
}
}

/// Redirect llama.cpp logs into tracing.
pub fn send_logs_to_tracing(options: LogOptions) {
// TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

// We set up separate log states for llama.cpp and ggml so that CONT logs from the two sources
// can't interfere with each other. In other words, if llama.cpp emits a log without a trailing
// newline and then calls a GGML function, the logs won't be weirdly intermixed: llama.cpp logs
// will CONT previous llama.cpp logs and GGML logs will CONT previous GGML logs.
let llama_heap_state = Box::as_ref(
log::LLAMA_STATE
.get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
) as *const _;
let ggml_heap_state = Box::as_ref(
log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
) as *const _;

unsafe {
// GGML has to be set after llama since setting llama sets ggml as well.
llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
}
}
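
The private log module that defines State, LLAMA_STATE, and GGML_STATE is not part of this diff. As a reading aid, here is a hypothetical sketch of the buffering scheme the CONT comment in logs_to_trace describes; the field names, the emit helper, and the level-to-tracing mapping are assumptions, not the PR's actual implementation.

use std::sync::Mutex;

// Hypothetical sketch of log::State: at most one buffered line plus its level,
// flushed when a newline completes it or when the next non-CONT message arrives.
struct State {
    buffered: Mutex<Option<(llama_cpp_sys_2::ggml_log_level, String)>>,
}

impl State {
    // CONT: append to the buffered line; emit it once it finally ends in '\n'.
    fn cont_buffered_log(&self, text: &str) {
        let mut buf = self.buffered.lock().unwrap();
        if let Some((level, line)) = buf.as_mut() {
            line.push_str(text);
            if line.ends_with('\n') {
                emit(*level, line);
                *buf = None;
            }
        }
    }

    // Newline-terminated, non-CONT: first flush anything buffered (its missing
    // '\n' turned out to be a typo), then emit this complete line directly.
    fn emit_non_cont_line(&self, level: llama_cpp_sys_2::ggml_log_level, text: &str) {
        if let Some((prev_level, prev_line)) = self.buffered.lock().unwrap().take() {
            emit(prev_level, &prev_line);
        }
        emit(level, text);
    }

    // Non-CONT with no trailing '\n': hold it until we see what follows.
    fn buffer_non_cont(&self, level: llama_cpp_sys_2::ggml_log_level, text: &str) {
        *self.buffered.lock().unwrap() = Some((level, text.to_string()));
    }
}

// Map ggml log levels onto tracing macros (an assumed mapping).
fn emit(level: llama_cpp_sys_2::ggml_log_level, text: &str) {
    let text = text.trim_end_matches('\n');
    match level {
        llama_cpp_sys_2::GGML_LOG_LEVEL_ERROR => tracing::error!("{text}"),
        llama_cpp_sys_2::GGML_LOG_LEVEL_WARN => tracing::warn!("{text}"),
        llama_cpp_sys_2::GGML_LOG_LEVEL_INFO => tracing::info!("{text}"),
        _ => tracing::debug!("{text}"),
    }
}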