133 changes: 133 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -11,6 +11,7 @@ members = [
# core library deps
thiserror = "1"
tracing = "0.1"
tracing-core = "0.1"

# examples and benchmarks
hf-hub = { version = "0.3.2" }
@@ -21,6 +22,7 @@ cc = "1.2.11"
anyhow = "1.0.95"
clap = "4.5.27"
encoding_rs = "0.8.35"
tracing-subscriber = { version = "0.3", features = ["json"] }

[workspace.lints.rust]
missing_docs = { level = "warn" }
1 change: 1 addition & 0 deletions examples/simple/Cargo.toml
@@ -11,6 +11,7 @@ hf-hub = { workspace = true }
clap = { workspace = true , features = ["derive"] }
anyhow = { workspace = true }
encoding_rs = { workspace = true }
tracing-subscriber = { workspace = true }

[features]
cuda = ["llama-cpp-2/cuda"]
14 changes: 13 additions & 1 deletion examples/simple/src/main.rs
@@ -10,7 +10,7 @@ use anyhow::{anyhow, bail, Context, Result};
use clap::Parser;
use hf_hub::api::sync::ApiBuilder;
use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::ggml_time_us;
use llama_cpp_2::{ggml_time_us, send_logs_to_tracing, LogOptions};
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
@@ -67,6 +67,12 @@ struct Args {
help = "size of the prompt context (default: loaded from themodel)"
)]
ctx_size: Option<NonZeroU32>,
#[arg(
short = 'v',
long,
help = "enable verbose llama.cpp logs",
)]
verbose: bool,
}

/// Parse a single key-value pair
@@ -132,8 +138,14 @@ fn main() -> Result<()> {
threads,
threads_batch,
ctx_size,
verbose,
} = Args::parse();

if verbose {
tracing_subscriber::fmt().init();
}
send_logs_to_tracing(LogOptions::default().with_logs_enabled(verbose));

// init LLM
let backend = LlamaBackend::init()?;

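Taken together, the example's changes wire things up in this order: install a tracing subscriber, then route llama.cpp's logs into it. Below is a minimal standalone sketch of that wiring; it is not the example's exact code, and the .json() formatter is an assumption based on the workspace enabling tracing-subscriber's "json" feature.

use llama_cpp_2::{send_logs_to_tracing, LogOptions};

fn main() {
    // Install a subscriber first so redirected llama.cpp logs have somewhere to go.
    // .json() requires the "json" feature on tracing-subscriber (assumed to be why
    // the workspace enables it).
    tracing_subscriber::fmt().json().init();

    // Route llama.cpp / ggml logs through the subscriber installed above.
    send_logs_to_tracing(LogOptions::default().with_logs_enabled(true));

    // ... backend init, model load, inference ...
}
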
1 change: 1 addition & 0 deletions llama-cpp-2/Cargo.toml
@@ -13,6 +13,7 @@ enumflags2 = "0.7.11"
llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", version = "0.1.69" }
thiserror = { workspace = true }
tracing = { workspace = true }
tracing-core = { workspace = true }

[dev-dependencies]
encoding_rs = { workspace = true }
74 changes: 74 additions & 0 deletions llama-cpp-2/src/lib.rs
@@ -25,6 +25,7 @@ use std::string::FromUtf8Error;
pub mod context;
pub mod llama_backend;
pub mod llama_batch;
mod log;
pub mod model;
pub mod sampling;
pub mod timing;
@@ -323,3 +324,76 @@ pub fn ggml_time_us() -> i64 {
pub fn llama_supports_mlock() -> bool {
unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
disabled: bool,
}

impl LogOptions {
/// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
/// logs to be sent to tracing.
pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
self.disabled = !enabled;
self
}
}
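
// Usage sketch (not part of this file): a caller that wants llama.cpp fully
// silent, e.g. in tests, can pass a disabled LogOptions:
//
//     send_logs_to_tracing(LogOptions::default().with_logs_enabled(false));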

extern "C" fn logs_to_trace(
level: llama_cpp_sys_2::ggml_log_level,
text: *const ::std::os::raw::c_char,
data: *mut ::std::os::raw::c_void,
) {
// In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state
// lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished
// by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are
// newline terminated and limiting the slow-path of locks and/or heap allocations for other cases.
use std::borrow::Borrow;

let log_state = unsafe { &*(data as *const log::State) };

if log_state.options.disabled {
return;
}

let text = unsafe { std::ffi::CStr::from_ptr(text) };
let text = text.to_string_lossy();
let text: &str = text.borrow();

// As best I can tell, llama.cpp / ggml expect every log format string at the call site to end with
// '\n'. If the newline is missing, either more text will follow via a CONT log or there is a typo
// at the call site. To tell a typo apart from intentional CONT usage, we buffer the message until
// the next one arrives and only then know how to flush it.

if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT {
log_state.cont_buffered_log(text);
} else if text.ends_with('\n') {
log_state.emit_non_cont_line(level, text);
} else {
log_state.buffer_non_cont(level, text);
}
}

/// Redirect llama.cpp logs into tracing.
pub fn send_logs_to_tracing(options: LogOptions) {
// TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times.

// We set up separate log states for llama.cpp and ggml so that CONT logs from the two sources
// can't interfere with each other. In other words, if llama.cpp emits a log without a trailing
// newline and then calls a GGML function, the logs won't be weirdly intermixed: llama.cpp logs
// will CONT previous llama.cpp logs and GGML logs will CONT previous GGML logs.
let llama_heap_state = Box::as_ref(
log::LLAMA_STATE
.get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))),
) as *const _;
let ggml_heap_state = Box::as_ref(
log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))),
) as *const _;

unsafe {
// GGML has to be set after llama since setting llama sets ggml as well.
llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _);
llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _);
}
}
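
The private log module that defines State, LLAMA_STATE, and GGML_STATE is not part of this diff. As a reading aid, here is a hypothetical sketch of the buffering scheme the CONT comment in logs_to_trace describes; the field names, the emit helper, and the level-to-tracing mapping are assumptions, not the PR's actual implementation.

use std::sync::Mutex;

// Hypothetical sketch of log::State: at most one buffered line plus its level,
// flushed when a newline completes it or when the next non-CONT message arrives.
struct State {
    buffered: Mutex<Option<(llama_cpp_sys_2::ggml_log_level, String)>>,
}

impl State {
    // CONT: append to the buffered line; emit it once it finally ends in '\n'.
    fn cont_buffered_log(&self, text: &str) {
        let mut buf = self.buffered.lock().unwrap();
        if let Some((level, line)) = buf.as_mut() {
            line.push_str(text);
            if line.ends_with('\n') {
                emit(*level, line);
                *buf = None;
            }
        }
    }

    // Newline-terminated, non-CONT: first flush anything buffered (its missing
    // '\n' turned out to be a typo), then emit this complete line directly.
    fn emit_non_cont_line(&self, level: llama_cpp_sys_2::ggml_log_level, text: &str) {
        if let Some((prev_level, prev_line)) = self.buffered.lock().unwrap().take() {
            emit(prev_level, &prev_line);
        }
        emit(level, text);
    }

    // Non-CONT with no trailing '\n': hold it until we see what follows.
    fn buffer_non_cont(&self, level: llama_cpp_sys_2::ggml_log_level, text: &str) {
        *self.buffered.lock().unwrap() = Some((level, text.to_string()));
    }
}

// Map ggml log levels onto tracing macros (an assumed mapping).
fn emit(level: llama_cpp_sys_2::ggml_log_level, text: &str) {
    let text = text.trim_end_matches('\n');
    match level {
        llama_cpp_sys_2::GGML_LOG_LEVEL_ERROR => tracing::error!("{text}"),
        llama_cpp_sys_2::GGML_LOG_LEVEL_WARN => tracing::warn!("{text}"),
        llama_cpp_sys_2::GGML_LOG_LEVEL_INFO => tracing::info!("{text}"),
        _ => tracing::debug!("{text}"),
    }
}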