Skip to content

Commit a27029d

Browse files
committed
Merge branch 'dep/update_llama_cpp_b6002' into dennis/feat/multi-modal
Signed-off-by: Dennis Keck <[email protected]>
2 parents b5cb56b + 56ae827 commit a27029d

File tree

16 files changed

+415
-98
lines changed

16 files changed

+415
-98
lines changed

.github/workflows/llama-cpp-rs-check.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,13 @@ jobs:
3232
- name: Clippy
3333
run: cargo clippy
3434
- name: Fmt
35-
run: cargo fmt
35+
run: cargo fmt --check
3636
- name: Test
3737
run: cargo test --features sampler
38+
- name: Dry-Run Publishing llama-cpp-sys-2 Crate
39+
run: RUST_BACKTRACE=1 cargo publish --package llama-cpp-sys-2 --verbose --dry-run
40+
- name: Dry-Run Publishing llama-cpp-2 Crate
41+
run: RUST_BACKTRACE=1 cargo publish --package llama-cpp-2 --verbose --dry-run
3842
arm64:
3943
name: Check that it builds on various targets
4044
runs-on: ubuntu-latest
@@ -49,7 +53,7 @@ jobs:
4953
with:
5054
platforms: arm64,amd64
5155
- name: Set up Docker Buildx
52-
uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2
56+
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435
5357
- name: Build
5458
uses: docker/build-push-action@v6
5559
with:

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
[submodule "llama-cpp-sys-2/llama.cpp"]
22
path = llama-cpp-sys-2/llama.cpp
3-
url = https://github.com/ggerganov/llama.cpp
3+
url = https://github.com/ggml-org/llama.cpp

Cargo.lock

Lines changed: 18 additions & 18 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@ hf-hub = { version = "0.3.2" }
2020
criterion = "0.5.1"
2121
pprof = "0.13.0"
2222
bindgen = "0.69.5"
23-
cc = "1.2.23"
23+
cc = "1.2.30"
2424
anyhow = "1.0.98"
25-
clap = "4.5.38"
25+
clap = "4.5.41"
2626
encoding_rs = "0.8.35"
2727
tracing-subscriber = { version = "0.3", features = ["json"] }
2828

examples/embeddings/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "embeddings"
3-
version = "0.1.107"
3+
version = "0.1.113"
44
edition = "2021"
55

66
[dependencies]

examples/simple/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "simple"
3-
version = "0.1.107"
3+
version = "0.1.113"
44
edition = "2021"
55

66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

examples/simple/src/main.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ fn main() -> Result<()> {
138138
} = Args::parse();
139139

140140
if verbose {
141-
tracing_subscriber::fmt().init();
141+
// tracing_subscriber::fmt().init();
142+
tracing_subscriber::fmt::init();
142143
}
143144
send_logs_to_tracing(LogOptions::default().with_logs_enabled(verbose));
144145

llama-cpp-2/Cargo.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
[package]
22
name = "llama-cpp-2"
33
description = "llama.cpp bindings for Rust"
4-
version = "0.1.107"
4+
version = "0.1.113"
55
edition = "2021"
66
license = "MIT OR Apache-2.0"
77
repository = "https://github.com/utilityai/llama-cpp-rs"
88

99
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
1010

1111
[dependencies]
12-
enumflags2 = "0.7.11"
13-
llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", version = "0.1.69" }
12+
enumflags2 = "0.7.12"
13+
llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", version = "0.1.113" }
1414
thiserror = { workspace = true }
1515
tracing = { workspace = true }
1616
tracing-core = { workspace = true }
@@ -33,7 +33,7 @@ android-shared-stdcxx = ["llama-cpp-sys-2/shared-stdcxx"]
3333

3434

3535
[target.'cfg(all(target_os = "macos", any(target_arch = "aarch64", target_arch = "arm64")))'.dependencies]
36-
llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", version = "0.1.69", features = [
36+
llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", version = "0.1.113", features = [
3737
"metal",
3838
] }
3939

llama-cpp-2/src/context/kv_cache.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -209,11 +209,4 @@ impl LlamaContext<'_> {
209209
pub fn kv_cache_update(&mut self) {
210210
unsafe { llama_cpp_sys_2::llama_kv_self_update(self.context.as_ptr()) }
211211
}
212-
213-
/// Returns the number of tokens in the KV cache (slow, use only for debug)
214-
/// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
215-
#[must_use]
216-
pub fn get_kv_cache_token_count(&self) -> i32 {
217-
unsafe { llama_cpp_sys_2::llama_kv_self_n_tokens(self.context.as_ptr()) }
218-
}
219212
}

llama-cpp-2/src/context/params.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,36 @@ impl LlamaContextParams {
514514
pub fn pooling_type(&self) -> LlamaPoolingType {
515515
LlamaPoolingType::from(self.context_params.pooling_type)
516516
}
517+
518+
/// Set whether to use full sliding window attention
519+
///
520+
/// # Examples
521+
///
522+
/// ```rust
523+
/// use llama_cpp_2::context::params::LlamaContextParams;
524+
/// let params = LlamaContextParams::default()
525+
/// .with_swa_full(false);
526+
/// assert_eq!(params.swa_full(), false);
527+
/// ```
528+
#[must_use]
529+
pub fn with_swa_full(mut self, enabled: bool) -> Self {
530+
self.context_params.swa_full = enabled;
531+
self
532+
}
533+
534+
/// Get whether full sliding window attention is enabled
535+
///
536+
/// # Examples
537+
///
538+
/// ```rust
539+
/// use llama_cpp_2::context::params::LlamaContextParams;
540+
/// let params = LlamaContextParams::default();
541+
/// assert_eq!(params.swa_full(), true);
542+
/// ```
543+
#[must_use]
544+
pub fn swa_full(&self) -> bool {
545+
self.context_params.swa_full
546+
}
517547
}
518548

519549
/// Default parameters for `LlamaContext`. (as defined in llama.cpp by `llama_context_default_params`)

0 commit comments

Comments (0)