
Commit 32b53ed

Update for the (hopefully stable!) llama.cpp changes.
1 parent c0faaef commit 32b53ed

File tree

embeddings/src/main.rs
llama-cpp-2/src/context.rs
llama-cpp-2/src/context/params.rs
llama-cpp-2/src/lib.rs
llama-cpp-2/src/llama_batch.rs
llama-cpp-2/src/model.rs

6 files changed (+66 lines, -23 lines)

embeddings/src/main.rs

Lines changed: 4 additions & 3 deletions
@@ -109,7 +109,7 @@ fn main() -> Result<()> {
     // initialize the context
     let ctx_params = LlamaContextParams::default()
         .with_n_threads_batch(std::thread::available_parallelism()?.get() as u32)
-        .with_embedding(true);
+        .with_embeddings(true);
 
     let mut ctx = model
         .new_context(&backend, ctx_params)
@@ -193,10 +193,9 @@ fn main() -> Result<()> {
 fn batch_decode(ctx: &mut LlamaContext, batch: &mut LlamaBatch, s_batch: i32, output: &mut Vec<Vec<f32>>, normalise: bool) -> Result<()> {
     ctx.clear_kv_cache();
     ctx.decode(batch).with_context(|| "llama_decode() failed")?;
-    batch.clear();
 
     for i in 0..s_batch {
-        let embedding = ctx.embeddings_ith(i).with_context(|| "Failed to get embeddings")?;
+        let embedding = ctx.embeddings_seq_ith(i).with_context(|| "Failed to get embeddings")?;
         let output_embeddings = if normalise {
             normalize(embedding)
         } else {
@@ -206,6 +205,8 @@ fn batch_decode(ctx: &mut LlamaContext, batch: &mut LlamaBatch, s_batch: i32, ou
         output.push(output_embeddings);
     }
 
+    batch.clear();
+
     Ok(())
 }
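
A sketch of how a caller might drive the updated `batch_decode` (the `prompts` values are hypothetical; `str_to_token` and `add_sequence` are the crate's existing helpers, used here as elsewhere in this example binary):

```rust
use llama_cpp_2::model::AddBos;

// Assumes `model`, `ctx`, and `batch` were set up as in `main` above.
let prompts = ["first sentence", "second sentence"];
let mut output: Vec<Vec<f32>> = Vec::new();

for (seq_id, prompt) in prompts.iter().enumerate() {
    // One sequence per prompt; `logits_all` stays false because only the
    // pooled sequence embeddings are read back after decoding.
    let tokens = model.str_to_token(prompt, AddBos::Always)?;
    batch.add_sequence(&tokens, seq_id as i32, false)?;
}

// Decode once, read one pooled embedding per sequence via
// `embeddings_seq_ith`, and only then clear the batch.
batch_decode(&mut ctx, &mut batch, prompts.len() as i32, &mut output, true)?;
```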

llama-cpp-2/src/context.rs

Lines changed: 46 additions & 7 deletions
@@ -5,12 +5,12 @@ use std::num::NonZeroI32;
 use std::ptr::NonNull;
 use std::slice;
 
-use crate::{DecodeError, EmbeddingsError};
 use crate::llama_batch::LlamaBatch;
 use crate::model::LlamaModel;
 use crate::timing::LlamaTimings;
 use crate::token::data::LlamaTokenData;
 use crate::token::LlamaToken;
+use crate::{DecodeError, EmbeddingsError};
 
 pub mod kv_cache;
 pub mod params;
@@ -92,17 +92,51 @@ impl<'model> LlamaContext<'model> {
     ///
     /// # Errors
     ///
-    /// When the current context was constructed without enabling embeddings.
+    /// - When the current context was constructed without enabling embeddings.
+    /// - If the current model had a pooling type of [`llama_cpp_sys_2::LLAMA_POOLING_TYPE_NONE`]
+    /// - If the given sequence index exceeds the max sequence id.
+    pub fn embeddings_seq_ith(&self, i: i32) -> Result<&[f32], EmbeddingsError> {
+        if !self.embeddings_enabled {
+            return Err(EmbeddingsError::NotEnabled);
+        }
+
+        unsafe {
+            let embedding = llama_cpp_sys_2::llama_get_embeddings_seq(self.context.as_ptr(), i);
+
+            // Technically also possible whenever `i >= max(batch.n_seq)`, but can't check that here.
+            if embedding.is_null() {
+                Err(EmbeddingsError::NonePoolType)
+            } else {
+                Ok(std::slice::from_raw_parts(embedding, self.model.n_embd() as usize))
+            }
+        }
+    }
+
+    /// Get the embeddings for the `i`th token in the current context.
+    ///
+    /// # Returns
+    ///
+    /// A slice containing the embeddings for the last decoded batch of the given token.
+    /// The size corresponds to the `n_embd` parameter of the context's model.
+    ///
+    /// # Errors
+    ///
+    /// - When the current context was constructed without enabling embeddings.
+    /// - When the given token didn't have logits enabled when it was passed.
+    /// - If the given token index exceeds the max token id.
     pub fn embeddings_ith(&self, i: i32) -> Result<&[f32], EmbeddingsError> {
         if !self.embeddings_enabled {
-            return Err(EmbeddingsError::NotEnabled)
+            return Err(EmbeddingsError::NotEnabled);
         }
 
         unsafe {
-            Ok(std::slice::from_raw_parts(
-                llama_cpp_sys_2::llama_get_embeddings_ith(self.context.as_ptr(), i),
-                self.model.n_embd() as usize,
-            ))
+            let embedding = llama_cpp_sys_2::llama_get_embeddings_ith(self.context.as_ptr(), i);
+            // Technically also possible whenever `i >= batch.n_tokens`, but no good way of checking `n_tokens` here.
+            if embedding.is_null() {
+                Err(EmbeddingsError::LogitsNotEnabled)
+            } else {
+                Ok(std::slice::from_raw_parts(embedding, self.model.n_embd() as usize))
+            }
         }
     }
 
@@ -155,6 +189,11 @@ impl<'model> LlamaContext<'model> {
         let timings = unsafe { llama_cpp_sys_2::llama_get_timings(self.context.as_ptr()) };
         LlamaTimings { timings }
     }
+
+    /// Returns a reference to the raw [llama_cpp_sys_2::llama_context] pointer.
+    pub fn raw_ctx(&self) -> &NonNull<llama_cpp_sys_2::llama_context> {
+        &self.context
+    }
 }
 
 impl Drop for LlamaContext<'_> {
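
The practical difference between the two getters: `embeddings_seq_ith` returns the pooled embedding for a whole sequence and fails with `NonePoolType` when the model only supports `LLAMA_POOLING_TYPE_NONE`, while `embeddings_ith` returns a single token's embedding and fails with `LogitsNotEnabled` if that token's logits weren't requested. A minimal sketch of handling both, assuming `ctx` has just decoded a batch and that `model` and a hypothetical `last_token_index` binding are in scope:

```rust
use llama_cpp_2::EmbeddingsError;

// Try the pooled sequence embedding first.
let embedding: Vec<f32> = match ctx.embeddings_seq_ith(0) {
    Ok(e) => e.to_vec(),
    // No pooling on this model: fall back to the last token's embedding,
    // which always has logits enabled after the llama_batch.rs change below.
    Err(EmbeddingsError::NonePoolType) => ctx.embeddings_ith(last_token_index)?.to_vec(),
    Err(e) => return Err(e.into()),
};
assert_eq!(embedding.len(), model.n_embd() as usize);
```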

llama-cpp-2/src/context/params.rs

Lines changed: 7 additions & 7 deletions
@@ -319,11 +319,11 @@ impl LlamaContextParams {
     ///
     /// ```rust
     /// let params = llama_cpp_2::context::params::LlamaContextParams::default();
-    /// assert!(!params.embedding());
+    /// assert!(!params.embeddings());
     /// ```
     #[must_use]
-    pub fn embedding(&self) -> bool {
-        self.context_params.embedding
+    pub fn embeddings(&self) -> bool {
+        self.context_params.embeddings
     }
 
     /// Enable the use of embeddings
@@ -333,12 +333,12 @@
     /// ```rust
     /// use llama_cpp_2::context::params::LlamaContextParams;
     /// let params = LlamaContextParams::default()
-    ///     .with_embedding(true);
-    /// assert!(params.embedding());
+    ///     .with_embeddings(true);
+    /// assert!(params.embeddings());
     /// ```
     #[must_use]
-    pub fn with_embedding(mut self, embedding: bool) -> Self {
-        self.context_params.embedding = embedding;
+    pub fn with_embeddings(mut self, embedding: bool) -> Self {
+        self.context_params.embeddings = embedding;
         self
     }
 }

llama-cpp-2/src/lib.rs

Lines changed: 4 additions & 0 deletions
@@ -83,6 +83,10 @@ pub enum DecodeError {
 pub enum EmbeddingsError {
     #[error("Embeddings weren't enabled in the context options")]
     NotEnabled,
+    #[error("Logits were not enabled for the given token")]
+    LogitsNotEnabled,
+    #[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
+    NonePoolType,
 }
 
 /// Decode a error from llama.cpp into a [`DecodeError`].
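
The two new variants map one-to-one onto the null-return cases added in `context.rs` above. A sketch of exhaustive handling, e.g. for a diagnostic message (the `embeddings_hint` helper is hypothetical):

```rust
use llama_cpp_2::EmbeddingsError;

// Hypothetical helper translating each failure into a suggested fix.
fn embeddings_hint(err: &EmbeddingsError) -> &'static str {
    match err {
        EmbeddingsError::NotEnabled => "build the context with .with_embeddings(true)",
        EmbeddingsError::LogitsNotEnabled => "enable logits for this token when filling the batch",
        EmbeddingsError::NonePoolType => "use embeddings_ith(); this model has no pooled sequence embeddings",
    }
}
```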

llama-cpp-2/src/llama_batch.rs

Lines changed: 4 additions & 5 deletions
@@ -121,14 +121,13 @@ impl LlamaBatch {
             let seq_id_ptr = *self.llama_batch.seq_id.add(j);
             seq_id_ptr.write(seq_id);
             self.llama_batch.n_seq_id.add(j).write(1);
-            self.llama_batch.logits.add(j).write(logits_all as i8)
+
+            let write_logits = logits_all || i == n_tokens - 1;
+            self.llama_batch.logits.add(j).write(write_logits as i8)
         }
     }
 
-    unsafe {
-        self.llama_batch.logits.add(n_tokens - 1).write(true as i8);
-        self.initialized_logits.push(self.llama_batch.n_tokens - 1);
-    }
+    self.initialized_logits.push(self.llama_batch.n_tokens - 1);
 
     Ok(())
 }
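
The logits flag is now computed inside the single write loop (`logits_all || i == n_tokens - 1`) instead of being patched in by a separate `unsafe` block after the loop, so the last token of a sequence still always gets logits. For callers this means (sketch; `tokens` with more than one entry, `ctx`, and `batch` assumed set up as usual):

```rust
// With `logits_all == false`, only the sequence's final token gets logits
// (and therefore a per-token embedding) after decoding.
batch.add_sequence(&tokens, 0, false)?;
ctx.decode(&mut batch)?;

// The last token always has logits enabled, so this succeeds...
let last = tokens.len() as i32 - 1;
let _embedding = ctx.embeddings_ith(last)?;

// ...while earlier tokens hit the new null check in `embeddings_ith`
// and surface as `EmbeddingsError::LogitsNotEnabled`.
assert!(ctx.embeddings_ith(0).is_err());
```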

llama-cpp-2/src/model.rs

Lines changed: 1 addition & 1 deletion
@@ -320,7 +320,7 @@ impl LlamaModel {
         };
         let context = NonNull::new(context).ok_or(LlamaContextLoadError::NullReturn)?;
 
-        Ok(LlamaContext::new(self, context, params.embedding()))
+        Ok(LlamaContext::new(self, context, params.embeddings()))
     }
 }
