
Commit 542a410

Add functionality for creating embeddings
1 parent e74a19e commit 542a410

6 files changed, +174 -22 lines changed

llama-cpp-2/src/context.rs

Lines changed: 29 additions & 3 deletions
@@ -2,15 +2,15 @@
 
 use std::fmt::{Debug, Formatter};
 use std::num::NonZeroI32;
+use std::ptr::NonNull;
+use std::slice;
 
+use crate::{DecodeError, EmbeddingsError};
 use crate::llama_batch::LlamaBatch;
 use crate::model::LlamaModel;
 use crate::timing::LlamaTimings;
 use crate::token::data::LlamaTokenData;
 use crate::token::LlamaToken;
-use crate::DecodeError;
-use std::ptr::NonNull;
-use std::slice;
 
 pub mod kv_cache;
 pub mod params;
@@ -24,6 +24,7 @@ pub struct LlamaContext<'a> {
     /// a reference to the contexts model.
     pub model: &'a LlamaModel,
     initialized_logits: Vec<i32>,
+    embeddings_enabled: bool,
 }
 
 impl Debug for LlamaContext<'_> {
@@ -38,11 +39,13 @@ impl<'model> LlamaContext<'model> {
     pub(crate) fn new(
         llama_model: &'model LlamaModel,
         llama_context: NonNull<llama_cpp_sys_2::llama_context>,
+        embeddings_enabled: bool,
     ) -> Self {
         Self {
             context: llama_context,
             model: llama_model,
             initialized_logits: Vec::new(),
+            embeddings_enabled,
         }
     }
 
@@ -80,6 +83,29 @@ impl<'model> LlamaContext<'model> {
         }
     }
 
+    /// Get the embeddings for the `i`th sequence in the current context.
+    ///
+    /// # Returns
+    ///
+    /// A slice containing the embeddings for the last decoded batch.
+    /// The size corresponds to the `n_embd` parameter of the context's model.
+    ///
+    /// # Errors
+    ///
+    /// When the current context was constructed without enabling embeddings.
+    pub fn embeddings_ith(&self, i: i32) -> Result<&[f32], EmbeddingsError> {
+        if !self.embeddings_enabled {
+            return Err(EmbeddingsError::NotEnabled)
+        }
+
+        unsafe {
+            Ok(std::slice::from_raw_parts(
+                llama_cpp_sys_2::llama_get_embeddings_ith(self.context.as_ptr(), i),
+                self.model.n_embd() as usize,
+            ))
+        }
+    }
+
     /// Get the logits for the ith token in the context.
     ///
     /// # Panics
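
Example (not part of the commit): a minimal sketch of reading embeddings through the new accessor, assuming `ctx` is a `LlamaContext` created from params with `with_embedding(true)` and that a batch has already been decoded. Only `embeddings_ith` and `EmbeddingsError` come from this diff.

    fn print_first_embedding(ctx: &llama_cpp_2::context::LlamaContext<'_>) {
        match ctx.embeddings_ith(0) {
            // The returned slice has `n_embd` elements for the requested token.
            Ok(embedding) => println!("first component: {}", embedding[0]),
            // Returned when the context was built without `with_embedding(true)`.
            Err(llama_cpp_2::EmbeddingsError::NotEnabled) => eprintln!("embeddings not enabled"),
        }
    }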

llama-cpp-2/src/context/params.rs

Lines changed: 60 additions & 1 deletion
@@ -1,8 +1,9 @@
 //! A safe wrapper around `llama_context_params`.
-use llama_cpp_sys_2;
 use std::fmt::Debug;
 use std::num::NonZeroU32;
 
+use llama_cpp_sys_2;
+
 /// A rusty wrapper around `rope_scaling_type`.
 #[repr(i8)]
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -267,6 +268,19 @@ impl LlamaContextParams {
         self.context_params.n_threads
     }
 
+    /// Get the number of threads allocated for batches.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// let params = llama_cpp_2::context::params::LlamaContextParams::default();
+    /// assert_eq!(params.n_threads_batch(), 4);
+    /// ```
+    #[must_use]
+    pub fn n_threads_batch(&self) -> u32 {
+        self.context_params.n_threads_batch
+    }
+
     /// Set the number of threads.
     ///
     /// # Examples
@@ -282,6 +296,51 @@ impl LlamaContextParams {
         self.context_params.n_threads = n_threads;
         self
     }
+
+    /// Set the number of threads allocated for batches.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use llama_cpp_2::context::params::LlamaContextParams;
+    /// let params = LlamaContextParams::default()
+    ///     .with_n_threads_batch(8);
+    /// assert_eq!(params.n_threads_batch(), 8);
+    /// ```
+    #[must_use]
+    pub fn with_n_threads_batch(mut self, n_threads: u32) -> Self {
+        self.context_params.n_threads_batch = n_threads;
+        self
+    }
+
+    /// Check whether embeddings are enabled
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// let params = llama_cpp_2::context::params::LlamaContextParams::default();
+    /// assert!(!params.embedding());
+    /// ```
+    #[must_use]
+    pub fn embedding(&self) -> bool {
+        self.context_params.embedding
+    }
+
+    /// Enable the use of embeddings
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use llama_cpp_2::context::params::LlamaContextParams;
+    /// let params = LlamaContextParams::default()
+    ///     .with_embedding(true);
+    /// assert!(params.embedding());
+    /// ```
+    #[must_use]
+    pub fn with_embedding(mut self, embedding: bool) -> Self {
+        self.context_params.embedding = embedding;
+        self
+    }
 }
 
 /// Default parameters for `LlamaContext`. (as defined in llama.cpp by `llama_context_default_params`)
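
Example (not part of the commit): the two new builder methods compose with the existing `default()` constructor; the values below are arbitrary.

    use llama_cpp_2::context::params::LlamaContextParams;

    fn main() {
        // Enable embeddings and give batch decoding its own thread count;
        // both setters are added in this commit.
        let params = LlamaContextParams::default()
            .with_n_threads_batch(8)
            .with_embedding(true);

        assert!(params.embedding());
        assert_eq!(params.n_threads_batch(), 8);
    }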

llama-cpp-2/src/lib.rs

Lines changed: 9 additions & 0 deletions
@@ -52,6 +52,8 @@ pub enum LLamaCppError {
     /// There was an error adding a token to a batch.
     #[error["{0}"]]
     BatchAddError(#[from] BatchAddError),
+    #[error(transparent)]
+    EmbeddingError(#[from] EmbeddingsError),
 }
 
 /// Failed to Load context
@@ -76,6 +78,13 @@ pub enum DecodeError {
     Unknown(c_int),
 }
 
+/// When embedding related functions fail
+#[derive(Debug, Eq, PartialEq, thiserror::Error)]
+pub enum EmbeddingsError {
+    #[error("Embeddings weren't enabled in the context options")]
+    NotEnabled,
+}
+
 /// Decode a error from llama.cpp into a [`DecodeError`].
 impl From<NonZeroI32> for DecodeError {
     fn from(value: NonZeroI32) -> Self {
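
Example (not part of the commit): because the new variant uses `#[from]`, an `EmbeddingsError` converts into the crate-wide `LLamaCppError` and can be bubbled up with `?`; a sketch, assuming `ctx` is an embeddings-enabled `LlamaContext`.

    use llama_cpp_2::context::LlamaContext;
    use llama_cpp_2::LLamaCppError;

    // `?` converts EmbeddingsError into LLamaCppError via the derived From impl.
    fn first_component(ctx: &LlamaContext<'_>) -> Result<f32, LLamaCppError> {
        let embedding = ctx.embeddings_ith(0)?;
        Ok(embedding[0])
    }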

llama-cpp-2/src/llama_backend.rs

Lines changed: 14 additions & 0 deletions
@@ -3,6 +3,7 @@
 use crate::LLamaCppError;
 use std::sync::atomic::AtomicBool;
 use std::sync::atomic::Ordering::SeqCst;
+use llama_cpp_sys_2::ggml_log_level;
 
 /// Representation of an initialized llama backend
 /// This is required as a parameter for most llama functions as the backend must be initialized
@@ -68,6 +69,19 @@ impl LlamaBackend {
         }
         Ok(LlamaBackend {})
     }
+
+    /// Change the output of llama.cpp's logging to be voided instead of pushed to `stderr`.
+    pub fn void_logs(&mut self) {
+        unsafe extern "C" fn void_log(
+            _level: ggml_log_level,
+            _text: *const ::std::os::raw::c_char,
+            _user_data: *mut ::std::os::raw::c_void,
+        ) {}
+
+        unsafe {
+            llama_cpp_sys_2::llama_log_set(Some(void_log), std::ptr::null_mut())
+        }
+    }
 }
 
 /// A rusty wrapper around `numa_strategy`.
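
Example (not part of the commit): silencing llama.cpp's stderr output right after backend initialization. `LlamaBackend::init()` and its error type are assumed from the existing API; only `void_logs` is new here.

    use llama_cpp_2::llama_backend::LlamaBackend;

    fn main() -> Result<(), llama_cpp_2::LLamaCppError> {
        // `void_logs` swaps llama.cpp's default stderr logger for a no-op callback.
        let mut backend = LlamaBackend::init()?;
        backend.void_logs();
        Ok(())
    }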

llama-cpp-2/src/llama_batch.rs

Lines changed: 47 additions & 5 deletions
@@ -6,11 +6,11 @@ use llama_cpp_sys_2::{llama_batch, llama_batch_free, llama_batch_init, llama_pos
 /// A safe wrapper around `llama_batch`.
 #[derive(Debug)]
 pub struct LlamaBatch {
-    /// The number of tokens the batch was allocated with. they are safe to write to - but not necessarily read from as they are not necessarily initilized
+    /// The number of tokens the batch was allocated with. they are safe to write to - but not necessarily read from as they are not necessarily initialized
     allocated: usize,
-    /// The logits that are initilized. Used by [`LlamaContext`] to ensure that only initilized logits are accessed.
+    /// The logits that are initialized. Used by [`LlamaContext`] to ensure that only initialized logits are accessed.
     pub(crate) initialized_logits: Vec<i32>,
-    /// The llama_cpp batch. always initilize by `llama_cpp_sys_2::llama_batch_init(allocated, <unknown>, <unknown>)`
+    /// The llama_cpp batch. always initialize by `llama_cpp_sys_2::llama_batch_init(allocated, <unknown>, <unknown>)`
     pub(crate) llama_batch: llama_batch,
 }
 
@@ -31,7 +31,7 @@ impl LlamaBatch {
     }
 
     /// add a token to the batch for sequences [`seq_ids`] at position [pos]. If [logits] is true, the
-    /// token will be initilized and can be read from after the next decode.
+    /// token will be initialized and can be read from after the next decode.
     ///
     /// # Panics
     ///
@@ -90,7 +90,49 @@
 
         Ok(())
     }
-    /// Create a new `LlamaBatch` that cab contain up to `n_tokens` tokens.
+
+    /// Add a sequence of tokens to the batch for the given sequence id. If [logits_all] is true, the
+    /// tokens will be initialized and can be read from after the next decode.
+    ///
+    /// Either way the last token in the sequence will have its logits set to `true`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if there is insufficient space in the buffer
+    pub fn add_sequence(&mut self, tokens: &[LlamaToken],
+                        seq_id: i32,
+                        logits_all: bool) -> Result<(), BatchAddError> {
+        let n_tokens_0 = self.llama_batch.n_tokens;
+        let n_tokens = tokens.len();
+
+        if self.allocated < n_tokens_0 as usize + n_tokens {
+            return Err(BatchAddError::InsufficientSpace(self.allocated));
+        }
+        if n_tokens == 0 {
+            return Ok(())
+        }
+
+        self.llama_batch.n_tokens += n_tokens as i32;
+        for (i, token) in tokens.iter().enumerate() {
+            let j = n_tokens_0 as usize + i;
+            unsafe {
+                self.llama_batch.token.add(j).write(token.0);
+                self.llama_batch.pos.add(j).write(i as i32);
+                let seq_id_ptr = *self.llama_batch.seq_id.add(j);
+                seq_id_ptr.write(seq_id);
+                self.llama_batch.n_seq_id.add(j).write(1);
+                self.llama_batch.logits.add(j).write(logits_all as i8)
+            }
+        }
+
+        unsafe {
+            self.llama_batch.logits.add(n_tokens - 1).write(true as i8);
+        }
+
+        Ok(())
+    }
+
+    /// Create a new `LlamaBatch` that can contain up to `n_tokens` tokens.
     ///
     /// # Arguments
    ///
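
Example (not part of the commit): filling a batch with one whole prompt via the new `add_sequence`. The arguments of `LlamaBatch::new` are an assumption based on the existing API, and the token ids are made up; in practice they come from the model's tokenizer.

    use llama_cpp_2::llama_batch::LlamaBatch;
    use llama_cpp_2::token::LlamaToken;
    use llama_cpp_2::LLamaCppError;

    fn main() -> Result<(), LLamaCppError> {
        // Hypothetical token ids standing in for a tokenized prompt.
        let tokens = vec![LlamaToken::new(1), LlamaToken::new(42), LlamaToken::new(7)];

        // Assumed `new` signature: capacity in tokens plus the maximum number of sequences.
        let mut batch = LlamaBatch::new(512, 1);

        // Everything goes into sequence 0; `logits_all = false` requests logits
        // only for the last token, as documented on the new method.
        batch.add_sequence(&tokens, 0, false)?;
        Ok(())
    }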

llama-cpp-2/src/model.rs

Lines changed: 15 additions & 13 deletions
@@ -1,15 +1,16 @@
 //! A safe wrapper around `llama_model`.
-use crate::context::params::LlamaContextParams;
+use std::ffi::CString;
+use std::os::raw::c_int;
+use std::path::Path;
+use std::ptr::NonNull;
+
+use crate::{LlamaContextLoadError, LlamaModelLoadError, StringToTokenError, TokenToStringError};
 use crate::context::LlamaContext;
+use crate::context::params::LlamaContextParams;
 use crate::llama_backend::LlamaBackend;
 use crate::model::params::LlamaModelParams;
 use crate::token::LlamaToken;
 use crate::token_type::LlamaTokenType;
-use crate::{LlamaContextLoadError, LlamaModelLoadError, StringToTokenError, TokenToStringError};
-use std::ffi::CString;
-use std::os::raw::c_int;
-use std::path::Path;
-use std::ptr::NonNull;
 
 pub mod params;
 
@@ -29,6 +30,7 @@ pub enum AddBos {
     /// Do not add the beginning of stream token to the start of the string.
     Never,
 }
+
 unsafe impl Send for LlamaModel {}
 
 unsafe impl Sync for LlamaModel {}
@@ -38,12 +40,12 @@ impl LlamaModel {
     ///
     /// # Panics
     ///
-    /// If the number of tokens the model was trained on does not fit into an `u16`. This should be impossible on most
+    /// If the number of tokens the model was trained on does not fit into an `u32`. This should be impossible on most
     /// platforms due to llama.cpp returning a `c_int` (i32 on most platforms) which is almost certainly positive.
     #[must_use]
-    pub fn n_ctx_train(&self) -> u16 {
+    pub fn n_ctx_train(&self) -> u32 {
         let n_ctx_train = unsafe { llama_cpp_sys_2::llama_n_ctx_train(self.model.as_ptr()) };
-        u16::try_from(n_ctx_train).expect("n_ctx_train fits into an u16")
+        u32::try_from(n_ctx_train).expect("n_ctx_train fits into an u32")
     }
 
     /// Get all tokens in the model.
@@ -54,6 +56,7 @@ impl LlamaModel {
             .map(LlamaToken::new)
             .map(|llama_token| (llama_token, self.token_to_str(llama_token)))
     }
+
     /// Get the beginning of stream token.
     #[must_use]
     pub fn token_bos(&self) -> LlamaToken {
@@ -276,7 +279,7 @@ impl LlamaModel {
     /// # Errors
     ///
     /// See [`LlamaModelLoadError`] for more information.
-    #[tracing::instrument(skip_all)]
+    #[tracing::instrument(skip_all, fields(params))]
     pub fn load_from_file(
         _: &LlamaBackend,
         path: impl AsRef<Path>,
@@ -290,13 +293,12 @@
 
         let cstr = CString::new(path)?;
         let llama_model = unsafe {
-            println!("{:?}", params.params);
             llama_cpp_sys_2::llama_load_model_from_file(cstr.as_ptr(), params.params)
         };
 
         let model = NonNull::new(llama_model).ok_or(LlamaModelLoadError::NullResult)?;
 
-        println!("Loaded {path:?}");
+        tracing::debug!(?path, "Loaded model");
         Ok(LlamaModel { model })
     }
 
@@ -318,7 +320,7 @@
         };
         let context = NonNull::new(context).ok_or(LlamaContextLoadError::NullReturn)?;
 
-        Ok(LlamaContext::new(self, context))
+        Ok(LlamaContext::new(self, context, params.embedding()))
     }
 }
 
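
Example (not part of the commit): an end-to-end sketch of how the pieces land together, decoding one tokenized prompt and reading back its last-token embedding. `LlamaContext::decode` and the arguments of `LlamaBatch::new` are assumptions based on the existing crate, not part of this diff; `add_sequence`, `embeddings_ith`, and the embedding flag threaded through `LlamaContext::new` are what the commit adds.

    use llama_cpp_2::context::LlamaContext;
    use llama_cpp_2::llama_batch::LlamaBatch;
    use llama_cpp_2::token::LlamaToken;

    /// Decode a non-empty, already tokenized prompt and return the embedding of
    /// its last token. The context must come from params with `with_embedding(true)`.
    fn embed_prompt(
        ctx: &mut LlamaContext<'_>,
        tokens: &[LlamaToken],
    ) -> Result<Vec<f32>, Box<dyn std::error::Error>> {
        // Assumed `new` signature: token capacity plus maximum sequence count.
        let mut batch = LlamaBatch::new(tokens.len(), 1);

        // New in this commit: queue the whole prompt as sequence 0.
        batch.add_sequence(tokens, 0, false)?;

        // `decode` is the pre-existing entry point that runs the batch.
        ctx.decode(&mut batch)?;

        // New in this commit: the returned slice has `n_embd` components.
        let last = tokens.len() as i32 - 1;
        Ok(ctx.embeddings_ith(last)?.to_vec())
    }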
