
Commit c0faaef

Add an embeddings example
1 parent 542a410 commit c0faaef

File tree

5 files changed: 243 additions & 1 deletion


Cargo.lock

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ resolver = "2"
 members = [
     "llama-cpp-sys-2",
     "llama-cpp-2",
-    "simple",
+    "simple", "embeddings",
 ]

 [workspace.dependencies]
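With `embeddings` registered as a workspace member, the new example builds and runs from the workspace root like any other member, e.g. `cargo build -p embeddings` or `cargo run -p embeddings -- --help`; the example's own flags and subcommands are the ones defined by the clap parser in `embeddings/src/main.rs` below.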

embeddings/Cargo.toml

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
[package]
name = "embeddings"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
llama-cpp-2 = { path = "../llama-cpp-2", version = "0.1.34" }
hf-hub = { workspace = true }
clap = { workspace = true, features = ["derive"] }
anyhow = { workspace = true }

[lints]
workspace = true

embeddings/src/main.rs

Lines changed: 216 additions & 0 deletions
@@ -0,0 +1,216 @@
//! This is a translation of embedding.cpp in llama.cpp using llama-cpp-2.
#![allow(
    clippy::cast_possible_wrap,
    clippy::cast_possible_truncation,
    clippy::cast_precision_loss,
    clippy::cast_sign_loss
)]

use std::io::Write;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;

use anyhow::{bail, Context, Result};
use clap::Parser;
use hf_hub::api::sync::ApiBuilder;
use llama_cpp_2::context::LlamaContext;

use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::ggml_time_us;
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::model::AddBos;
use llama_cpp_2::model::LlamaModel;
use llama_cpp_2::model::params::LlamaModelParams;

#[derive(clap::Parser, Debug, Clone)]
struct Args {
    /// The path to the model
    #[command(subcommand)]
    model: Model,
    /// The prompt
    #[clap(default_value = "Hello my name is")]
    prompt: String,
    /// Whether to normalise the produced embeddings
    #[clap(short)]
    normalise: bool,
    /// Disable offloading layers to the gpu
    #[cfg(feature = "cublas")]
    #[clap(long)]
    disable_gpu: bool,
}

#[derive(clap::Subcommand, Debug, Clone)]
enum Model {
    /// Use an already downloaded model
    Local {
        /// The path to the model. e.g. `/home/marcus/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-Chat-GGUF/blobs/08a5566d61d7cb6b420c3e4387a39e0078e1f2fe5f055f3a03887385304d4bfa`
        path: PathBuf,
    },
    /// Download a model from huggingface (or use a cached version)
    #[clap(name = "hf-model")]
    HuggingFace {
        /// the repo containing the model. e.g. `TheBloke/Llama-2-7B-Chat-GGUF`
        repo: String,
        /// the model name. e.g. `llama-2-7b-chat.Q4_K_M.gguf`
        model: String,
    },
}

impl Model {
    /// Convert the model to a path - may download from huggingface
    fn get_or_load(self) -> Result<PathBuf> {
        match self {
            Model::Local { path } => Ok(path),
            Model::HuggingFace { model, repo } => ApiBuilder::new()
                .with_progress(true)
                .build()
                .with_context(|| "unable to create huggingface api")?
                .model(repo)
                .get(&model)
                .with_context(|| "unable to download model"),
        }
    }
}

fn main() -> Result<()> {
    let Args {
        model,
        prompt,
        normalise,
        #[cfg(feature = "cublas")]
        disable_gpu,
    } = Args::parse();

    // init LLM
    let backend = LlamaBackend::init()?;

    // offload all layers to the gpu
    let model_params = {
        #[cfg(feature = "cublas")]
        if !disable_gpu {
            LlamaModelParams::default().with_n_gpu_layers(1000)
        } else {
            LlamaModelParams::default()
        }
        #[cfg(not(feature = "cublas"))]
        LlamaModelParams::default()
    };

    let model_path = model
        .get_or_load()
        .with_context(|| "failed to get model from args")?;

    let model = LlamaModel::load_from_file(&backend, model_path, &model_params)
        .with_context(|| "unable to load model")?;

    // initialize the context
    let ctx_params = LlamaContextParams::default()
        .with_n_threads_batch(std::thread::available_parallelism()?.get() as u32)
        .with_embedding(true);

    let mut ctx = model
        .new_context(&backend, ctx_params)
        .with_context(|| "unable to create the llama_context")?;

    // Split the prompt to display the batching functionality
    let prompt_lines = prompt.lines();

    // tokenize the prompt
    let tokens_lines_list = prompt_lines
        .map(|line| model.str_to_token(&line, AddBos::Always))
        .collect::<Result<Vec<_>, _>>()
        .with_context(|| format!("failed to tokenize {prompt}"))?;

    let n_ctx = ctx.n_ctx() as usize;
    let n_ctx_train = model.n_ctx_train();

    eprintln!("n_ctx = {n_ctx}, n_ctx_train = {n_ctx_train}");

    if tokens_lines_list.iter().any(|tok| n_ctx < tok.len()) {
        bail!("One of the provided prompts exceeds the size of the context window");
    }

    // print the prompt token-by-token
    eprintln!();

    for (i, token_line) in tokens_lines_list.iter().enumerate() {
        eprintln!("Prompt {i}");
        for token in token_line {
            eprintln!(" {} --> {}", token, model.token_to_str(*token)?);
        }
        eprintln!()
    }

    std::io::stderr().flush()?;

    // create a llama_batch with the size of the context
    // we use this object to submit token data for decoding
    let mut batch = LlamaBatch::new(n_ctx, tokens_lines_list.len() as i32);

    // Amount of tokens in the current batch
    let mut s_batch = 0;
    let mut output = Vec::with_capacity(tokens_lines_list.len());

    let t_main_start = ggml_time_us();

    for tokens in &tokens_lines_list {
        // Flush the batch if the next prompt would exceed our batch size
        if (batch.n_tokens() as usize + tokens.len()) > n_ctx {
            batch_decode(&mut ctx, &mut batch, s_batch, &mut output, normalise)?;
            s_batch = 0;
        }

        batch.add_sequence(&tokens, s_batch, false)?;
        s_batch += 1;
    }
    // Handle final batch
    batch_decode(&mut ctx, &mut batch, s_batch, &mut output, normalise)?;

    let t_main_end = ggml_time_us();

    for (i, embeddings) in output.iter().enumerate() {
        eprintln!("Embeddings {i}: {embeddings:?}");
        eprintln!("\n");
    }

    let duration = Duration::from_micros((t_main_end - t_main_start) as u64);
    let total_tokens: usize = tokens_lines_list.iter().map(|v| v.len()).sum();

    eprintln!(
        "Created embeddings for {} tokens in {:.2} s, speed {:.2} t/s\n",
        total_tokens,
        duration.as_secs_f32(),
        total_tokens as f32 / duration.as_secs_f32()
    );

    println!("{}", ctx.timings());

    Ok(())
}

fn batch_decode(
    ctx: &mut LlamaContext,
    batch: &mut LlamaBatch,
    s_batch: i32,
    output: &mut Vec<Vec<f32>>,
    normalise: bool,
) -> Result<()> {
    ctx.clear_kv_cache();
    ctx.decode(batch).with_context(|| "llama_decode() failed")?;
    batch.clear();

    for i in 0..s_batch {
        let embedding = ctx
            .embeddings_ith(i)
            .with_context(|| "Failed to get embeddings")?;
        let output_embeddings = if normalise {
            normalize(embedding)
        } else {
            embedding.to_vec()
        };

        output.push(output_embeddings);
    }

    Ok(())
}

fn normalize(input: &[f32]) -> Vec<f32> {
    let magnitude = input
        .iter()
        .fold(0.0, |acc, &val| val.mul_add(val, acc))
        .sqrt();

    input.iter().map(|&val| val / magnitude).collect()
}
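Because `normalize` rescales each embedding to unit L2 norm, the dot product of two normalised embeddings is their cosine similarity. A minimal sketch of how the example's `output` vectors could be compared (this helper is illustrative only and not part of the commit):

/// Illustrative helper, not part of the commit: cosine similarity of two
/// embeddings produced with `normalise = true`; for unit-length vectors the
/// dot product alone is the cosine of the angle between them.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len(), "embeddings must have the same dimension");
    a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}

// e.g. with `output` from the example above:
// let score = cosine_similarity(&output[0], &output[1]);
// eprintln!("similarity between prompt 0 and prompt 1: {score:.4}");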

llama-cpp-2/src/llama_batch.rs

Lines changed: 1 addition & 0 deletions
@@ -127,6 +127,7 @@ impl LlamaBatch {

         unsafe {
             self.llama_batch.logits.add(n_tokens - 1).write(true as i8);
+            self.initialized_logits.push(self.llama_batch.n_tokens - 1);
         }

         Ok(())
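The one-line addition above records, alongside the raw write into the C-side batch, the index of the token whose logits flag was just set; presumably this lets the safe wrapper later check that logits or embeddings are only read for positions that actually produced output. A standalone sketch of that bookkeeping pattern (the type and method names below are hypothetical, not llama-cpp-2's real API):

// Hypothetical sketch of the bookkeeping idea; names are illustrative and do
// not reflect llama-cpp-2's internals.
struct BatchBookkeeping {
    n_tokens: i32,
    /// Positions for which logits/embeddings were requested.
    initialized_logits: Vec<i32>,
}

impl BatchBookkeeping {
    /// Mirrors the commit: remember which position had its logits flag set.
    fn mark_last_token(&mut self) {
        self.initialized_logits.push(self.n_tokens - 1);
    }

    /// A later read can be validated against the recorded positions.
    fn has_output(&self, i: i32) -> bool {
        self.initialized_logits.contains(&i)
    }
}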
