/*
git clone --recursive https://github.com/utilityai/llama-cpp-rs
cd llama-cpp-rs/examples/usage
wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
cargo run qwen2-1_5b-instruct-q4_0.gguf
*/
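// Assumed dependencies: the `llama_cpp_2` and `encoding_rs` crates used below;
// their versions are pinned in the example's Cargo.toml in the cloned repository.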
use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::model::params::LlamaModelParams;
use llama_cpp_2::model::LlamaModel;
use llama_cpp_2::model::{AddBos, Special};
use llama_cpp_2::token::data_array::LlamaTokenDataArray;
use std::io::Write;

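// Minimal greedy-decoding example: load a GGUF model, tokenize a Qwen2 chat
// prompt, and stream the completion to stdout until EOS or `n_len` total tokens.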
fn main() {
    let model_path = std::env::args().nth(1).expect("Please specify model path");
    let backend = LlamaBackend::init().unwrap();
    let params = LlamaModelParams::default();

    // Qwen2 chat template: a single user turn followed by the assistant tag to complete.
    let prompt =
        "<|im_start|>user\nHello! how are you?<|im_end|>\n<|im_start|>assistant\n".to_string();
    let model =
        LlamaModel::load_from_file(&backend, model_path, &params).expect("unable to load model");
    let ctx_params = LlamaContextParams::default();
    let mut ctx = model
        .new_context(&backend, ctx_params)
        .expect("unable to create the llama_context");
    let tokens_list = model
        .str_to_token(&prompt, AddBos::Always)
        .expect(&format!("failed to tokenize {prompt}"));
    // total sequence length (prompt + completion) to generate
    let n_len = 64;

    // create a llama_batch with size 512
    // we use this object to submit token data for decoding
    let mut batch = LlamaBatch::new(512, 1);

    let last_index: i32 = (tokens_list.len() - 1) as i32;
    for (i, token) in (0_i32..).zip(tokens_list.into_iter()) {
        // llama_decode will output logits only for the last token of the prompt
        let is_last = i == last_index;
        batch.add(token, i, &[0], is_last).unwrap();
    }
    ctx.decode(&mut batch).expect("llama_decode() failed");

    let mut n_cur = batch.n_tokens();

    // streaming UTF-8 decoder; it buffers multi-byte characters that are
    // split across token boundaries
    let mut decoder = encoding_rs::UTF_8.new_decoder();

    while n_cur <= n_len {
        // sample the next token
        {
            let candidates = ctx.candidates_ith(batch.n_tokens() - 1);

            let candidates_p = LlamaTokenDataArray::from_iter(candidates, false);

            // sample the most likely token
            let new_token_id = ctx.sample_token_greedy(candidates_p);

            // is it an end of stream?
            if new_token_id == model.token_eos() {
                eprintln!();
                break;
            }

            let output_bytes = model
                .token_to_bytes(new_token_id, Special::Tokenize)
                .unwrap();
            // use `Decoder.decode_to_string()` to avoid the intermediate buffer
            let mut output_string = String::with_capacity(32);
            let _decode_result =
                decoder.decode_to_string(&output_bytes, &mut output_string, false);
            print!("{output_string}");
            std::io::stdout().flush().unwrap();

            // feed the sampled token back so the next decode yields its logits
            batch.clear();
            batch.add(new_token_id, n_cur, &[0], true).unwrap();
        }

        n_cur += 1;

        // evaluate the new token so the next iteration can sample from its logits
        ctx.decode(&mut batch).expect("failed to eval");
    }
}