 //! # Usage
-//!
+//!
 //! This is just about the smallest possible way to do inference. To fetch a model from hugging face:
-//!
-//! ```bash
+//!
+//! ```console
 //! git clone --recursive https://github.com/utilityai/llama-cpp-rs
 //! cd llama-cpp-rs/examples/usage
 //! wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
-//! cargo run --bin usage -- qwen2-1_5b-instruct-q4_0.gguf
+//! cargo run --example usage -- qwen2-1_5b-instruct-q4_0.gguf
 //! ```
-use std::io::Write;
 use llama_cpp_2::context::params::LlamaContextParams;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::LlamaModelParams;
 use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::token::data_array::LlamaTokenDataArray;
+use std::io::Write;
 
 #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
 fn main() {
     let model_path = std::env::args().nth(1).expect("Please specify model path");
     let backend = LlamaBackend::init().unwrap();
     let params = LlamaModelParams::default();
 
-    let prompt = "<|im_start|>user\nHello! how are you?<|im_end|>\n<|im_start|>assistant\n".to_string();
+    let prompt =
+        "<|im_start|>user\nHello! how are you?<|im_end|>\n<|im_start|>assistant\n".to_string();
     LlamaContextParams::default();
     let model =
         LlamaModel::load_from_file(&backend, model_path, &params).expect("unable to load model");
@@ -48,14 +49,11 @@ fn main() {
     }
     ctx.decode(&mut batch).expect("llama_decode() failed");
 
-
     let mut n_cur = batch.n_tokens();
 
-
     // The `Decoder`
     let mut decoder = encoding_rs::UTF_8.new_decoder();
 
-
     while n_cur <= n_len {
         // sample the next token
         {
@@ -72,7 +70,9 @@ fn main() {
                 break;
             }
 
-            let output_bytes = model.token_to_bytes(new_token_id, Special::Tokenize).unwrap();
+            let output_bytes = model
+                .token_to_bytes(new_token_id, Special::Tokenize)
+                .unwrap();
             // use `Decoder.decode_to_string()` to avoid the intermediate buffer
             let mut output_string = String::with_capacity(32);
             let _decode_result = decoder.decode_to_string(&output_bytes, &mut output_string, false);
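
The tail of this hunk also shows why the example uses `encoding_rs` together with the `std::io::Write` import this commit reorders: each sampled token's bytes are decoded incrementally and flushed straight to stdout, so the reply streams as it is generated. Below is a minimal, self-contained sketch of that streaming pattern; the `stream_chunk` helper and the hard-coded byte chunks are hypothetical stand-ins for illustration, and the real example inlines this logic in its generation loop.

```rust
use std::io::Write;

/// Decode a chunk of (possibly partial) UTF-8 bytes and flush it to stdout.
/// Hypothetical helper for illustration; the usage example inlines this logic.
fn stream_chunk(decoder: &mut encoding_rs::Decoder, bytes: &[u8]) {
    let mut piece = String::with_capacity(32);
    // The decoder keeps state across calls, so multi-byte characters split
    // between chunks are reassembled correctly.
    let _ = decoder.decode_to_string(bytes, &mut piece, false);
    print!("{piece}");
    std::io::stdout().flush().unwrap();
}

fn main() {
    let mut decoder = encoding_rs::UTF_8.new_decoder();
    // Stand-ins for the bytes returned by `model.token_to_bytes(...)`.
    for chunk in [&b"Hello"[..], &b", "[..], &b"world!\n"[..]] {
        stream_chunk(&mut decoder, chunk);
    }
}
```

Flushing after every `print!` matters because Rust's stdout is line-buffered; without it the text would only appear once a newline is printed or the program exits.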