 //! # Usage
-//!
+//!
 //! This is just about the smallest possible way to do inference. To fetch a model from hugging face:
-//!
-//! ```bash
+//!
+//! ```console
 //! git clone --recursive https://github.com/utilityai/llama-cpp-rs
 //! cd llama-cpp-rs/examples/usage
 //! wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
-//! cargo run --bin usage -- qwen2-1_5b-instruct-q4_0.gguf
+//! cargo run --example usage -- qwen2-1_5b-instruct-q4_0.gguf
 //! ```
-use std::io::Write;
 use llama_cpp_2::context::params::LlamaContextParams;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::LlamaModelParams;
 use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::token::data_array::LlamaTokenDataArray;
+use std::io::Write;
 
 #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
 fn main() {
     let model_path = std::env::args().nth(1).expect("Please specify model path");
     let backend = LlamaBackend::init().unwrap();
     let params = LlamaModelParams::default();
 
-    let prompt = "<|im_start|>user\nHello! how are you?<|im_end|>\n<|im_start|>assistant\n".to_string();
+    let prompt =
+        "<|im_start|>user\nHello! how are you?<|im_end|>\n<|im_start|>assistant\n".to_string();
     LlamaContextParams::default();
     let model =
         LlamaModel::load_from_file(&backend, model_path, &params).expect("unable to load model");
@@ -48,14 +49,11 @@ fn main() {
     }
     ctx.decode(&mut batch).expect("llama_decode() failed");
 
-
     let mut n_cur = batch.n_tokens();
 
-
     // The `Decoder`
     let mut decoder = encoding_rs::UTF_8.new_decoder();
 
-
     while n_cur <= n_len {
         // sample the next token
         {
@@ -72,7 +70,9 @@ fn main() {
                 break;
             }
 
-            let output_bytes = model.token_to_bytes(new_token_id, Special::Tokenize).unwrap();
+            let output_bytes = model
+                .token_to_bytes(new_token_id, Special::Tokenize)
+                .unwrap();
             // use `Decoder.decode_to_string()` to avoid the intermediate buffer
             let mut output_string = String::with_capacity(32);
             let _decode_result = decoder.decode_to_string(&output_bytes, &mut output_string, false);
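
The tail of this hunk also shows why the example uses `encoding_rs` together with the `std::io::Write` import this commit reorders: each sampled token's bytes are decoded incrementally and flushed straight to stdout, so the reply streams as it is generated. Below is a minimal, self-contained sketch of that streaming pattern; the `stream_chunk` helper and the hard-coded byte chunks are hypothetical stand-ins for illustration, and the real example inlines this logic in its generation loop.

```rust
use std::io::Write;

/// Decode a chunk of (possibly partial) UTF-8 bytes and flush it to stdout.
/// Hypothetical helper for illustration; the usage example inlines this logic.
fn stream_chunk(decoder: &mut encoding_rs::Decoder, bytes: &[u8]) {
    let mut piece = String::with_capacity(32);
    // The decoder keeps state across calls, so multi-byte characters split
    // between chunks are reassembled correctly.
    let _ = decoder.decode_to_string(bytes, &mut piece, false);
    print!("{piece}");
    std::io::stdout().flush().unwrap();
}

fn main() {
    let mut decoder = encoding_rs::UTF_8.new_decoder();
    // Stand-ins for the bytes returned by `model.token_to_bytes(...)`.
    for chunk in [&b"Hello"[..], &b", "[..], &b"world!\n"[..]] {
        stream_chunk(&mut decoder, chunk);
    }
}
```

Flushing after every `print!` matters because Rust's stdout is line-buffered; without it the text would only appear once a newline is printed or the program exits.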