diff --git a/tiktoken-rs/Cargo.toml b/tiktoken-rs/Cargo.toml
index 75cc4ee..41fff6a 100644
--- a/tiktoken-rs/Cargo.toml
+++ b/tiktoken-rs/Cargo.toml
@@ -13,6 +13,10 @@ documentation = "https://docs.rs/crate/tiktoken-rs/"
 license = "MIT"
 readme = "../README.md"
 
+[[bin]]
+name = "tiktoken"
+path = "src/main.rs"
+
 [profile.release]
 debug = 1
 
@@ -21,11 +25,14 @@ anyhow = "1.0.76"
 async-openai = { version = "0.14.2", optional = true }
 base64 = "0.22.0"
 bstr = "1.6.2"
+clap = { version = "4.4", features = ["derive"] }
 dhat = { version = "0.3.2", optional = true }
 fancy-regex = "0.13.0"
 lazy_static = "1.4.0"
 regex = "1.10.3"
 rustc-hash = "1.1.0"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
 
 [features]
 async-openai = ["dep:async-openai"]
diff --git a/tiktoken-rs/README.md b/tiktoken-rs/README.md
index 162209c..f41954a 100644
--- a/tiktoken-rs/README.md
+++ b/tiktoken-rs/README.md
@@ -18,6 +18,50 @@ This library is built on top of the `tiktoken` library and includes some additio
 
 For full working examples for all supported features, see the [examples](https://github.com/zurawiki/tiktoken-rs/tree/main/tiktoken-rs/examples) directory in the repository.
 
+# CLI Usage
+
+The project includes a command-line interface for token counting.
+
+## Installation
+
+```shell
+cargo install tiktoken-rs
+```
+
+## Usage
+
+```bash
+# Get help
+tiktoken --help
+
+# List all available models
+tiktoken --list-models
+
+# Count tokens in text from stdin
+echo 'Hello, world!' | tiktoken
+
+# Count tokens with a specific model
+echo 'Hello, world!' | tiktoken --model gpt-3.5-turbo
+
+# Count tokens with the o1 model
+echo 'Hello, world!' | tiktoken --model o1
+
+# Output JSON with usage percentage
+echo 'Hello, world!' | tiktoken --json
+```
+
+The CLI outputs JSON with the token count, the model used, its context size, the remaining tokens, and the usage percentage:
+
+```json
+{
+  "token_count": 4,
+  "model": "gpt-4",
+  "context_size": 8192,
+  "remaining_tokens": 8188,
+  "usage_percentage": 0.049
+}
+```
+
 # Usage
 
 1. Install this tool locally with `cargo`
diff --git a/tiktoken-rs/src/main.rs b/tiktoken-rs/src/main.rs
new file mode 100644
index 0000000..a39e500
--- /dev/null
+++ b/tiktoken-rs/src/main.rs
@@ -0,0 +1,106 @@
+use clap::Parser;
+use serde::Serialize;
+use std::io::{self, Read};
+use tiktoken_rs::{get_bpe_from_model, model::get_context_size, tokenizer::list_available_models};
+
+#[derive(Parser)]
+#[command(
+    name = "tiktoken",
+    about = "Count tokens in text using OpenAI's tiktoken library",
+    version
+)]
+struct Args {
+    /// Model to use for tokenization (e.g., gpt-4o, gpt-3.5-turbo, o1)
+    #[arg(short, long, default_value = "gpt-4.1")]
+    model: String,
+
+    /// Output results in JSON format
+    #[arg(long)]
+    json: bool,
+
+    /// List all available models and exit
+    #[arg(long)]
+    list_models: bool,
+
+    /// Input text to count tokens for (reads from stdin if not provided)
+    #[arg(value_name = "TEXT")]
+    text: Vec<String>,
+}
+
+#[derive(Serialize)]
+struct TokenCountResponse {
+    /// Number of tokens in the input text
+    token_count: usize,
+    /// Model used for tokenization
+    model: String,
+    /// Context size for the model
+    context_size: usize,
+    /// Remaining tokens available for completion
+    remaining_tokens: usize,
+    /// Percentage of context used (rounded to 3 decimal places)
+    usage_percentage: f64,
+}
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let args = Args::parse();
+
+    // Handle list models command
+    if args.list_models {
+        println!("Available models:");
+        println!();
+
+        // Get all models from the tokenizer module
+        let models = list_available_models();
+
+        for model in models.iter() {
+            let context_size = get_context_size(model);
+            println!("  {:<25} (context: {})", model, context_size);
+        }
+
+        println!();
+        println!(
+            "Note: Many models support version suffixes (e.g., gpt-4-0314, gpt-3.5-turbo-0125)"
+        );
+        println!("      and fine-tuned models use the ft: prefix (e.g., ft:gpt-3.5-turbo:xxx:2023-11-11)");
+        return Ok(());
+    }
+
+    // Get input text from argument or stdin
+    let input_text = if !args.text.is_empty() {
+        args.text.join(" ")
+    } else {
+        let mut buffer = String::new();
+        eprintln!("🔎 Reading from stdin...");
+        io::stdin().read_to_string(&mut buffer)?;
+        buffer
+    };
+
+    // Count tokens using the specified model
+    let bpe = get_bpe_from_model(&args.model)?;
+    let token_count = bpe.encode_with_special_tokens(&input_text).len();
+    let context_size = get_context_size(&args.model);
+    let remaining_tokens = context_size.saturating_sub(token_count);
+
+    // Calculate usage percentage rounded to 3 decimal places
+    let usage_percentage = if context_size > 0 {
+        ((token_count as f64 / context_size as f64) * 100.0 * 1000.0).round() / 1000.0
+    } else {
+        0.0
+    };
+
+    // Output based on the json flag
+    if args.json {
+        let response = TokenCountResponse {
+            token_count,
+            model: args.model,
+            context_size,
+            remaining_tokens,
+            usage_percentage,
+        };
+        println!("{}", serde_json::to_string_pretty(&response)?);
+    } else {
+        println!("{token_count}");
+    }
+
+    Ok(())
+}
diff --git a/tiktoken-rs/src/tokenizer.rs b/tiktoken-rs/src/tokenizer.rs
index 7ab0b19..f63c5a7 100644
--- a/tiktoken-rs/src/tokenizer.rs
+++ b/tiktoken-rs/src/tokenizer.rs
@@ -51,7 +51,7 @@ const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
 
 // Keep this in sync with:
 // https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22
-const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
+pub const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
     // reasoning
     ("o1", Tokenizer::O200kBase),
     ("o3", Tokenizer::O200kBase),
@@ -119,6 +119,27 @@ lazy_static! {
     };
 }
 
+/// Returns a list of all available model names.
+///
+/// This function returns all the model names that are supported by the tokenizer.
+/// The models are returned in the order they are defined in the `MODEL_TO_TOKENIZER` constant.
+///
+/// # Examples
+///
+/// ```
+/// use tiktoken_rs::tokenizer::list_available_models;
+/// let models = list_available_models();
+/// assert!(models.contains(&"gpt-4"));
+/// assert!(models.contains(&"gpt-3.5-turbo"));
+/// ```
+///
+/// # Returns
+///
+/// A vector of string slices containing all available model names.
+pub fn list_available_models() -> Vec<&'static str> {
+    MODEL_TO_TOKENIZER.iter().map(|(model, _)| *model).collect()
+}
+
 /// Returns the tokenizer type used by a model.
 ///
 /// This function retrieves the corresponding tokenizer enum variant for the given model name. It first looks
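
Beyond the CLI binary, the patch makes `MODEL_TO_TOKENIZER` and `list_available_models` part of the public API. Below is a minimal sketch of how a downstream crate could combine that surface with the existing `get_bpe_from_model` and `get_context_size` helpers, mirroring what the new `main.rs` does; the model name and sample text are illustrative only and not part of the change.

```rust
use tiktoken_rs::{get_bpe_from_model, model::get_context_size, tokenizer::list_available_models};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Enumerate every model name the tokenizer module knows about,
    // together with its context window size.
    for model in list_available_models() {
        println!("{model} (context: {})", get_context_size(model));
    }

    // Count tokens for an arbitrary string the same way the new CLI does.
    let bpe = get_bpe_from_model("gpt-4")?;
    let token_count = bpe.encode_with_special_tokens("Hello, world!").len();
    println!("token_count = {token_count}");

    Ok(())
}
```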