7 changes: 7 additions & 0 deletions tiktoken-rs/Cargo.toml
@@ -13,6 +13,10 @@ documentation = "https://docs.rs/crate/tiktoken-rs/"
license = "MIT"
readme = "../README.md"

[[bin]]
name = "tiktoken"
path = "src/main.rs"

[profile.release]
debug = 1

@@ -21,11 +25,14 @@ anyhow = "1.0.76"
async-openai = { version = "0.14.2", optional = true }
base64 = "0.22.0"
bstr = "1.6.2"
clap = { version = "4.4", features = ["derive"] }
dhat = { version = "0.3.2", optional = true }
fancy-regex = "0.13.0"
lazy_static = "1.4.0"
regex = "1.10.3"
rustc-hash = "1.1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"

[features]
async-openai = ["dep:async-openai"]
43 changes: 43 additions & 0 deletions tiktoken-rs/README.md
@@ -18,6 +18,49 @@ This library is built on top of the `tiktoken` library and includes some additio

For full working examples for all supported features, see the [examples](https://github.com/zurawiki/tiktoken-rs/tree/main/tiktoken-rs/examples) directory in the repository.

# CLI Usage

The project includes a command-line interface for token counting.

## Installation

```shell
cargo install tiktoken-rs
```
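
If you are working from a clone of this repository instead of installing from crates.io, the same binary can be run through Cargo — a quick sketch, assuming you invoke it from the `tiktoken-rs` package directory where the `[[bin]]` target is defined:

```shell
cargo run --bin tiktoken -- --help
```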

## Usage

```bash
# Get help
tiktoken --help

# List all available models
tiktoken --list-models

# Count tokens in text from stdin
echo 'Hello, world!' | tiktoken

# Count tokens with a specific model
echo 'Hello, world!' | tiktoken --model gpt-3.5-turbo

# Count tokens with the o1 model
echo 'Hello, world!' | tiktoken --model o1

# Output JSON with usage percentage
echo 'Hello, world!' | tiktoken --json
```

With `--json`, the CLI outputs JSON with the token count, model used, context size, remaining tokens, and usage percentage:

```json
{
  "token_count": 4,
  "model": "gpt-4",
  "context_size": 8192,
  "remaining_tokens": 8188,
  "usage_percentage": 0.049
}
```
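
Without `--json`, the CLI prints only the bare token count (the default model is `gpt-4.1` unless `--model` is given):

```bash
echo 'Hello, world!' | tiktoken --model gpt-4
# prints just the token count, e.g. 4
```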

# Usage

1. Install this tool locally with `cargo`
106 changes: 106 additions & 0 deletions tiktoken-rs/src/main.rs
@@ -0,0 +1,106 @@
use clap::Parser;
use serde::Serialize;
use std::io::{self, Read};
use tiktoken_rs::{get_bpe_from_model, model::get_context_size, tokenizer::list_available_models};

#[derive(Parser)]
#[command(
    name = "tiktoken",
    about = "Count tokens in text using OpenAI's tiktoken library",
    version
)]
struct Args {
    /// Model to use for tokenization (e.g., gpt-4o, gpt-3.5-turbo, o1)
    #[arg(short, long, default_value = "gpt-4.1")]
    model: String,

    /// Output results in JSON format
    #[arg(long)]
    json: bool,

    /// List all available models and exit
    #[arg(long)]
    list_models: bool,

    /// Input text to count tokens for (reads from stdin if not provided)
    #[arg(value_name = "TEXT")]
    text: Vec<String>,
}

#[derive(Serialize)]
struct TokenCountResponse {
    /// Number of tokens in the input text
    token_count: usize,
    /// Model used for tokenization
    model: String,
    /// Context size for the model
    context_size: usize,
    /// Remaining tokens available for completion
    remaining_tokens: usize,
    /// Percentage of context used (rounded to 3 decimal places)
    usage_percentage: f64,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let args = Args::parse();

    // Handle list models command
    if args.list_models {
        println!("Available models:");
        println!();

        // Get all models from the tokenizer module
        let models = list_available_models();

        for model in models.iter() {
            let context_size = get_context_size(model);
            println!(" {:<25} (context: {})", model, context_size);
        }

        println!();
        println!(
            "Note: Many models support version suffixes (e.g., gpt-4-0314, gpt-3.5-turbo-0125)"
        );
        println!(" and fine-tuned models use the ft: prefix (e.g., ft:gpt-3.5-turbo:xxx:2023-11-11)");
        return Ok(());
    }

    // Get input text from argument or stdin
    let input_text = if !args.text.is_empty() {
        args.text.join(" ")
    } else {
        let mut buffer = String::new();
        eprintln!("🔎 Reading from stdin...");
        io::stdin().read_to_string(&mut buffer)?;
        buffer
    };

    // Count tokens using the specified model
    let bpe = get_bpe_from_model(&args.model)?;
    let token_count = bpe.encode_with_special_tokens(&input_text).len();
    let context_size = get_context_size(&args.model);
    let remaining_tokens = context_size.saturating_sub(token_count);

    // Calculate usage percentage rounded to 3 decimal places
    let usage_percentage = if context_size > 0 {
        ((token_count as f64 / context_size as f64) * 100.0 * 1000.0).round() / 1000.0
    } else {
        0.0
    };

    // Output based on the json flag
    if args.json {
        let response = TokenCountResponse {
            token_count,
            model: args.model,
            context_size,
            remaining_tokens,
            usage_percentage,
        };
        println!("{}", serde_json::to_string_pretty(&response)?);
    } else {
        println!("{token_count}");
    }

    Ok(())
}
23 changes: 22 additions & 1 deletion tiktoken-rs/src/tokenizer.rs
@@ -51,7 +51,7 @@ const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[

// Keep this in sync with:
// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22
const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
pub const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
// reasoning
("o1", Tokenizer::O200kBase),
("o3", Tokenizer::O200kBase),
@@ -119,6 +119,27 @@ lazy_static! {
};
}

/// Returns a list of all available model names.
///
/// This function returns all the model names that are supported by the tokenizer.
/// The models are returned in the order they are defined in the `MODEL_TO_TOKENIZER` constant.
///
/// # Examples
///
/// ```
/// use tiktoken_rs::tokenizer::list_available_models;
/// let models = list_available_models();
/// assert!(models.contains(&"gpt-4"));
/// assert!(models.contains(&"gpt-3.5-turbo"));
/// ```
///
/// # Returns
///
/// A vector of string slices containing all available model names.
pub fn list_available_models() -> Vec<&'static str> {
    MODEL_TO_TOKENIZER.iter().map(|(model, _)| *model).collect()
}

/// Returns the tokenizer type used by a model.
///
/// This function retrieves the corresponding tokenizer enum variant for the given model name. It first looks