3 changes: 2 additions & 1 deletion tiktoken-rs/README.md
@@ -105,7 +105,8 @@ println!("max_tokens: {}", max_tokens);

| Encoding name | OpenAI models |
| ----------------------- | ------------------------------------------------------------------------- |
| `o200k_base` | GPT-4o models, GPT-4.1, o1, o3, and o4 models |
| `o200k_harmony` | gpt-oss models, `gpt-oss-20b`, `gpt-oss-120b` |
| `o200k_base` | GPT-4o models, GPT-4.1, GPT-5, o1, o3, and o4 models |
| `cl100k_base` | ChatGPT models, `text-embedding-ada-002` |
| `p50k_base` | Code models, `text-davinci-002`, `text-davinci-003` |
| `p50k_edit` | Use for edit models like `text-davinci-edit-001`, `code-davinci-edit-001` |
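For context, a minimal sketch of what the new table row means in practice, assuming the crate re-exports `o200k_harmony` at the top level (the updated imports further down in this diff suggest it does):

```rust
use tiktoken_rs::o200k_harmony;

fn main() -> anyhow::Result<()> {
    // Pick the encoding that matches gpt-oss models, per the table above.
    let bpe = o200k_harmony()?;
    let tokens = bpe.encode_with_special_tokens("The gpt-oss models use o200k_harmony.");
    println!("token count: {}", tokens.len());
    Ok(())
}
```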
7 changes: 6 additions & 1 deletion tiktoken-rs/benches/init.rs
@@ -1,7 +1,12 @@
#![feature(test)]
extern crate test;

use tiktoken_rs::{cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base};
use tiktoken_rs::{cl100k_base, o200k_base, o200k_harmony, p50k_base, p50k_edit, r50k_base};

#[bench]
fn bench_init_o200k_harmony(b: &mut test::Bencher) {
b.iter(|| o200k_harmony().unwrap());
}

#[bench]
fn bench_init_o200k_base(b: &mut test::Bencher) {
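Note that the bench harness uses `#![feature(test)]`, so these benchmarks require the nightly toolchain (e.g. `cargo +nightly bench`).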
8 changes: 6 additions & 2 deletions tiktoken-rs/src/api.rs
@@ -3,7 +3,7 @@ use anyhow::{anyhow, Result};
use crate::{
cl100k_base,
model::get_context_size,
o200k_base, p50k_base, p50k_edit, r50k_base,
o200k_base, o200k_harmony, p50k_base, p50k_edit, r50k_base,
tokenizer::{get_tokenizer, Tokenizer},
CoreBPE,
};
@@ -99,7 +99,10 @@ pub fn num_tokens_from_messages(
) -> Result<usize> {
let tokenizer =
get_tokenizer(model).ok_or_else(|| anyhow!("No tokenizer found for model {}", model))?;
if tokenizer != Tokenizer::Cl100kBase && tokenizer != Tokenizer::O200kBase {
if tokenizer != Tokenizer::Cl100kBase
&& tokenizer != Tokenizer::O200kBase
&& tokenizer != Tokenizer::O200kHarmony
{
anyhow::bail!("Chat completion is only supported for chat models")
}
let bpe = get_bpe_from_tokenizer(tokenizer)?;
@@ -255,6 +258,7 @@ pub fn get_bpe_from_model(model: &str) -> Result<CoreBPE> {
/// If successful, the function returns a `Result` containing the `CoreBPE` instance corresponding to the given tokenizer.
pub fn get_bpe_from_tokenizer(tokenizer: Tokenizer) -> Result<CoreBPE> {
match tokenizer {
Tokenizer::O200kHarmony => o200k_harmony(),
Tokenizer::O200kBase => o200k_base(),
Tokenizer::Cl100kBase => cl100k_base(),
Tokenizer::R50kBase => r50k_base(),
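A quick sketch of the effect of the `api.rs` changes, assuming the crate's usual re-exports (`get_bpe_from_tokenizer` at the root, `Tokenizer` under the `tokenizer` module): the new match arm lets `Tokenizer::O200kHarmony` resolve to a `CoreBPE` instead of falling through.

```rust
use tiktoken_rs::get_bpe_from_tokenizer;
use tiktoken_rs::tokenizer::Tokenizer;

fn main() -> anyhow::Result<()> {
    // Resolves via the new Tokenizer::O200kHarmony => o200k_harmony() arm.
    let bpe = get_bpe_from_tokenizer(Tokenizer::O200kHarmony)?;
    println!("{} tokens", bpe.encode_with_special_tokens("hello harmony").len());
    Ok(())
}
```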
6 changes: 6 additions & 0 deletions tiktoken-rs/src/model.rs
@@ -36,6 +36,12 @@ pub fn get_context_size(model: &str) -> usize {
let base = rest.split(':').next().unwrap_or(rest);
return get_context_size(base);
}
if starts_with_any!(model, "gpt-5") {
return 400_000;
}
if starts_with_any!(model, "gpt-oss") {
return 131_072;
}
if starts_with_any!(model, "o1", "o3", "o4") {
return 200_000;
}
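The new branches should behave as sketched below, assuming `get_context_size` is reachable through the public `model` module (the test further down in this diff exercises it the same way):

```rust
use tiktoken_rs::model::get_context_size;

fn main() {
    // New gpt-5 and gpt-oss branches.
    assert_eq!(get_context_size("gpt-5"), 400_000);
    assert_eq!(get_context_size("gpt-oss-20b"), 131_072);
    // Existing o-series behavior is unchanged.
    assert_eq!(get_context_size("o3-mini"), 200_000);
}
```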
13 changes: 12 additions & 1 deletion tiktoken-rs/src/singleton.rs
@@ -2,7 +2,7 @@ use lazy_static::lazy_static;

use crate::vendor_tiktoken::CoreBPE;

use crate::{cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base};
use crate::{cl100k_base, o200k_base, o200k_harmony, p50k_base, p50k_edit, r50k_base};

/// Returns a singleton instance of the r50k_base tokenizer. (also known as `gpt2`)
/// Use for GPT-3 models like `davinci`
@@ -58,3 +58,14 @@ pub fn o200k_base_singleton() -> &'static CoreBPE {
}
&O200K_BASE
}

/// Returns a singleton instance of the o200k_harmony tokenizer.
/// Use for gpt-oss models like `gpt-oss-20b`, `gpt-oss-120b`.
///
/// This function will only initialize the tokenizer once, and then return a reference to the tokenizer.
pub fn o200k_harmony_singleton() -> &'static CoreBPE {
lazy_static! {
static ref O200K_HARMONY: CoreBPE = o200k_harmony().unwrap();
}
&O200K_HARMONY
}
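Usage mirrors the other singletons; a minimal sketch, assuming `o200k_harmony_singleton` is re-exported at the crate root like its siblings:

```rust
use tiktoken_rs::o200k_harmony_singleton;

fn main() {
    // First call builds the tokenizer; later calls return the same &'static CoreBPE.
    let bpe = o200k_harmony_singleton();
    let tokens = bpe.encode_with_special_tokens("singletons avoid re-parsing the BPE ranks");
    println!("{} tokens", tokens.len());
}
```

The `lazy_static` block means the ranks file is parsed at most once per process, which is the point of the singleton variants.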
70 changes: 65 additions & 5 deletions tiktoken-rs/src/tiktoken_ext/openai_public.rs
@@ -1,4 +1,19 @@
pub const STARTOFTEXT: &str = "<|startoftext|>";
pub const ENDOFTEXT: &str = "<|endoftext|>";
pub const RESERVED_200000: &str = "<|reserved_200000|>";
pub const RESERVED_200001: &str = "<|reserved_200001|>";
pub const RETURN: &str = "<|return|>";
pub const CONSTRAIN: &str = "<|constrain|>";
pub const RESERVED_200004: &str = "<|reserved_200004|>";
pub const CHANNEL: &str = "<|channel|>";
pub const START: &str = "<|start|>";
pub const END: &str = "<|end|>";
pub const MESSAGE: &str = "<|message|>";
pub const RESERVED_200009: &str = "<|reserved_200009|>";
pub const RESERVED_200010: &str = "<|reserved_200010|>";
pub const RESERVED_200011: &str = "<|reserved_200011|>";
pub const CALL: &str = "<|call|>";
pub const RESERVED_200013: &str = "<|reserved_200013|>";
pub const FIM_PREFIX: &str = "<|fim_prefix|>";
pub const FIM_MIDDLE: &str = "<|fim_middle|>";
pub const FIM_SUFFIX: &str = "<|fim_suffix|>";
@@ -123,11 +138,7 @@ pub fn cl100k_base() -> Result<CoreBPE> {
pub fn o200k_base() -> Result<CoreBPE> {
let o200k_base = include_str!("../../assets/o200k_base.tiktoken");

let mut encoder: std::collections::HashMap<
Vec<u8>,
Rank,
std::hash::BuildHasherDefault<rustc_hash::FxHasher>,
> = HashMap::default();
let mut encoder = HashMap::default();
for line in o200k_base.lines() {
let mut parts = line.split(' ');
let raw = parts.next().unwrap();
@@ -155,3 +166,52 @@
)?;
Ok(bpe)
}

/// Use for gpt-oss models like `gpt-oss-20b`, `gpt-oss-120b`.
/// Initializes and returns a new instance of the o200k_harmony tokenizer.
pub fn o200k_harmony() -> Result<CoreBPE> {
let o200k_harmony = include_str!("../../assets/o200k_base.tiktoken");

let mut encoder = HashMap::default();
for line in o200k_harmony.lines() {
let mut parts = line.split(' ');
let raw = parts.next().unwrap();
let token = &general_purpose::STANDARD.decode(raw)?;
let rank: Rank = parts.next().unwrap().parse().unwrap();
encoder.insert(token.clone(), rank);
}

let mut special_tokens = HashMap::default();

special_tokens.insert(String::from(STARTOFTEXT), 199998);
special_tokens.insert(String::from(ENDOFTEXT), 199999);
special_tokens.insert(String::from(RESERVED_200000), 200000);
special_tokens.insert(String::from(RESERVED_200001), 200001);
special_tokens.insert(String::from(RETURN), 200002);
special_tokens.insert(String::from(CONSTRAIN), 200003);
special_tokens.insert(String::from(RESERVED_200004), 200004);
special_tokens.insert(String::from(CHANNEL), 200005);
special_tokens.insert(String::from(START), 200006);
special_tokens.insert(String::from(END), 200007);
special_tokens.insert(String::from(MESSAGE), 200008);
special_tokens.insert(String::from(RESERVED_200009), 200009);
special_tokens.insert(String::from(RESERVED_200010), 200010);
special_tokens.insert(String::from(RESERVED_200011), 200011);
special_tokens.insert(String::from(CALL), 200012);
special_tokens.insert(String::from(RESERVED_200013), 200013);

let bpe = CoreBPE::new(
encoder,
special_tokens,
&[
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
"\\p{N}{1,3}",
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
"\\s*[\\r\\n]+",
"\\s+(?!\\S)",
"\\s+",
].join("|"),
)?;
Ok(bpe)
}
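Since `o200k_harmony` reuses the `o200k_base` ranks and only layers on the Harmony special tokens plus a tweaked split pattern, the interesting difference shows up when encoding chat markup. A sketch (the expected ids follow the `special_tokens` map above):

```rust
use tiktoken_rs::o200k_harmony;

fn main() -> anyhow::Result<()> {
    let bpe = o200k_harmony()?;
    let text = "<|start|>user<|message|>Hi<|end|>";
    // <|start|> -> 200006, <|message|> -> 200008, <|end|> -> 200007,
    // with ordinary BPE tokens for "user" and "Hi" in between.
    let tokens = bpe.encode_with_special_tokens(text);
    assert_eq!(bpe.decode(tokens)?, text);
    Ok(())
}
```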
20 changes: 14 additions & 6 deletions tiktoken-rs/src/tokenizer.rs
@@ -20,6 +20,7 @@ use lazy_static::lazy_static;
/// ```
#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)]
pub enum Tokenizer {
O200kHarmony,
O200kBase,
Cl100kBase,
P50kBase,
@@ -29,18 +30,21 @@ pub enum Tokenizer {
}

// Keep this in sync with:
// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L7
// https://github.com/openai/tiktoken/blob/eedc856364506a9d4651645a0290eb0ba81e6935/tiktoken/model.py#L7-L27
const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
("o1-", Tokenizer::O200kBase),
("o3-", Tokenizer::O200kBase),
("o4-", Tokenizer::O200kBase),
// chat
("gpt-5-", Tokenizer::O200kBase),
("gpt-4.5-", Tokenizer::O200kBase),
("gpt-4.1-", Tokenizer::O200kBase),
("chatgpt-4o-", Tokenizer::O200kBase),
("gpt-4o-", Tokenizer::O200kBase),
("gpt-4-", Tokenizer::Cl100kBase),
("gpt-3.5-turbo-", Tokenizer::Cl100kBase),
("gpt-35-turbo-", Tokenizer::Cl100kBase),
("gpt-4o-", Tokenizer::O200kBase), // e.g., gpt-4o-2024-05-13
("gpt-4-", Tokenizer::Cl100kBase), // e.g., gpt-4-0314, etc., plus gpt-4-32k
("gpt-3.5-turbo-", Tokenizer::Cl100kBase), // e.g, gpt-3.5-turbo-0301, -0401, etc.
("gpt-35-turbo-", Tokenizer::Cl100kBase), // Azure deployment name
("gpt-oss-", Tokenizer::O200kHarmony),
// fine-tuned
("ft:gpt-4o", Tokenizer::O200kBase),
("ft:gpt-4", Tokenizer::Cl100kBase),
@@ -50,13 +54,14 @@ const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
];

// Keep this in sync with:
// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22
// https://github.com/openai/tiktoken/blob/eedc856364506a9d4651645a0290eb0ba81e6935/tiktoken/model.py#L29-L84
const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
// reasoning
("o1", Tokenizer::O200kBase),
("o3", Tokenizer::O200kBase),
("o4", Tokenizer::O200kBase),
// chat
("gpt-5", Tokenizer::O200kBase),
("gpt-4.1", Tokenizer::O200kBase),
("chatgpt-4o-latest", Tokenizer::O200kBase),
("gpt-4o", Tokenizer::O200kBase),
@@ -162,6 +167,9 @@ mod tests {

#[test]
fn test_get_tokenizer() {
assert_eq!(get_tokenizer("gpt-5"), Some(Tokenizer::O200kBase));
assert_eq!(get_tokenizer("gpt-oss-20b"), Some(Tokenizer::O200kHarmony));
assert_eq!(get_tokenizer("gpt-oss-120b"), Some(Tokenizer::O200kHarmony));
assert_eq!(
get_tokenizer("chatgpt-4o-latest"),
Some(Tokenizer::O200kBase)
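The mapping can be exercised directly; a sketch using the public `tokenizer` module (the dated model name is hypothetical, just to illustrate prefix matching):

```rust
use tiktoken_rs::tokenizer::{get_tokenizer, Tokenizer};

fn main() {
    // Exact match in MODEL_TO_TOKENIZER.
    assert_eq!(get_tokenizer("gpt-5"), Some(Tokenizer::O200kBase));
    // Prefix match in MODEL_PREFIX_TO_TOKENIZER ("gpt-oss-").
    assert_eq!(get_tokenizer("gpt-oss-20b"), Some(Tokenizer::O200kHarmony));
    // Hypothetical dated variant, resolved via the "gpt-5-" prefix.
    assert_eq!(get_tokenizer("gpt-5-2025-08-07"), Some(Tokenizer::O200kBase));
}
```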
1 change: 1 addition & 0 deletions tiktoken-rs/tests/model.rs
@@ -16,4 +16,5 @@ fn test_finetuned_context_size() {
fn test_o_series_context_size() {
assert_eq!(get_context_size("o3-small"), 200_000);
assert_eq!(get_context_size("o4"), 200_000);
assert_eq!(get_context_size("gpt-5"), 400_000);
}
5 changes: 3 additions & 2 deletions tiktoken-rs/tests/tiktoken.rs
@@ -1,8 +1,8 @@
use rustc_hash::FxHashMap as HashMap;

use tiktoken_rs::{
byte_pair_split, cl100k_base, o200k_base, p50k_base, p50k_base_singleton, r50k_base, CoreBPE,
Rank,
byte_pair_split, cl100k_base, o200k_base, o200k_harmony, p50k_base, p50k_base_singleton,
r50k_base, CoreBPE, Rank,
};

#[test]
@@ -166,4 +166,5 @@ fn test_unicode_roundtrip() {
test_roundtrip(&r50k_base().unwrap(), "我想借几本汉语书");
test_roundtrip(&cl100k_base().unwrap(), "你会说中文吗?");
test_roundtrip(&o200k_base().unwrap(), "ひらがなカタカナ漢字");
test_roundtrip(&o200k_harmony().unwrap(), "ひらがなカタカナ漢字");
}