Skip to content

Commit 44f70dc

Browse files
committed
Add: o200k_harmony tokenizer for gpt-oss.
1 parent 2ebea63 commit 44f70dc

File tree

9 files changed

+115
-18
lines changed

9 files changed

+115
-18
lines changed

tiktoken-rs/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ println!("max_tokens: {}", max_tokens);
105105

106106
| Encoding name | OpenAI models |
107107
| ----------------------- | ------------------------------------------------------------------------- |
108-
| `o200k_base` | GPT-4o models, GPT-4.1, o1, o3, and o4 models |
108+
| `o200k_harmony` | gpt-oss models, `gpt-oss-20b`, `gpt-oss-120b` |
109+
| `o200k_base` | GPT-4o models, GPT-4.1, GPT-5, o1, o3, and o4 models |
109110
| `cl100k_base` | ChatGPT models, `text-embedding-ada-002` |
110111
| `p50k_base` | Code models, `text-davinci-002`, `text-davinci-003` |
111112
| `p50k_edit` | Use for edit models like `text-davinci-edit-001`, `code-davinci-edit-001` |

tiktoken-rs/benches/init.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
#![feature(test)]
22
extern crate test;
33

4-
use tiktoken_rs::{cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base};
4+
use tiktoken_rs::{cl100k_base, o200k_base, o200k_harmony, p50k_base, p50k_edit, r50k_base};
5+
6+
#[bench]
7+
fn bench_init_o200k_harmony(b: &mut test::Bencher) {
8+
b.iter(|| o200k_harmony().unwrap());
9+
}
510

611
#[bench]
712
fn bench_init_o200k_base(b: &mut test::Bencher) {

tiktoken-rs/src/api.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use anyhow::{anyhow, Result};
33
use crate::{
44
cl100k_base,
55
model::get_context_size,
6-
o200k_base, p50k_base, p50k_edit, r50k_base,
6+
o200k_base, o200k_harmony, p50k_base, p50k_edit, r50k_base,
77
tokenizer::{get_tokenizer, Tokenizer},
88
CoreBPE,
99
};
@@ -99,7 +99,10 @@ pub fn num_tokens_from_messages(
9999
) -> Result<usize> {
100100
let tokenizer =
101101
get_tokenizer(model).ok_or_else(|| anyhow!("No tokenizer found for model {}", model))?;
102-
if tokenizer != Tokenizer::Cl100kBase && tokenizer != Tokenizer::O200kBase {
102+
if tokenizer != Tokenizer::Cl100kBase
103+
&& tokenizer != Tokenizer::O200kBase
104+
&& tokenizer != Tokenizer::O200kHarmony
105+
{
103106
anyhow::bail!("Chat completion is only supported chat models")
104107
}
105108
let bpe = get_bpe_from_tokenizer(tokenizer)?;
@@ -255,6 +258,7 @@ pub fn get_bpe_from_model(model: &str) -> Result<CoreBPE> {
255258
/// If successful, the function returns a `Result` containing the `CoreBPE` instance corresponding to the given tokenizer.
256259
pub fn get_bpe_from_tokenizer(tokenizer: Tokenizer) -> Result<CoreBPE> {
257260
match tokenizer {
261+
Tokenizer::O200kHarmony => o200k_harmony(),
258262
Tokenizer::O200kBase => o200k_base(),
259263
Tokenizer::Cl100kBase => cl100k_base(),
260264
Tokenizer::R50kBase => r50k_base(),

tiktoken-rs/src/model.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ pub fn get_context_size(model: &str) -> usize {
3636
let base = rest.split(':').next().unwrap_or(rest);
3737
return get_context_size(base);
3838
}
39+
if starts_with_any!(model, "gpt-5") {
40+
return 400_000;
41+
}
42+
if starts_with_any!(model, "gpt-oss") {
43+
return 131_072;
44+
}
3945
if starts_with_any!(model, "o1", "o3", "o4") {
4046
return 200_000;
4147
}

tiktoken-rs/src/singleton.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use lazy_static::lazy_static;
22

33
use crate::vendor_tiktoken::CoreBPE;
44

5-
use crate::{cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base};
5+
use crate::{cl100k_base, o200k_base, o200k_harmony, p50k_base, p50k_edit, r50k_base};
66

77
/// Returns a singleton instance of the r50k_base tokenizer. (also known as `gpt2`)
88
/// Use for GPT-3 models like `davinci`
@@ -58,3 +58,14 @@ pub fn o200k_base_singleton() -> &'static CoreBPE {
5858
}
5959
&O200K_BASE
6060
}
61+
62+
/// Returns a singleton instance of the o200k_harmony tokenizer.
63+
/// Use for gpt-oss models like `gpt-oss-20b`, `gpt-oss-120b`.
64+
///
65+
/// This function will only initialize the tokenizer once, and then return a reference to the tokenizer
66+
pub fn o200k_harmony_singleton() -> &'static CoreBPE {
67+
lazy_static! {
68+
static ref O200K_HARMONY: CoreBPE = o200k_harmony().unwrap();
69+
}
70+
&O200K_HARMONY
71+
}

tiktoken-rs/src/tiktoken_ext/openai_public.rs

Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,19 @@
1+
pub const STARTOFTEXT: &str = "<|startoftext|>";
12
pub const ENDOFTEXT: &str = "<|endoftext|>";
3+
pub const RESERVED_200000: &str = "<|reserved_200000|>";
4+
pub const RESERVED_200001: &str = "<|reserved_200001|>";
5+
pub const RETURN: &str = "<|return|>";
6+
pub const CONSTRAIN: &str = "<|constrain|>";
7+
pub const RESERVED_200004: &str = "<|reserved_200004|>";
8+
pub const CHANNEL: &str = "<|channel|>";
9+
pub const START: &str = "<|start|>";
10+
pub const END: &str = "<|end|>";
11+
pub const MESSAGE: &str = "<|message|>";
12+
pub const RESERVED_200009: &str = "<|reserved_200009|>";
13+
pub const RESERVED_200010: &str = "<|reserved_200010|>";
14+
pub const RESERVED_200011: &str = "<|reserved_200011|>";
15+
pub const CALL: &str = "<|call|>";
16+
pub const RESERVED_200013: &str = "<|reserved_200013|>";
217
pub const FIM_PREFIX: &str = "<|fim_prefix|>";
318
pub const FIM_MIDDLE: &str = "<|fim_middle|>";
419
pub const FIM_SUFFIX: &str = "<|fim_suffix|>";
@@ -123,11 +138,7 @@ pub fn cl100k_base() -> Result<CoreBPE> {
123138
pub fn o200k_base() -> Result<CoreBPE> {
124139
let o200k_base = include_str!("../../assets/o200k_base.tiktoken");
125140

126-
let mut encoder: std::collections::HashMap<
127-
Vec<u8>,
128-
Rank,
129-
std::hash::BuildHasherDefault<rustc_hash::FxHasher>,
130-
> = HashMap::default();
141+
let mut encoder = HashMap::default();
131142
for line in o200k_base.lines() {
132143
let mut parts = line.split(' ');
133144
let raw = parts.next().unwrap();
@@ -155,3 +166,52 @@ pub fn o200k_base() -> Result<CoreBPE> {
155166
)?;
156167
Ok(bpe)
157168
}
169+
170+
/// Use for gpt-oss models like `gpt-oss-20b`, `gpt-oss-120b`.
171+
/// Initializes and returns a new instance of the o200k_harmony tokenizer.
172+
pub fn o200k_harmony() -> Result<CoreBPE> {
173+
let o200k_harmony = include_str!("../../assets/o200k_base.tiktoken");
174+
175+
let mut encoder = HashMap::default();
176+
for line in o200k_harmony.lines() {
177+
let mut parts = line.split(' ');
178+
let raw = parts.next().unwrap();
179+
let token = &general_purpose::STANDARD.decode(raw)?;
180+
let rank: Rank = parts.next().unwrap().parse().unwrap();
181+
encoder.insert(token.clone(), rank);
182+
}
183+
184+
let mut special_tokens = HashMap::default();
185+
186+
special_tokens.insert(String::from(STARTOFTEXT), 199998);
187+
special_tokens.insert(String::from(ENDOFTEXT), 199999);
188+
special_tokens.insert(String::from(RESERVED_200000), 200000);
189+
special_tokens.insert(String::from(RESERVED_200001), 200001);
190+
special_tokens.insert(String::from(RETURN), 200002);
191+
special_tokens.insert(String::from(CONSTRAIN), 200003);
192+
special_tokens.insert(String::from(RESERVED_200004), 200004);
193+
special_tokens.insert(String::from(CHANNEL), 200005);
194+
special_tokens.insert(String::from(START), 200006);
195+
special_tokens.insert(String::from(END), 200007);
196+
special_tokens.insert(String::from(MESSAGE), 200008);
197+
special_tokens.insert(String::from(RESERVED_200009), 200009);
198+
special_tokens.insert(String::from(RESERVED_200010), 200010);
199+
special_tokens.insert(String::from(RESERVED_200011), 200011);
200+
special_tokens.insert(String::from(CALL), 200012);
201+
special_tokens.insert(String::from(RESERVED_200013), 200013);
202+
203+
let bpe = CoreBPE::new(
204+
encoder,
205+
special_tokens,
206+
&[
207+
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
208+
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
209+
"\\p{N}{1,3}",
210+
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
211+
"\\s*[\\r\\n]+",
212+
"\\s+(?!\\S)",
213+
"\\s+",
214+
].join("|"),
215+
)?;
216+
Ok(bpe)
217+
}

tiktoken-rs/src/tokenizer.rs

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use lazy_static::lazy_static;
2020
/// ```
2121
#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)]
2222
pub enum Tokenizer {
23+
O200kHarmony,
2324
O200kBase,
2425
Cl100kBase,
2526
P50kBase,
@@ -29,18 +30,21 @@ pub enum Tokenizer {
2930
}
3031

3132
// Keep this in sync with:
32-
// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L7
33+
// https://github.com/openai/tiktoken/blob/eedc856364506a9d4651645a0290eb0ba81e6935/tiktoken/model.py#L7-L27
3334
const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
3435
("o1-", Tokenizer::O200kBase),
3536
("o3-", Tokenizer::O200kBase),
3637
("o4-", Tokenizer::O200kBase),
3738
// chat
39+
("gpt-5-", Tokenizer::O200kBase),
40+
("gpt-4.5-", Tokenizer::O200kBase),
3841
("gpt-4.1-", Tokenizer::O200kBase),
3942
("chatgpt-4o-", Tokenizer::O200kBase),
40-
("gpt-4o-", Tokenizer::O200kBase),
41-
("gpt-4-", Tokenizer::Cl100kBase),
42-
("gpt-3.5-turbo-", Tokenizer::Cl100kBase),
43-
("gpt-35-turbo-", Tokenizer::Cl100kBase),
43+
("gpt-4o-", Tokenizer::O200kBase), // e.g., gpt-4o-2024-05-13
44+
("gpt-4-", Tokenizer::Cl100kBase), // e.g., gpt-4-0314, etc., plus gpt-4-32k
45+
("gpt-3.5-turbo-", Tokenizer::Cl100kBase), // e.g., gpt-3.5-turbo-0301, -0401, etc.
46+
("gpt-35-turbo-", Tokenizer::Cl100kBase), // Azure deployment name
47+
("gpt-oss-", Tokenizer::O200kHarmony),
4448
// fine-tuned
4549
("ft:gpt-4o", Tokenizer::O200kBase),
4650
("ft:gpt-4", Tokenizer::Cl100kBase),
@@ -50,13 +54,14 @@ const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
5054
];
5155

5256
// Keep this in sync with:
53-
// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22
57+
// https://github.com/openai/tiktoken/blob/eedc856364506a9d4651645a0290eb0ba81e6935/tiktoken/model.py#L29-L84
5458
const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
5559
// reasoning
5660
("o1", Tokenizer::O200kBase),
5761
("o3", Tokenizer::O200kBase),
5862
("o4", Tokenizer::O200kBase),
5963
// chat
64+
("gpt-5", Tokenizer::O200kBase),
6065
("gpt-4.1", Tokenizer::O200kBase),
6166
("chatgpt-4o-latest", Tokenizer::O200kBase),
6267
("gpt-4o", Tokenizer::O200kBase),
@@ -162,6 +167,9 @@ mod tests {
162167

163168
#[test]
164169
fn test_get_tokenizer() {
170+
assert_eq!(get_tokenizer("gpt-5"), Some(Tokenizer::O200kBase));
171+
assert_eq!(get_tokenizer("gpt-oss-20b"), Some(Tokenizer::O200kHarmony));
172+
assert_eq!(get_tokenizer("gpt-oss-120b"), Some(Tokenizer::O200kHarmony));
165173
assert_eq!(
166174
get_tokenizer("chatgpt-4o-latest"),
167175
Some(Tokenizer::O200kBase)

tiktoken-rs/tests/model.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,5 @@ fn test_finetuned_context_size() {
1616
fn test_o_series_context_size() {
1717
assert_eq!(get_context_size("o3-small"), 200_000);
1818
assert_eq!(get_context_size("o4"), 200_000);
19+
assert_eq!(get_context_size("gpt-5"), 400_000);
1920
}

tiktoken-rs/tests/tiktoken.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
use rustc_hash::FxHashMap as HashMap;
22

33
use tiktoken_rs::{
4-
byte_pair_split, cl100k_base, o200k_base, p50k_base, p50k_base_singleton, r50k_base, CoreBPE,
5-
Rank,
4+
byte_pair_split, cl100k_base, o200k_base, o200k_harmony, p50k_base, p50k_base_singleton,
5+
r50k_base, CoreBPE, Rank,
66
};
77

88
#[test]
@@ -166,4 +166,5 @@ fn test_unicode_roundtrip() {
166166
test_roundtrip(&r50k_base().unwrap(), "我想借几本汉语书");
167167
test_roundtrip(&cl100k_base().unwrap(), "你会说中文吗?");
168168
test_roundtrip(&o200k_base().unwrap(), "ひらがなカタカナ漢字");
169+
test_roundtrip(&o200k_harmony().unwrap(), "ひらがなカタカナ漢字");
169170
}

0 commit comments

Comments
 (0)