Commit 94c43ed

Add: o200k_harmony tokenizer for gpt-oss.
Parent: 2ebea63

8 files changed: +102 additions, -11 deletions

tiktoken-rs/README.md (1 addition, 0 deletions)

@@ -105,6 +105,7 @@ println!("max_tokens: {}", max_tokens);
 
 | Encoding name           | OpenAI models                                                               |
 | ----------------------- | --------------------------------------------------------------------------- |
+| `o200k_harmony`         | gpt-oss models                                                              |
 | `o200k_base`            | GPT-4o models, GPT-4.1, o1, o3, and o4 models                               |
 | `cl100k_base`           | ChatGPT models, `text-embedding-ada-002`                                    |
 | `p50k_base`             | Code models, `text-davinci-002`, `text-davinci-003`                         |
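In practice the new encoding is used like the crate's other per-encoding constructors. A minimal sketch (the sample string is arbitrary):

use tiktoken_rs::o200k_harmony;

fn main() {
    // Build the gpt-oss tokenizer and count the tokens in a plain string.
    let bpe = o200k_harmony().expect("failed to load o200k_harmony");
    let tokens = bpe.encode_ordinary("hello gpt-oss");
    println!("{} tokens", tokens.len());
}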

tiktoken-rs/benches/init.rs (6 additions, 1 deletion)

@@ -1,7 +1,12 @@
 #![feature(test)]
 extern crate test;
 
-use tiktoken_rs::{cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base};
+use tiktoken_rs::{cl100k_base, o200k_base, o200k_harmony, p50k_base, p50k_edit, r50k_base};
+
+#[bench]
+fn bench_init_o200k_harmony(b: &mut test::Bencher) {
+    b.iter(|| o200k_harmony().unwrap());
+}
 
 #[bench]
 fn bench_init_o200k_base(b: &mut test::Bencher) {
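Because this bench harness relies on the unstable `test` feature (`#![feature(test)]`), the benchmarks need a nightly toolchain, e.g. `cargo +nightly bench`.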

tiktoken-rs/src/api.rs (6 additions, 2 deletions)

@@ -3,7 +3,7 @@ use anyhow::{anyhow, Result};
 use crate::{
     cl100k_base,
     model::get_context_size,
-    o200k_base, p50k_base, p50k_edit, r50k_base,
+    o200k_base, o200k_harmony, p50k_base, p50k_edit, r50k_base,
     tokenizer::{get_tokenizer, Tokenizer},
     CoreBPE,
 };
@@ -99,7 +99,10 @@
 ) -> Result<usize> {
     let tokenizer =
         get_tokenizer(model).ok_or_else(|| anyhow!("No tokenizer found for model {}", model))?;
-    if tokenizer != Tokenizer::Cl100kBase && tokenizer != Tokenizer::O200kBase {
+    if tokenizer != Tokenizer::Cl100kBase
+        && tokenizer != Tokenizer::O200kBase
+        && tokenizer != Tokenizer::O200kHarmony
+    {
         anyhow::bail!("Chat completion is only supported chat models")
     }
     let bpe = get_bpe_from_tokenizer(tokenizer)?;
@@ -255,6 +258,7 @@ pub fn get_bpe_from_model(model: &str) -> Result<CoreBPE> {
 /// If successful, the function returns a `Result` containing the `CoreBPE` instance corresponding to the given tokenizer.
 pub fn get_bpe_from_tokenizer(tokenizer: Tokenizer) -> Result<CoreBPE> {
     match tokenizer {
+        Tokenizer::O200kHarmony => o200k_harmony(),
         Tokenizer::O200kBase => o200k_base(),
         Tokenizer::Cl100kBase => cl100k_base(),
         Tokenizer::R50kBase => r50k_base(),
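A minimal sketch of the new match arm in use, assuming `get_bpe_from_tokenizer` is re-exported at the crate root like the crate's other api helpers:

use tiktoken_rs::get_bpe_from_tokenizer;
use tiktoken_rs::tokenizer::Tokenizer;

fn main() {
    // The new arm routes O200kHarmony to the o200k_harmony() constructor.
    let bpe = get_bpe_from_tokenizer(Tokenizer::O200kHarmony).expect("failed to load tokenizer");
    let tokens = bpe.encode_ordinary("hello world");
    println!("{} tokens", tokens.len());
}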

tiktoken-rs/src/model.rs (3 additions, 0 deletions)

@@ -36,6 +36,9 @@ pub fn get_context_size(model: &str) -> usize {
         let base = rest.split(':').next().unwrap_or(rest);
         return get_context_size(base);
     }
+    if starts_with_any!(model, "gpt-oss") {
+        return 131_072;
+    }
     if starts_with_any!(model, "o1", "o3", "o4") {
         return 200_000;
     }
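A hedged sanity check of the new branch, assuming the `model` module is public as the `model::get_context_size` import in api.rs suggests:

use tiktoken_rs::model::get_context_size;

fn main() {
    // gpt-oss models report a 131,072-token context window.
    assert_eq!(get_context_size("gpt-oss-20b"), 131_072);
    assert_eq!(get_context_size("gpt-oss-120b"), 131_072);
}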

tiktoken-rs/src/singleton.rs (12 additions, 1 deletion)

@@ -2,7 +2,7 @@ use lazy_static::lazy_static;
 
 use crate::vendor_tiktoken::CoreBPE;
 
-use crate::{cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base};
+use crate::{cl100k_base, o200k_base, o200k_harmony, p50k_base, p50k_edit, r50k_base};
 
 /// Returns a singleton instance of the r50k_base tokenizer. (also known as `gpt2`)
 /// Use for GPT-3 models like `davinci`
@@ -58,3 +58,14 @@ pub fn o200k_base_singleton() -> &'static CoreBPE {
     }
     &O200K_BASE
 }
+
+/// Returns a singleton instance of the o200k_harmony tokenizer.
+/// Use for gpt-oss models.
+///
+/// This function will only initialize the tokenizer once, and then return a reference to the tokenizer.
+pub fn o200k_harmony_singleton() -> &'static CoreBPE {
+    lazy_static! {
+        static ref O200K_HARMONY: CoreBPE = o200k_harmony().unwrap();
+    }
+    &O200K_HARMONY
+}
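A short usage sketch: repeated calls hand back the same `&'static CoreBPE`, so the ranks file is parsed only once per process:

use tiktoken_rs::o200k_harmony_singleton;

fn main() {
    let bpe = o200k_harmony_singleton();
    let tokens = bpe.encode_ordinary("The quick brown fox");
    println!("{} tokens", tokens.len());
    // A second call reuses the lazily initialized instance.
    assert!(std::ptr::eq(bpe, o200k_harmony_singleton()));
}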

tiktoken-rs/src/tiktoken_ext/openai_public.rs (65 additions, 5 deletions)

@@ -1,4 +1,19 @@
+pub const STARTOFTEXT: &str = "<|startoftext|>";
 pub const ENDOFTEXT: &str = "<|endoftext|>";
+pub const RESERVED_200000: &str = "<|reserved_200000|>";
+pub const RESERVED_200001: &str = "<|reserved_200001|>";
+pub const RETURN: &str = "<|return|>";
+pub const CONSTRAIN: &str = "<|constrain|>";
+pub const RESERVED_200004: &str = "<|reserved_200004|>";
+pub const CHANNEL: &str = "<|channel|>";
+pub const START: &str = "<|start|>";
+pub const END: &str = "<|end|>";
+pub const MESSAGE: &str = "<|message|>";
+pub const RESERVED_200009: &str = "<|reserved_200009|>";
+pub const RESERVED_200010: &str = "<|reserved_200010|>";
+pub const RESERVED_200011: &str = "<|reserved_200011|>";
+pub const CALL: &str = "<|call|>";
+pub const RESERVED_200013: &str = "<|reserved_200013|>";
 pub const FIM_PREFIX: &str = "<|fim_prefix|>";
 pub const FIM_MIDDLE: &str = "<|fim_middle|>";
 pub const FIM_SUFFIX: &str = "<|fim_suffix|>";
@@ -123,11 +138,7 @@ pub fn cl100k_base() -> Result<CoreBPE> {
 pub fn o200k_base() -> Result<CoreBPE> {
     let o200k_base = include_str!("../../assets/o200k_base.tiktoken");
 
-    let mut encoder: std::collections::HashMap<
-        Vec<u8>,
-        Rank,
-        std::hash::BuildHasherDefault<rustc_hash::FxHasher>,
-    > = HashMap::default();
+    let mut encoder = HashMap::default();
     for line in o200k_base.lines() {
         let mut parts = line.split(' ');
         let raw = parts.next().unwrap();
@@ -155,3 +166,52 @@
     )?;
     Ok(bpe)
 }
+
+/// Use for gpt-oss models.
+/// Initializes and returns a new instance of the o200k_harmony tokenizer.
+pub fn o200k_harmony() -> Result<CoreBPE> {
+    let o200k_harmony = include_str!("../../assets/o200k_base.tiktoken");
+
+    let mut encoder = HashMap::default();
+    for line in o200k_harmony.lines() {
+        let mut parts = line.split(' ');
+        let raw = parts.next().unwrap();
+        let token = &general_purpose::STANDARD.decode(raw)?;
+        let rank: Rank = parts.next().unwrap().parse().unwrap();
+        encoder.insert(token.clone(), rank);
+    }
+
+    let mut special_tokens = HashMap::default();
+
+    special_tokens.insert(String::from(STARTOFTEXT), 199998);
+    special_tokens.insert(String::from(ENDOFTEXT), 199999);
+    special_tokens.insert(String::from(RESERVED_200000), 200000);
+    special_tokens.insert(String::from(RESERVED_200001), 200001);
+    special_tokens.insert(String::from(RETURN), 200002);
+    special_tokens.insert(String::from(CONSTRAIN), 200003);
+    special_tokens.insert(String::from(RESERVED_200004), 200004);
+    special_tokens.insert(String::from(CHANNEL), 200005);
+    special_tokens.insert(String::from(START), 200006);
+    special_tokens.insert(String::from(END), 200007);
+    special_tokens.insert(String::from(MESSAGE), 200008);
+    special_tokens.insert(String::from(RESERVED_200009), 200009);
+    special_tokens.insert(String::from(RESERVED_200010), 200010);
+    special_tokens.insert(String::from(RESERVED_200011), 200011);
+    special_tokens.insert(String::from(CALL), 200012);
+    special_tokens.insert(String::from(RESERVED_200013), 200013);
+
+    let bpe = CoreBPE::new(
+        encoder,
+        special_tokens,
+        &[
+            "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+            "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+            "\\p{N}{1,3}",
+            " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
+            "\\s*[\\r\\n]+",
+            "\\s+(?!\\S)",
+            "\\s+",
+        ].join("|"),
+    )?;
+    Ok(bpe)
+}
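The new constants cover the Harmony chat-format markers used by gpt-oss; they only map to their reserved ranks when encoded with special-token handling enabled. A hedged illustration (the message string below is hand-assembled for this example, not output of the official harmony renderer):

use tiktoken_rs::o200k_harmony;

fn main() {
    let bpe = o200k_harmony().expect("failed to load o200k_harmony");
    let text = "<|start|>assistant<|channel|>final<|message|>Hi!<|return|>";
    // Special-token-aware encoding maps each marker to its reserved rank
    // (e.g. <|start|> -> 200006, <|message|> -> 200008, <|return|> -> 200002).
    let with_special = bpe.encode_with_special_tokens(text);
    // Ordinary encoding treats the markers as literal text, yielding more tokens.
    let ordinary = bpe.encode_ordinary(text);
    assert!(with_special.len() < ordinary.len());
}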

tiktoken-rs/src/tokenizer.rs (6 additions, 0 deletions)

@@ -20,6 +20,7 @@ use lazy_static::lazy_static;
 /// ```
 #[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)]
 pub enum Tokenizer {
+    O200kHarmony,
     O200kBase,
     Cl100kBase,
     P50kBase,
@@ -31,6 +32,7 @@ pub enum Tokenizer {
 // Keep this in sync with:
 // https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L7
 const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
+    ("gpt-oss-", Tokenizer::O200kHarmony),
     ("o1-", Tokenizer::O200kBase),
     ("o3-", Tokenizer::O200kBase),
     ("o4-", Tokenizer::O200kBase),
@@ -52,6 +54,8 @@ const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
 // Keep this in sync with:
 // https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22
 const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
+    ("gpt-oss-120b", Tokenizer::O200kHarmony),
+    ("gpt-oss-20b", Tokenizer::O200kHarmony),
     // reasoning
     ("o1", Tokenizer::O200kBase),
     ("o3", Tokenizer::O200kBase),
@@ -162,6 +166,8 @@ mod tests {
 
     #[test]
     fn test_get_tokenizer() {
+        assert_eq!(get_tokenizer("gpt-oss-20b"), Some(Tokenizer::O200kHarmony));
+        assert_eq!(get_tokenizer("gpt-oss-120b"), Some(Tokenizer::O200kHarmony));
         assert_eq!(
             get_tokenizer("chatgpt-4o-latest"),
             Some(Tokenizer::O200kBase)
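Because `MODEL_PREFIX_TO_TOKENIZER` matches on the `gpt-oss-` prefix, any identifier beginning with that string resolves to the Harmony tokenizer, not just the two exact names in `MODEL_TO_TOKENIZER`. A small sketch (the suffixed model name below is hypothetical):

use tiktoken_rs::tokenizer::{get_tokenizer, Tokenizer};

fn main() {
    // Exact match via MODEL_TO_TOKENIZER.
    assert_eq!(get_tokenizer("gpt-oss-20b"), Some(Tokenizer::O200kHarmony));
    // Prefix match via MODEL_PREFIX_TO_TOKENIZER; the date suffix here is made up.
    assert_eq!(
        get_tokenizer("gpt-oss-20b-2025-08-05"),
        Some(Tokenizer::O200kHarmony)
    );
}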

tiktoken-rs/tests/tiktoken.rs (3 additions, 2 deletions)

@@ -1,8 +1,8 @@
 use rustc_hash::FxHashMap as HashMap;
 
 use tiktoken_rs::{
-    byte_pair_split, cl100k_base, o200k_base, p50k_base, p50k_base_singleton, r50k_base, CoreBPE,
-    Rank,
+    byte_pair_split, cl100k_base, o200k_base, o200k_harmony, p50k_base, p50k_base_singleton,
+    r50k_base, CoreBPE, Rank,
 };
 
 #[test]
@@ -166,4 +166,5 @@ fn test_unicode_roundtrip() {
     test_roundtrip(&r50k_base().unwrap(), "我想借几本汉语书");
     test_roundtrip(&cl100k_base().unwrap(), "你会说中文吗?");
     test_roundtrip(&o200k_base().unwrap(), "ひらがなカタカナ漢字");
+    test_roundtrip(&o200k_harmony().unwrap(), "ひらがなカタカナ漢字");
 }
