
Commit 345b0ef

Update with GPT-5 Support
1 parent 2ebea63 commit 345b0ef

6 files changed: +33, -5 lines changed


tiktoken-rs/README.md

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ println!("max_tokens: {}", max_tokens);

| Encoding name | OpenAI models |
| ----------------------- | ------------------------------------------------------------------------- |
-| `o200k_base` | GPT-4o models, GPT-4.1, o1, o3, and o4 models |
+| `o200k_base` | GPT-5, GPT-4.1, GPT-4o, o1, o3, and o4 models |
| `cl100k_base` | ChatGPT models, `text-embedding-ada-002` |
| `p50k_base` | Code models, `text-davinci-002`, `text-davinci-003` |
| `p50k_edit` | Use for edit models like `text-davinci-edit-001`, `code-davinci-edit-001` |
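
As a quick illustration of the updated row (GPT-5 models map to `o200k_base`), here is a minimal sketch in the spirit of the README's `max_tokens` example. It is not part of this commit, and it assumes the crate's existing `get_completion_max_tokens` helper resolves the tokenizer and context size from the model name.

```rust
// Hedged sketch, not part of this diff: resolve max_tokens for a GPT-5 model name.
use tiktoken_rs::get_completion_max_tokens;

fn main() {
    let prompt = "Summarize the release notes in one sentence.";
    // Context window minus prompt tokens, using the encoding resolved for the model name.
    // "gpt-5-mini" matches the new "gpt-5-" prefix added in tokenizer.rs below.
    let max_tokens = get_completion_max_tokens("gpt-5-mini", prompt)
        .expect("model name not recognized");
    println!("max_tokens: {}", max_tokens);
}
```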

tiktoken-rs/src/model.rs

Lines changed: 3 additions & 0 deletions
@@ -39,6 +39,9 @@ pub fn get_context_size(model: &str) -> usize {
    if starts_with_any!(model, "o1", "o3", "o4") {
        return 200_000;
    }
+    if starts_with_any!(model, "gpt-5") {
+        return 400_000;
+    }
    if starts_with_any!(model, "gpt-4.1") {
        return 1_047_576;
    }
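
A hedged sketch of what the new branch means in practice. It is not part of the diff, and it assumes `get_context_size` is reachable under `tiktoken_rs::model` (adjust the path to match the crate's re-exports).

```rust
// Sketch only; import path assumed from how the tests in tests/model.rs use the function.
use tiktoken_rs::model::get_context_size;

fn main() {
    // New gpt-5 branch: 400k-token context window, matched by prefix.
    assert_eq!(get_context_size("gpt-5"), 400_000);
    assert_eq!(get_context_size("gpt-5-nano"), 400_000);
    // Existing branches are untouched.
    assert_eq!(get_context_size("gpt-4.1"), 1_047_576);
    println!("context sizes check out");
}
```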

tiktoken-rs/src/singleton.rs

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ pub fn cl100k_base_singleton() -> &'static CoreBPE {
}

/// Returns a singleton instance of the o200k_base tokenizer.
-/// Use for GPT-4o models and other `o` series models like `o1`, `o3`, and `o4`.
+/// Use for GPT-5, GPT-4.1, GPT-4o, and other `o` series models like `o1`, `o3`, and `o4`.
///
/// This function will only initialize the tokenizer once, and then return a reference the tokenizer
pub fn o200k_base_singleton() -> &'static CoreBPE {
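
For context, a minimal usage sketch of the singleton described by the updated doc comment. It assumes the function is re-exported from the crate root and that `CoreBPE::encode_with_special_tokens` is available; it is not part of this commit.

```rust
// Sketch only: the singleton initializes o200k_base once and returns a &'static CoreBPE,
// so repeated token counting for GPT-5 traffic avoids re-building the tokenizer.
use tiktoken_rs::o200k_base_singleton;

fn main() {
    let bpe = o200k_base_singleton();
    let tokens = bpe.encode_with_special_tokens("Token counting for a gpt-5 style prompt");
    println!("{} tokens", tokens.len());
}
```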

tiktoken-rs/src/tiktoken_ext/openai_public.rs

Lines changed: 1 addition & 1 deletion
@@ -118,7 +118,7 @@ pub fn cl100k_base() -> Result<CoreBPE> {
    Ok(bpe)
}

-/// Use for GPT-4o models and other `o` series models like `o1`, `o3`, and `o4`.
+/// Use for GPT-5, GPT-4.1, GPT-4o, and other `o` series models like `o1`, `o3`, and `o4`.
/// Initializes and returns a new instance of the o200k_base tokenizer.
pub fn o200k_base() -> Result<CoreBPE> {
    let o200k_base = include_str!("../../assets/o200k_base.tiktoken");
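
By contrast with the singleton above, `o200k_base()` builds a fresh `CoreBPE` on every call from the bundled asset. A small round-trip sketch, assuming the crate's `encode_with_special_tokens` and `decode` methods; not part of the diff.

```rust
// Hedged sketch: construct a new o200k_base tokenizer and round-trip a string.
use tiktoken_rs::o200k_base;

fn main() {
    // Builds a new CoreBPE from assets/o200k_base.tiktoken on every call.
    let bpe = o200k_base().expect("failed to initialize o200k_base");
    let tokens = bpe.encode_with_special_tokens("round-trip check");
    let text = bpe.decode(tokens).expect("decode failed");
    assert_eq!(text, "round-trip check");
    println!("{text}");
}
```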

tiktoken-rs/src/tokenizer.rs

Lines changed: 7 additions & 2 deletions
@@ -29,12 +29,13 @@ pub enum Tokenizer {
}

// Keep this in sync with:
-// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L7
+// https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L7
const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
    ("o1-", Tokenizer::O200kBase),
    ("o3-", Tokenizer::O200kBase),
    ("o4-", Tokenizer::O200kBase),
    // chat
+    ("gpt-5-", Tokenizer::O200kBase),
    ("gpt-4.1-", Tokenizer::O200kBase),
    ("chatgpt-4o-", Tokenizer::O200kBase),
    ("gpt-4o-", Tokenizer::O200kBase),
@@ -50,7 +51,7 @@ const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
];

// Keep this in sync with:
-// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22
+// https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L29
const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
    // reasoning
    ("o1", Tokenizer::O200kBase),
@@ -162,6 +163,10 @@ mod tests {

    #[test]
    fn test_get_tokenizer() {
+        assert_eq!(
+            get_tokenizer("gpt-5-mini"),
+            Some(Tokenizer::O200kBase)
+        );
        assert_eq!(
            get_tokenizer("chatgpt-4o-latest"),
            Some(Tokenizer::O200kBase)
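
To show what the new `"gpt-5-"` prefix entry buys, a small sketch mirroring the added test; the `tiktoken_rs::tokenizer` import path is assumed. The entry is a prefix match, so it covers names like `gpt-5-mini` and `gpt-5-nano`.

```rust
// Sketch only; not part of this commit.
use tiktoken_rs::tokenizer::{get_tokenizer, Tokenizer};

fn main() {
    // The "gpt-5-" entry is a prefix match, so sized or dated variants resolve too.
    assert_eq!(get_tokenizer("gpt-5-mini"), Some(Tokenizer::O200kBase));
    assert_eq!(get_tokenizer("gpt-5-nano"), Some(Tokenizer::O200kBase));
    // Unknown names still fall through to None.
    assert_eq!(get_tokenizer("not-a-real-model"), None);
}
```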

tiktoken-rs/tests/model.rs

Lines changed: 20 additions & 0 deletions
@@ -10,10 +10,30 @@ fn test_finetuned_context_size() {
        get_context_size("ft:gpt-4o:custom"),
        get_context_size("gpt-4o")
    );
+    assert_eq!(
+        get_context_size("ft:gpt-5:custom"),
+        get_context_size("gpt-5")
+    );
+    assert_eq!(
+        get_context_size("ft:gpt-4.1:custom"),
+        get_context_size("gpt-4.1")
+    );
}

#[test]
fn test_o_series_context_size() {
    assert_eq!(get_context_size("o3-small"), 200_000);
    assert_eq!(get_context_size("o4"), 200_000);
}
+
+#[test]
+fn test_4_1_series_context_size() {
+    assert_eq!(get_context_size("gpt-4.1"), 1_047_576);
+    assert_eq!(get_context_size("gpt-4.1-mini"), 1_047_576);
+}
+
+#[test]
+fn test_5_series_context_size() {
+    assert_eq!(get_context_size("gpt-5"), 400_000);
+    assert_eq!(get_context_size("gpt-5-nano"), 400_000);
+}
