diff --git a/tiktoken-rs/README.md b/tiktoken-rs/README.md index 162209c..c93074c 100644 --- a/tiktoken-rs/README.md +++ b/tiktoken-rs/README.md @@ -105,7 +105,7 @@ println!("max_tokens: {}", max_tokens); | Encoding name | OpenAI models | | ----------------------- | ------------------------------------------------------------------------- | -| `o200k_base` | GPT-4o models, GPT-4.1, o1, o3, and o4 models | +| `o200k_base` | GPT-5, GPT-4.1, GPT-4o, o1, o3, and o4 models | | `cl100k_base` | ChatGPT models, `text-embedding-ada-002` | | `p50k_base` | Code models, `text-davinci-002`, `text-davinci-003` | | `p50k_edit` | Use for edit models like `text-davinci-edit-001`, `code-davinci-edit-001` | diff --git a/tiktoken-rs/src/model.rs b/tiktoken-rs/src/model.rs index 319b1f0..fbed510 100644 --- a/tiktoken-rs/src/model.rs +++ b/tiktoken-rs/src/model.rs @@ -39,6 +39,9 @@ pub fn get_context_size(model: &str) -> usize { if starts_with_any!(model, "o1", "o3", "o4") { return 200_000; } + if starts_with_any!(model, "gpt-5") { + return 400_000; + } if starts_with_any!(model, "gpt-4.1") { return 1_047_576; } diff --git a/tiktoken-rs/src/singleton.rs b/tiktoken-rs/src/singleton.rs index 266d2fa..a757a81 100644 --- a/tiktoken-rs/src/singleton.rs +++ b/tiktoken-rs/src/singleton.rs @@ -49,7 +49,7 @@ pub fn cl100k_base_singleton() -> &'static CoreBPE { } /// Returns a singleton instance of the o200k_base tokenizer. -/// Use for GPT-4o models and other `o` series models like `o1`, `o3`, and `o4`. +/// Use for GPT-5, GPT-4.1, GPT-4o, and other `o` series models like `o1`, `o3`, and `o4`. /// /// This function will only initialize the tokenizer once, and then return a reference the tokenizer pub fn o200k_base_singleton() -> &'static CoreBPE { diff --git a/tiktoken-rs/src/tiktoken_ext/openai_public.rs b/tiktoken-rs/src/tiktoken_ext/openai_public.rs index d36dd10..f2b5cb6 100644 --- a/tiktoken-rs/src/tiktoken_ext/openai_public.rs +++ b/tiktoken-rs/src/tiktoken_ext/openai_public.rs @@ -118,7 +118,7 @@ pub fn cl100k_base() -> Result { Ok(bpe) } -/// Use for GPT-4o models and other `o` series models like `o1`, `o3`, and `o4`. +/// Use for GPT-5, GPT-4.1, GPT-4o, and other `o` series models like `o1`, `o3`, and `o4`. /// Initializes and returns a new instance of the o200k_base tokenizer. pub fn o200k_base() -> Result { let o200k_base = include_str!("../../assets/o200k_base.tiktoken"); diff --git a/tiktoken-rs/src/tokenizer.rs b/tiktoken-rs/src/tokenizer.rs index 7ab0b19..e03d7ff 100644 --- a/tiktoken-rs/src/tokenizer.rs +++ b/tiktoken-rs/src/tokenizer.rs @@ -29,12 +29,13 @@ pub enum Tokenizer { } // Keep this in sync with: -// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L7 +// https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L7 const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[ ("o1-", Tokenizer::O200kBase), ("o3-", Tokenizer::O200kBase), ("o4-", Tokenizer::O200kBase), // chat + ("gpt-5-", Tokenizer::O200kBase), ("gpt-4.1-", Tokenizer::O200kBase), ("chatgpt-4o-", Tokenizer::O200kBase), ("gpt-4o-", Tokenizer::O200kBase), @@ -50,7 +51,7 @@ const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[ ]; // Keep this in sync with: -// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22 +// https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L29 const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[ // reasoning ("o1", Tokenizer::O200kBase), @@ -162,6 +163,10 @@ mod tests { #[test] fn test_get_tokenizer() { + assert_eq!( + get_tokenizer("gpt-5-mini"), + Some(Tokenizer::O200kBase) + ); assert_eq!( get_tokenizer("chatgpt-4o-latest"), Some(Tokenizer::O200kBase) diff --git a/tiktoken-rs/tests/model.rs b/tiktoken-rs/tests/model.rs index 2cf1217..435e647 100644 --- a/tiktoken-rs/tests/model.rs +++ b/tiktoken-rs/tests/model.rs @@ -10,6 +10,14 @@ fn test_finetuned_context_size() { get_context_size("ft:gpt-4o:custom"), get_context_size("gpt-4o") ); + assert_eq!( + get_context_size("ft:gpt-5:custom"), + get_context_size("gpt-5") + ); + assert_eq!( + get_context_size("ft:gpt-4.1:custom"), + get_context_size("gpt-4.1") + ); } #[test] @@ -17,3 +25,15 @@ fn test_o_series_context_size() { assert_eq!(get_context_size("o3-small"), 200_000); assert_eq!(get_context_size("o4"), 200_000); } + +#[test] +fn test_4_1_series_context_size() { + assert_eq!(get_context_size("gpt-4.1"), 1_047_576); + assert_eq!(get_context_size("gpt-4.1-mini"), 1_047_576); +} + +#[test] +fn test_5_series_context_size() { + assert_eq!(get_context_size("gpt-5"), 400_000); + assert_eq!(get_context_size("gpt-5-nano"), 400_000); +}