
Commit 345b0ef

Update with GPT-5 Support
1 parent 2ebea63 commit 345b0ef

6 files changed: +33, -5 lines changed


tiktoken-rs/README.md

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ println!("max_tokens: {}", max_tokens);

| Encoding name | OpenAI models |
| ----------------------- | ------------------------------------------------------------------------- |
-| `o200k_base` | GPT-4o models, GPT-4.1, o1, o3, and o4 models |
+| `o200k_base` | GPT-5, GPT-4.1, GPT-4o, o1, o3, and o4 models |
| `cl100k_base` | ChatGPT models, `text-embedding-ada-002` |
| `p50k_base` | Code models, `text-davinci-002`, `text-davinci-003` |
| `p50k_edit` | Use for edit models like `text-davinci-edit-001`, `code-davinci-edit-001` |
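
As a quick illustration of the updated row (GPT-5 models map to `o200k_base`), here is a minimal sketch in the spirit of the README's `max_tokens` example. It is not part of this commit, and it assumes the crate's existing `get_completion_max_tokens` helper resolves the tokenizer and context size from the model name.

```rust
// Hedged sketch, not part of this diff: resolve max_tokens for a GPT-5 model name.
use tiktoken_rs::get_completion_max_tokens;

fn main() {
    let prompt = "Summarize the release notes in one sentence.";
    // Context window minus prompt tokens, using the encoding resolved for the model name.
    // "gpt-5-mini" matches the new "gpt-5-" prefix added in tokenizer.rs below.
    let max_tokens = get_completion_max_tokens("gpt-5-mini", prompt)
        .expect("model name not recognized");
    println!("max_tokens: {}", max_tokens);
}
```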

tiktoken-rs/src/model.rs

Lines changed: 3 additions & 0 deletions
@@ -39,6 +39,9 @@ pub fn get_context_size(model: &str) -> usize {
    if starts_with_any!(model, "o1", "o3", "o4") {
        return 200_000;
    }
+    if starts_with_any!(model, "gpt-5") {
+        return 400_000;
+    }
    if starts_with_any!(model, "gpt-4.1") {
        return 1_047_576;
    }
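
A hedged sketch of what the new branch means in practice. It is not part of the diff, and it assumes `get_context_size` is reachable under `tiktoken_rs::model` (adjust the path to match the crate's re-exports).

```rust
// Sketch only; import path assumed from how the tests in tests/model.rs use the function.
use tiktoken_rs::model::get_context_size;

fn main() {
    // New gpt-5 branch: 400k-token context window, matched by prefix.
    assert_eq!(get_context_size("gpt-5"), 400_000);
    assert_eq!(get_context_size("gpt-5-nano"), 400_000);
    // Existing branches are untouched.
    assert_eq!(get_context_size("gpt-4.1"), 1_047_576);
    println!("context sizes check out");
}
```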

tiktoken-rs/src/singleton.rs

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ pub fn cl100k_base_singleton() -> &'static CoreBPE {
}

/// Returns a singleton instance of the o200k_base tokenizer.
-/// Use for GPT-4o models and other `o` series models like `o1`, `o3`, and `o4`.
+/// Use for GPT-5, GPT-4.1, GPT-4o, and other `o` series models like `o1`, `o3`, and `o4`.
///
/// This function will only initialize the tokenizer once, and then return a reference the tokenizer
pub fn o200k_base_singleton() -> &'static CoreBPE {
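
For context, a minimal usage sketch of the singleton described by the updated doc comment. It assumes the function is re-exported from the crate root and that `CoreBPE::encode_with_special_tokens` is available; it is not part of this commit.

```rust
// Sketch only: the singleton initializes o200k_base once and returns a &'static CoreBPE,
// so repeated token counting for GPT-5 traffic avoids re-building the tokenizer.
use tiktoken_rs::o200k_base_singleton;

fn main() {
    let bpe = o200k_base_singleton();
    let tokens = bpe.encode_with_special_tokens("Token counting for a gpt-5 style prompt");
    println!("{} tokens", tokens.len());
}
```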

tiktoken-rs/src/tiktoken_ext/openai_public.rs

Lines changed: 1 addition & 1 deletion
@@ -118,7 +118,7 @@ pub fn cl100k_base() -> Result<CoreBPE> {
    Ok(bpe)
}

-/// Use for GPT-4o models and other `o` series models like `o1`, `o3`, and `o4`.
+/// Use for GPT-5, GPT-4.1, GPT-4o, and other `o` series models like `o1`, `o3`, and `o4`.
/// Initializes and returns a new instance of the o200k_base tokenizer.
pub fn o200k_base() -> Result<CoreBPE> {
    let o200k_base = include_str!("../../assets/o200k_base.tiktoken");
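
By contrast with the singleton above, `o200k_base()` builds a fresh `CoreBPE` on every call from the bundled asset. A small round-trip sketch, assuming the crate's `encode_with_special_tokens` and `decode` methods; not part of the diff.

```rust
// Hedged sketch: construct a new o200k_base tokenizer and round-trip a string.
use tiktoken_rs::o200k_base;

fn main() {
    // Builds a new CoreBPE from assets/o200k_base.tiktoken on every call.
    let bpe = o200k_base().expect("failed to initialize o200k_base");
    let tokens = bpe.encode_with_special_tokens("round-trip check");
    let text = bpe.decode(tokens).expect("decode failed");
    assert_eq!(text, "round-trip check");
    println!("{text}");
}
```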

tiktoken-rs/src/tokenizer.rs

Lines changed: 7 additions & 2 deletions
@@ -29,12 +29,13 @@ pub enum Tokenizer {
}

// Keep this in sync with:
-// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L7
+// https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L7
const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
    ("o1-", Tokenizer::O200kBase),
    ("o3-", Tokenizer::O200kBase),
    ("o4-", Tokenizer::O200kBase),
    // chat
+    ("gpt-5-", Tokenizer::O200kBase),
    ("gpt-4.1-", Tokenizer::O200kBase),
    ("chatgpt-4o-", Tokenizer::O200kBase),
    ("gpt-4o-", Tokenizer::O200kBase),
@@ -50,7 +51,7 @@ const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
];

// Keep this in sync with:
-// https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22
+// https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L29
const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
    // reasoning
    ("o1", Tokenizer::O200kBase),
@@ -162,6 +163,10 @@ mod tests {

    #[test]
    fn test_get_tokenizer() {
+        assert_eq!(
+            get_tokenizer("gpt-5-mini"),
+            Some(Tokenizer::O200kBase)
+        );
        assert_eq!(
            get_tokenizer("chatgpt-4o-latest"),
            Some(Tokenizer::O200kBase)
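
To show what the new `"gpt-5-"` prefix entry buys, a small sketch mirroring the added test; the `tiktoken_rs::tokenizer` import path is assumed. The entry is a prefix match, so it covers names like `gpt-5-mini` and `gpt-5-nano`.

```rust
// Sketch only; not part of this commit.
use tiktoken_rs::tokenizer::{get_tokenizer, Tokenizer};

fn main() {
    // The "gpt-5-" entry is a prefix match, so sized or dated variants resolve too.
    assert_eq!(get_tokenizer("gpt-5-mini"), Some(Tokenizer::O200kBase));
    assert_eq!(get_tokenizer("gpt-5-nano"), Some(Tokenizer::O200kBase));
    // Unknown names still fall through to None.
    assert_eq!(get_tokenizer("not-a-real-model"), None);
}
```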

tiktoken-rs/tests/model.rs

Lines changed: 20 additions & 0 deletions
@@ -10,10 +10,30 @@ fn test_finetuned_context_size() {
        get_context_size("ft:gpt-4o:custom"),
        get_context_size("gpt-4o")
    );
+    assert_eq!(
+        get_context_size("ft:gpt-5:custom"),
+        get_context_size("gpt-5")
+    );
+    assert_eq!(
+        get_context_size("ft:gpt-4.1:custom"),
+        get_context_size("gpt-4.1")
+    );
}

#[test]
fn test_o_series_context_size() {
    assert_eq!(get_context_size("o3-small"), 200_000);
    assert_eq!(get_context_size("o4"), 200_000);
}
+
+#[test]
+fn test_4_1_series_context_size() {
+    assert_eq!(get_context_size("gpt-4.1"), 1_047_576);
+    assert_eq!(get_context_size("gpt-4.1-mini"), 1_047_576);
+}
+
+#[test]
+fn test_5_series_context_size() {
+    assert_eq!(get_context_size("gpt-5"), 400_000);
+    assert_eq!(get_context_size("gpt-5-nano"), 400_000);
+}
