Skip to content

Commit 3b1dc52

Browse files
HavenDV and claude
committed
feat: add message token counting, encoding factory, and expanded model support (#31)
- Add ChatMessage type and Encoder.CountMessageTokens() for OpenAI chat message token counting using the official formula (3 tokens/message overhead + 3 reply priming) - Add TikTokenEncoder.CreateForEncoding() for string-based encoding lookup - Add ModelToEncoding.ForEncoding()/TryForEncoding() methods - Expand Models constants: O3, O3Mini, O1, O1Mini, Gpt4oMini, Gpt4Turbo - Bundle p50k/r50k encodings in the Tiktoken meta-package - Add 10 new tests (50 total) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d84c36f commit 3b1dc52

File tree

7 files changed

+271
-1
lines changed

7 files changed

+271
-1
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
namespace Tiktoken;

/// <summary>
/// Represents a chat message for token counting purposes.
/// Compatible with OpenAI's message format.
/// </summary>
public class ChatMessage
{
    /// <summary>
    /// The role of the message sender (e.g., "system", "user", "assistant").
    /// </summary>
    public string Role { get; set; } = string.Empty;

    /// <summary>
    /// The text content of the message.
    /// </summary>
    public string Content { get; set; } = string.Empty;

    /// <summary>
    /// Optional name of the sender. When present, adds 1 extra token.
    /// </summary>
    public string? Name { get; set; }

    /// <summary>
    /// Creates an empty chat message (role and content default to empty strings).
    /// </summary>
    public ChatMessage()
    {
    }

    /// <summary>
    /// Creates a chat message with the specified role and content.
    /// </summary>
    /// <param name="role">Sender role; must not be null.</param>
    /// <param name="content">Message text; must not be null.</param>
    /// <param name="name">Optional sender name.</param>
    /// <exception cref="ArgumentNullException">When <paramref name="role"/> or <paramref name="content"/> is null.</exception>
    public ChatMessage(string role, string content, string? name = null)
    {
        if (role is null)
        {
            throw new ArgumentNullException(nameof(role));
        }

        if (content is null)
        {
            throw new ArgumentNullException(nameof(content));
        }

        Role = role;
        Content = content;
        Name = name;
    }
}

src/libs/Tiktoken.Core/Encoder.cs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,49 @@ public IReadOnlyCollection<int> EncodeWithDisallowedSpecial(
207207
disallowedSpecial: [..disallowedSpecial]);
208208
}
209209

210+
/// <summary>
/// Counts total tokens for a list of chat messages using OpenAI's token counting formula.
/// Each message adds <paramref name="tokensPerMessage"/> overhead tokens (default 3).
/// If a message has a <see cref="ChatMessage.Name"/>, <paramref name="tokensPerName"/> extra tokens are added (default 1).
/// An additional 3 tokens are added at the end for reply priming.
/// </summary>
/// <remarks>
/// Based on the official OpenAI token counting cookbook:
/// https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
/// <para>
/// The default values (tokensPerMessage=3, tokensPerName=1) are correct for
/// gpt-4o, gpt-4, gpt-3.5-turbo, and all newer models.
/// </para>
/// </remarks>
/// <param name="messages">The chat messages to count tokens for.</param>
/// <param name="tokensPerMessage">Overhead tokens added per message (default: 3).</param>
/// <param name="tokensPerName">Extra tokens when a message has a name (default: 1).</param>
/// <returns>The total token count including message overhead and reply priming.</returns>
/// <exception cref="ArgumentNullException">When <paramref name="messages"/> is null.</exception>
public int CountMessageTokens(
    IReadOnlyList<ChatMessage> messages,
    int tokensPerMessage = 3,
    int tokensPerName = 1)
{
    if (messages is null)
    {
        throw new ArgumentNullException(nameof(messages));
    }

    var total = 0;
    foreach (var message in messages)
    {
        // Per-message overhead plus the tokenized role and content.
        total += tokensPerMessage + CountTokens(message.Role) + CountTokens(message.Content);

        if (message.Name is not null)
        {
            total += CountTokens(message.Name) + tokensPerName;
        }
    }

    // Every reply is primed with <|start|>assistant<|message|>.
    return total + 3;
}
252+
210253
/// <summary>
211254
///
212255
/// </summary>

src/libs/Tiktoken/ModelToEncoding.cs

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ public static class ModelToEncoding
1313

1414
private static Dictionary<string, Lazy<Encoding>> Dictionary { get; } = new()
1515
{
16+
// o-series reasoning models
17+
{ "o3", O200K },
18+
{ "o1", O200K },
19+
1620
// chat
1721
{ "gpt-4o", O200K },
1822
{ "gpt-4", Cl100K },
@@ -27,9 +31,9 @@ public static class ModelToEncoding
2731

2832
/// <summary>
2933
/// Returns encoding by model name or null.
34+
/// Uses prefix matching (e.g., "gpt-4o-mini" matches "gpt-4o").
3035
/// </summary>
3136
/// <param name="modelName">gpt-4 gpt-3.5-turbo ...</param>
32-
/// <exception cref="ArgumentException"></exception>
3337
/// <returns></returns>
3438
public static Encoding? TryFor(string modelName)
3539
{
@@ -41,6 +45,7 @@ public static class ModelToEncoding
4145

4246
/// <summary>
4347
/// Returns encoding by model name or throws exception.
48+
/// Uses prefix matching (e.g., "gpt-4o-mini" matches "gpt-4o").
4449
/// </summary>
4550
/// <param name="modelName">gpt-4 gpt-3.5-turbo ...</param>
4651
/// <exception cref="ArgumentException"></exception>
@@ -50,4 +55,34 @@ public static Encoding For(string modelName)
5055
return TryFor(modelName) ??
5156
throw new ArgumentException($"Model name {modelName} is not supported.");
5257
}
58+
59+
/// <summary>
/// Returns encoding by encoding name (e.g., "cl100k_base", "o200k_base").
/// </summary>
/// <param name="encodingName">cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base</param>
/// <returns>The matching <see cref="Encoding"/>.</returns>
/// <exception cref="ArgumentException">Thrown when the encoding name is not supported.</exception>
public static Encoding ForEncoding(string encodingName)
{
    var encoding = TryForEncoding(encodingName);
    if (encoding is null)
    {
        throw new ArgumentException($"Encoding name {encodingName} is not supported.");
    }

    return encoding;
}
70+
71+
/// <summary>
/// Returns encoding by encoding name or null when the name is unknown.
/// </summary>
/// <param name="encodingName">cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base</param>
/// <returns>The matching <see cref="Encoding"/>, or null if not supported.</returns>
public static Encoding? TryForEncoding(string encodingName)
{
    // cl100k/o200k reuse the shared lazy instances; the 50k-family
    // encodings are constructed on demand.
    switch (encodingName)
    {
        case "cl100k_base":
            return Cl100K.Value;
        case "o200k_base":
            return O200K.Value;
        case "p50k_base":
            return new P50KBase();
        case "p50k_edit":
            return new P50KEdit();
        case "r50k_base":
            return new R50KBase();
        default:
            return null;
    }
}
5388
}

src/libs/Tiktoken/Models.cs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,46 @@ namespace Tiktoken;
55
/// </summary>
66
public static class Models
77
{
8+
// o-series reasoning models (o200k_base)

/// <summary>o3 (uses o200k_base encoding).</summary>
public const string O3 = "o3";

/// <summary>o3-mini (uses o200k_base encoding).</summary>
public const string O3Mini = "o3-mini";

/// <summary>o1 (uses o200k_base encoding).</summary>
public const string O1 = "o1";

/// <summary>o1-mini (uses o200k_base encoding).</summary>
public const string O1Mini = "o1-mini";

// GPT-4o family (o200k_base)

/// <summary>GPT-4o (uses o200k_base encoding).</summary>
public const string Gpt4o = "gpt-4o";

/// <summary>GPT-4o mini (uses o200k_base encoding).</summary>
public const string Gpt4oMini = "gpt-4o-mini";

// GPT-4 family (cl100k_base)

/// <summary>GPT-4 Turbo (uses cl100k_base encoding).</summary>
public const string Gpt4Turbo = "gpt-4-turbo";

/// <summary>GPT-4 (uses cl100k_base encoding).</summary>
public const string Gpt4 = "gpt-4";

// GPT-3.5 family (cl100k_base)

/// <summary>GPT-3.5 Turbo (uses cl100k_base encoding).</summary>
public const string Gpt35Turbo = "gpt-3.5-turbo";

/// <summary>GPT-3.5 Turbo Azure deployment name (uses cl100k_base encoding).</summary>
public const string Gpt35TurboAzure = "gpt-35-turbo";

// Embeddings (cl100k_base)

/// <summary>Text Embedding Ada 002 (uses cl100k_base encoding).</summary>
public const string TextEmbeddingAda002 = "text-embedding-ada-002";
2250

src/libs/Tiktoken/TikTokenEncoder.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,15 @@ public static Encoder CreateForModel(string modelName)
3333
{
3434
return ModelToEncoder.TryFor(modelName);
3535
}
36+
37+
/// <summary>
/// Creates an encoder for the specified encoding name (e.g., "cl100k_base", "o200k_base").
/// </summary>
/// <param name="encodingName">Encoding name (cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base).</param>
/// <returns>An <see cref="Encoder"/> instance backed by the requested encoding.</returns>
/// <exception cref="ArgumentException">Thrown when the encoding name is not supported.</exception>
public static Encoder CreateForEncoding(string encodingName) =>
    new(ModelToEncoding.ForEncoding(encodingName));
3647
}

src/libs/Tiktoken/Tiktoken.csproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
<ProjectReference Include="..\Tiktoken.Core\Tiktoken.Core.csproj" />
99
<ProjectReference Include="..\Tiktoken.Encodings.cl100k\Tiktoken.Encodings.cl100k.csproj" />
1010
<ProjectReference Include="..\Tiktoken.Encodings.o200k\Tiktoken.Encodings.o200k.csproj" />
11+
<ProjectReference Include="..\Tiktoken.Encodings.p50k\Tiktoken.Encodings.p50k.csproj" />
12+
<ProjectReference Include="..\Tiktoken.Encodings.r50k\Tiktoken.Encodings.r50k.csproj" />
1113
<ProjectReference Include="..\Tiktoken.Encodings.Tokenizer\Tiktoken.Encodings.Tokenizer.csproj" />
1214
</ItemGroup>
1315

src/tests/Tiktoken.UnitTests/Tests.cs

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,4 +367,115 @@ public void TokenizerJsonDetectsSequenceSplitPattern()
367367
var decoded = encoder.Decode(encoded);
368368
decoded.Should().Be("hello world");
369369
}
370+
371+
[TestMethod]
public void CreateForEncodingCl100K()
{
    // "hello world" is exactly two cl100k_base tokens.
    var tokenCount = TikTokenEncoder.CreateForEncoding("cl100k_base")
        .Encode("hello world")
        .Count;

    tokenCount.Should().Be(2);
}

[TestMethod]
public void CreateForEncodingO200K()
{
    // "hello world" is exactly two o200k_base tokens.
    var tokenCount = TikTokenEncoder.CreateForEncoding("o200k_base")
        .Encode("hello world")
        .Count;

    tokenCount.Should().Be(2);
}

[TestMethod]
public void CreateForEncodingP50K()
{
    var p50kEncoder = TikTokenEncoder.CreateForEncoding("p50k_base");

    p50kEncoder.Encode("hello world").Count.Should().BeGreaterThan(0);
}

[TestMethod]
public void CreateForEncodingR50K()
{
    var r50kEncoder = TikTokenEncoder.CreateForEncoding("r50k_base");

    r50kEncoder.Encode("hello world").Count.Should().BeGreaterThan(0);
}

[TestMethod]
public void CreateForEncodingThrowsOnUnknown()
{
    Action act = () => TikTokenEncoder.CreateForEncoding("unknown_encoding");

    act.Should().Throw<ArgumentException>();
}
409+
410+
[TestMethod]
411+
public void ModelPrefixMatchingO3Mini()
412+
{
413+
var encoder = TikTokenEncoder.CreateForModel(Models.O3Mini);
414+
encoder.Should().NotBeNull();
415+
encoder.Encode("hello").Count.Should().BeGreaterThan(0);
416+
}
417+
418+
[TestMethod]
419+
public void ModelPrefixMatchingGpt4Turbo()
420+
{
421+
var encoder = TikTokenEncoder.CreateForModel(Models.Gpt4Turbo);
422+
encoder.Should().NotBeNull();
423+
encoder.Encode("hello").Count.Should().BeGreaterThan(0);
424+
}
425+
426+
[TestMethod]
public void CountMessageTokensBasic()
{
    var encoder = ModelToEncoder.For("gpt-4o");
    List<ChatMessage> messages =
    [
        new("system", "You are a helpful assistant."),
        new("user", "hello world"),
    ];

    var total = encoder.CountMessageTokens(messages);

    // Each message: 3 overhead + role tokens + content tokens
    // "system" = 1 token, "You are a helpful assistant." = 6 tokens → 3 + 1 + 6 = 10
    // "user" = 1 token, "hello world" = 2 tokens → 3 + 1 + 2 = 6
    // Reply priming: 3
    // Total: 10 + 6 + 3 = 19
    total.Should().Be(19);
}

[TestMethod]
public void CountMessageTokensWithName()
{
    var encoder = ModelToEncoder.For("gpt-4o");

    List<ChatMessage> named =
    [
        new("system", "You are a helpful assistant.", name: "helper"),
    ];
    List<ChatMessage> unnamed =
    [
        new("system", "You are a helpful assistant."),
    ];

    var countWithName = encoder.CountMessageTokens(named);
    var countWithoutName = encoder.CountMessageTokens(unnamed);

    // Name adds: CountTokens("helper") + 1
    // "helper" = 1 token, so name adds 2
    countWithName.Should().Be(countWithoutName + 2);
}

[TestMethod]
public void CountMessageTokensEmpty()
{
    var encoder = ModelToEncoder.For("gpt-4o");

    var total = encoder.CountMessageTokens([]);

    // No messages: only the 3 reply-priming tokens remain.
    total.Should().Be(3);
}
370481
}

0 commit comments

Comments
 (0)