Skip to content

Commit 3b1dc52

Browse files
HavenDV and claude
committed
feat: add message token counting, encoding factory, and expanded model support (#31)
- Add ChatMessage type and Encoder.CountMessageTokens() for OpenAI chat message token counting using the official formula (3 tokens/message overhead + 3 reply priming) - Add TikTokenEncoder.CreateForEncoding() for string-based encoding lookup - Add ModelToEncoding.ForEncoding()/TryForEncoding() methods - Expand Models constants: O3, O3Mini, O1, O1Mini, Gpt4oMini, Gpt4Turbo - Bundle p50k/r50k encodings in the Tiktoken meta-package - Add 10 new tests (50 total) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d84c36f commit 3b1dc52

File tree

7 files changed

+271
-1
lines changed

7 files changed

+271
-1
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
namespace Tiktoken;

/// <summary>
/// Represents a chat message for token counting purposes.
/// Compatible with OpenAI's message format.
/// </summary>
public class ChatMessage
{
    /// <summary>
    /// The role of the message sender (e.g., "system", "user", "assistant").
    /// </summary>
    public string Role { get; set; } = string.Empty;

    /// <summary>
    /// The text content of the message.
    /// </summary>
    public string Content { get; set; } = string.Empty;

    /// <summary>
    /// Optional name of the sender. When present, adds 1 extra token.
    /// </summary>
    public string? Name { get; set; }

    /// <summary>
    /// Creates an empty chat message (role and content default to empty strings).
    /// </summary>
    public ChatMessage()
    {
    }

    /// <summary>
    /// Creates a chat message with the specified role and content.
    /// </summary>
    /// <param name="role">Sender role; must not be null.</param>
    /// <param name="content">Message text; must not be null.</param>
    /// <param name="name">Optional sender name.</param>
    /// <exception cref="ArgumentNullException">When <paramref name="role"/> or <paramref name="content"/> is null.</exception>
    public ChatMessage(string role, string content, string? name = null)
    {
        if (role is null)
        {
            throw new ArgumentNullException(nameof(role));
        }

        if (content is null)
        {
            throw new ArgumentNullException(nameof(content));
        }

        Role = role;
        Content = content;
        Name = name;
    }
}

src/libs/Tiktoken.Core/Encoder.cs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,49 @@ public IReadOnlyCollection<int> EncodeWithDisallowedSpecial(
207207
disallowedSpecial: [..disallowedSpecial]);
208208
}
209209

210+
/// <summary>
/// Counts total tokens for a list of chat messages using OpenAI's token counting formula.
/// Each message adds <paramref name="tokensPerMessage"/> overhead tokens (default 3).
/// If a message has a <see cref="ChatMessage.Name"/>, <paramref name="tokensPerName"/> extra tokens are added (default 1).
/// An additional 3 tokens are added at the end for reply priming.
/// </summary>
/// <remarks>
/// Based on the official OpenAI token counting cookbook:
/// https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
/// <para>
/// The default values (tokensPerMessage=3, tokensPerName=1) are correct for
/// gpt-4o, gpt-4, gpt-3.5-turbo, and all newer models.
/// </para>
/// </remarks>
/// <param name="messages">The chat messages to count tokens for.</param>
/// <param name="tokensPerMessage">Overhead tokens added per message (default: 3).</param>
/// <param name="tokensPerName">Extra tokens when a message has a name (default: 1).</param>
/// <returns>The total token count including message overhead and reply priming.</returns>
/// <exception cref="ArgumentNullException">When <paramref name="messages"/> is null.</exception>
public int CountMessageTokens(
    IReadOnlyList<ChatMessage> messages,
    int tokensPerMessage = 3,
    int tokensPerName = 1)
{
    if (messages is null)
    {
        throw new ArgumentNullException(nameof(messages));
    }

    var total = 0;
    foreach (var message in messages)
    {
        // Per-message overhead plus the tokenized role and content.
        total += tokensPerMessage + CountTokens(message.Role) + CountTokens(message.Content);

        if (message.Name is not null)
        {
            total += CountTokens(message.Name) + tokensPerName;
        }
    }

    // Every reply is primed with <|start|>assistant<|message|>.
    return total + 3;
}
252+
210253
/// <summary>
211254
///
212255
/// </summary>

src/libs/Tiktoken/ModelToEncoding.cs

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ public static class ModelToEncoding
1313

1414
private static Dictionary<string, Lazy<Encoding>> Dictionary { get; } = new()
1515
{
16+
// o-series reasoning models
17+
{ "o3", O200K },
18+
{ "o1", O200K },
19+
1620
// chat
1721
{ "gpt-4o", O200K },
1822
{ "gpt-4", Cl100K },
@@ -27,9 +31,9 @@ public static class ModelToEncoding
2731

2832
/// <summary>
2933
/// Returns encoding by model name or null.
34+
/// Uses prefix matching (e.g., "gpt-4o-mini" matches "gpt-4o").
3035
/// </summary>
3136
/// <param name="modelName">gpt-4 gpt-3.5-turbo ...</param>
32-
/// <exception cref="ArgumentException"></exception>
3337
/// <returns></returns>
3438
public static Encoding? TryFor(string modelName)
3539
{
@@ -41,6 +45,7 @@ public static class ModelToEncoding
4145

4246
/// <summary>
4347
/// Returns encoding by model name or throws exception.
48+
/// Uses prefix matching (e.g., "gpt-4o-mini" matches "gpt-4o").
4449
/// </summary>
4550
/// <param name="modelName">gpt-4 gpt-3.5-turbo ...</param>
4651
/// <exception cref="ArgumentException"></exception>
@@ -50,4 +55,34 @@ public static Encoding For(string modelName)
5055
return TryFor(modelName) ??
5156
throw new ArgumentException($"Model name {modelName} is not supported.");
5257
}
58+
59+
/// <summary>
/// Returns encoding by encoding name (e.g., "cl100k_base", "o200k_base").
/// </summary>
/// <param name="encodingName">cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base</param>
/// <returns>The matching <see cref="Encoding"/>.</returns>
/// <exception cref="ArgumentException">Thrown when the encoding name is not supported.</exception>
public static Encoding ForEncoding(string encodingName)
{
    var encoding = TryForEncoding(encodingName);
    if (encoding is null)
    {
        throw new ArgumentException($"Encoding name {encodingName} is not supported.");
    }

    return encoding;
}
70+
71+
/// <summary>
/// Returns encoding by encoding name or null when the name is unknown.
/// </summary>
/// <param name="encodingName">cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base</param>
/// <returns>The matching <see cref="Encoding"/>, or null if not supported.</returns>
public static Encoding? TryForEncoding(string encodingName)
{
    // cl100k/o200k reuse the shared lazy instances; the 50k-family
    // encodings are constructed on demand.
    switch (encodingName)
    {
        case "cl100k_base":
            return Cl100K.Value;
        case "o200k_base":
            return O200K.Value;
        case "p50k_base":
            return new P50KBase();
        case "p50k_edit":
            return new P50KEdit();
        case "r50k_base":
            return new R50KBase();
        default:
            return null;
    }
}
5388
}

src/libs/Tiktoken/Models.cs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,46 @@ namespace Tiktoken;
55
/// </summary>
66
public static class Models
77
{
8+
// o-series reasoning models (o200k_base)

/// <summary>o3 (uses o200k_base encoding).</summary>
public const string O3 = "o3";

/// <summary>o3-mini (uses o200k_base encoding).</summary>
public const string O3Mini = "o3-mini";

/// <summary>o1 (uses o200k_base encoding).</summary>
public const string O1 = "o1";

/// <summary>o1-mini (uses o200k_base encoding).</summary>
public const string O1Mini = "o1-mini";

// GPT-4o family (o200k_base)

/// <summary>GPT-4o (uses o200k_base encoding).</summary>
public const string Gpt4o = "gpt-4o";

/// <summary>GPT-4o mini (uses o200k_base encoding).</summary>
public const string Gpt4oMini = "gpt-4o-mini";

// GPT-4 family (cl100k_base)

/// <summary>GPT-4 Turbo (uses cl100k_base encoding).</summary>
public const string Gpt4Turbo = "gpt-4-turbo";

/// <summary>GPT-4 (uses cl100k_base encoding).</summary>
public const string Gpt4 = "gpt-4";

// GPT-3.5 family (cl100k_base)

/// <summary>GPT-3.5 Turbo (uses cl100k_base encoding).</summary>
public const string Gpt35Turbo = "gpt-3.5-turbo";

/// <summary>GPT-3.5 Turbo Azure deployment name (uses cl100k_base encoding).</summary>
public const string Gpt35TurboAzure = "gpt-35-turbo";

// Embeddings (cl100k_base)

/// <summary>Text Embedding Ada 002 (uses cl100k_base encoding).</summary>
public const string TextEmbeddingAda002 = "text-embedding-ada-002";
2250

src/libs/Tiktoken/TikTokenEncoder.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,15 @@ public static Encoder CreateForModel(string modelName)
3333
{
3434
return ModelToEncoder.TryFor(modelName);
3535
}
36+
37+
/// <summary>
/// Creates an encoder for the specified encoding name (e.g., "cl100k_base", "o200k_base").
/// </summary>
/// <param name="encodingName">Encoding name (cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base).</param>
/// <returns>An <see cref="Encoder"/> instance backed by the requested encoding.</returns>
/// <exception cref="ArgumentException">Thrown when the encoding name is not supported.</exception>
public static Encoder CreateForEncoding(string encodingName) =>
    new(ModelToEncoding.ForEncoding(encodingName));
3647
}

src/libs/Tiktoken/Tiktoken.csproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
<ProjectReference Include="..\Tiktoken.Core\Tiktoken.Core.csproj" />
99
<ProjectReference Include="..\Tiktoken.Encodings.cl100k\Tiktoken.Encodings.cl100k.csproj" />
1010
<ProjectReference Include="..\Tiktoken.Encodings.o200k\Tiktoken.Encodings.o200k.csproj" />
11+
<ProjectReference Include="..\Tiktoken.Encodings.p50k\Tiktoken.Encodings.p50k.csproj" />
12+
<ProjectReference Include="..\Tiktoken.Encodings.r50k\Tiktoken.Encodings.r50k.csproj" />
1113
<ProjectReference Include="..\Tiktoken.Encodings.Tokenizer\Tiktoken.Encodings.Tokenizer.csproj" />
1214
</ItemGroup>
1315

src/tests/Tiktoken.UnitTests/Tests.cs

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,4 +367,115 @@ public void TokenizerJsonDetectsSequenceSplitPattern()
367367
var decoded = encoder.Decode(encoded);
368368
decoded.Should().Be("hello world");
369369
}
370+
371+
[TestMethod]
public void CreateForEncodingCl100K()
{
    // "hello world" is exactly two cl100k_base tokens.
    var tokenCount = TikTokenEncoder.CreateForEncoding("cl100k_base")
        .Encode("hello world")
        .Count;

    tokenCount.Should().Be(2);
}

[TestMethod]
public void CreateForEncodingO200K()
{
    // "hello world" is exactly two o200k_base tokens.
    var tokenCount = TikTokenEncoder.CreateForEncoding("o200k_base")
        .Encode("hello world")
        .Count;

    tokenCount.Should().Be(2);
}

[TestMethod]
public void CreateForEncodingP50K()
{
    var p50kEncoder = TikTokenEncoder.CreateForEncoding("p50k_base");

    p50kEncoder.Encode("hello world").Count.Should().BeGreaterThan(0);
}

[TestMethod]
public void CreateForEncodingR50K()
{
    var r50kEncoder = TikTokenEncoder.CreateForEncoding("r50k_base");

    r50kEncoder.Encode("hello world").Count.Should().BeGreaterThan(0);
}

[TestMethod]
public void CreateForEncodingThrowsOnUnknown()
{
    Action act = () => TikTokenEncoder.CreateForEncoding("unknown_encoding");

    act.Should().Throw<ArgumentException>();
}
409+
410+
[TestMethod]
411+
public void ModelPrefixMatchingO3Mini()
412+
{
413+
var encoder = TikTokenEncoder.CreateForModel(Models.O3Mini);
414+
encoder.Should().NotBeNull();
415+
encoder.Encode("hello").Count.Should().BeGreaterThan(0);
416+
}
417+
418+
[TestMethod]
419+
public void ModelPrefixMatchingGpt4Turbo()
420+
{
421+
var encoder = TikTokenEncoder.CreateForModel(Models.Gpt4Turbo);
422+
encoder.Should().NotBeNull();
423+
encoder.Encode("hello").Count.Should().BeGreaterThan(0);
424+
}
425+
426+
[TestMethod]
public void CountMessageTokensBasic()
{
    var encoder = ModelToEncoder.For("gpt-4o");
    List<ChatMessage> messages =
    [
        new("system", "You are a helpful assistant."),
        new("user", "hello world"),
    ];

    var total = encoder.CountMessageTokens(messages);

    // Each message: 3 overhead + role tokens + content tokens
    // "system" = 1 token, "You are a helpful assistant." = 6 tokens → 3 + 1 + 6 = 10
    // "user" = 1 token, "hello world" = 2 tokens → 3 + 1 + 2 = 6
    // Reply priming: 3
    // Total: 10 + 6 + 3 = 19
    total.Should().Be(19);
}

[TestMethod]
public void CountMessageTokensWithName()
{
    var encoder = ModelToEncoder.For("gpt-4o");

    List<ChatMessage> named =
    [
        new("system", "You are a helpful assistant.", name: "helper"),
    ];
    List<ChatMessage> unnamed =
    [
        new("system", "You are a helpful assistant."),
    ];

    var countWithName = encoder.CountMessageTokens(named);
    var countWithoutName = encoder.CountMessageTokens(unnamed);

    // Name adds: CountTokens("helper") + 1
    // "helper" = 1 token, so name adds 2
    countWithName.Should().Be(countWithoutName + 2);
}

[TestMethod]
public void CountMessageTokensEmpty()
{
    var encoder = ModelToEncoder.For("gpt-4o");

    var total = encoder.CountMessageTokens([]);

    // No messages: only the 3 reply-priming tokens remain.
    total.Should().Be(3);
}
370481
}

0 commit comments

Comments
 (0)