Skip to content

Commit c95cbe3

Browse files
HavenDV and claude committed
feat: add Tiktoken.Encodings.Tokenizer package for loading HuggingFace tokenizer.json files
Adds a new package that converts HuggingFace tokenizer.json files to Tiktoken encodings, enabling tokenization for GPT-2, Llama 3, Qwen2, DeepSeek, and other BPE-based models. Supports the ByteLevel, Split, and Sequence[Split, ByteLevel] pre-tokenizer types with auto-detected regex patterns. Includes the FromFile, FromJson, FromStream, and FromUrlAsync APIs.

Closes #40

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6d5b313 commit c95cbe3

File tree

11 files changed

+945
-254
lines changed

11 files changed

+945
-254
lines changed

README.md

Lines changed: 168 additions & 127 deletions
Large diffs are not rendered by default.

Tiktoken.slnx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
<Project Path="src/libs/Tiktoken.Encodings.p50k/Tiktoken.Encodings.p50k.csproj" />
1212
<Project Path="src/libs/Tiktoken.Encodings.r50k/Tiktoken.Encodings.r50k.csproj" />
1313
<Project Path="src/libs/Tiktoken/Tiktoken.csproj" />
14+
<Project Path="src/libs/Tiktoken.Encodings.Tokenizer/Tiktoken.Encodings.Tokenizer.csproj" />
1415
</Folder>
1516
<Folder Name="/misc/">
1617
<File Path=".gitattributes" />

benchmarks/2.2.0.0_encode.md

Lines changed: 136 additions & 127 deletions
Large diffs are not rendered by default.

src/Directory.Packages.props

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
2222
</PackageVersion>
2323
<PackageVersion Include="SharpToken" Version="2.0.4" />
24+
<PackageVersion Include="System.Text.Json" Version="9.0.5" />
2425
<PackageVersion Include="System.ValueTuple" Version="4.6.2" />
2526
<PackageVersion Include="TiktokenSharp" Version="1.2.1" />
2627
<PackageVersion Include="Verify.MSTest" Version="31.13.2" />
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
using System.Text.Json.Serialization;
2+
3+
namespace Tiktoken.Encodings;
4+
5+
/// <summary>
/// System.Text.Json source-generation context for this assembly.
/// The <see cref="JsonSerializableAttribute"/> below registers <see cref="TokenizerJson"/>
/// so that its serialization metadata is produced at compile time; the class is
/// <c>partial</c> because the generator supplies the remaining members.
/// </summary>
[JsonSerializable(typeof(TokenizerJson))]
6+
internal partial class SourceGenerationContext : JsonSerializerContext;
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<TargetFrameworks>net4.6.2;netstandard2.0;netstandard2.1;net8.0;net9.0;net10.0</TargetFrameworks>
5+
<RootNamespace>Tiktoken.Encodings</RootNamespace>
6+
</PropertyGroup>
7+
8+
<PropertyGroup Label="NuGet">
9+
<Description>Load tokenizer.json (HuggingFace format) as a Tiktoken encoding.</Description>
10+
<PackageTags>chatgpt;openai;tiktoken;tokens;huggingface;tokenizer;bpe;llama;qwen;deepseek;gpt2</PackageTags>
11+
</PropertyGroup>
12+
13+
<ItemGroup>
14+
<ProjectReference Include="..\Tiktoken.Encodings.Abstractions\Tiktoken.Encodings.Abstractions.csproj" />
15+
</ItemGroup>
16+
17+
<ItemGroup>
18+
<PackageReference Include="System.Text.Json" />
19+
</ItemGroup>
20+
21+
<ItemGroup Condition="'$(TargetFramework)' == 'net4.6.2'">
22+
<PackageReference Include="System.ValueTuple" />
23+
</ItemGroup>
24+
25+
<ItemGroup>
26+
<PackageReference Include="PolySharp">
27+
<PrivateAssets>all</PrivateAssets>
28+
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
29+
</PackageReference>
30+
</ItemGroup>
31+
32+
</Project>
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
using System.Text.Json.Serialization;
2+
3+
namespace Tiktoken.Encodings;
4+
5+
/// <summary>
6+
/// Represents a HuggingFace tokenizer.json file.
/// Each property is bound to a top-level JSON key through <see cref="JsonPropertyNameAttribute"/>.
7+
/// </summary>
8+
public class TokenizerJson
9+
{
10+
/// <summary>
/// Value of the <c>version</c> key. Defaults to an empty string.
11+
/// </summary>
12+
[JsonPropertyName("version")]
13+
public string Version { get; set; } = string.Empty;
14+
15+
/// <summary>
/// Entries of the <c>added_tokens</c> key. Defaults to an empty list.
16+
/// </summary>
17+
[JsonPropertyName("added_tokens")]
18+
public IReadOnlyList<TokenizerAddedToken> AddedTokens { get; set; } = [];
19+
20+
/// <summary>
/// Object stored under the <c>pre_tokenizer</c> key. <c>null</c> until a value is assigned.
21+
/// </summary>
22+
[JsonPropertyName("pre_tokenizer")]
23+
public TokenizerPreTokenizer? PreTokenizer { get; set; }
24+
25+
/// <summary>
/// Object stored under the <c>model</c> key. <c>null</c> until a value is assigned.
26+
/// </summary>
27+
[JsonPropertyName("model")]
28+
public TokenizerModel? Model { get; set; }
29+
}
30+
31+
/// <summary>
32+
/// A token added to the tokenizer vocabulary.
33+
/// </summary>
34+
public class TokenizerAddedToken
35+
{
36+
/// <summary>
/// Integer read from the <c>id</c> key (presumably the token's vocabulary id; confirm against consumers).
37+
/// </summary>
38+
[JsonPropertyName("id")]
39+
public int Id { get; set; }
40+
41+
/// <summary>
/// Flag read from the <c>special</c> key (presumably marks a special token; confirm against consumers).
42+
/// </summary>
43+
[JsonPropertyName("special")]
44+
public bool Special { get; set; }
45+
46+
/// <summary>
/// Text read from the <c>content</c> key. Defaults to an empty string.
47+
/// </summary>
48+
[JsonPropertyName("content")]
49+
public string Content { get; set; } = string.Empty;
50+
}
51+
52+
/// <summary>
53+
/// The pre-tokenizer configuration. Supports ByteLevel, Split, Sequence, and other types.
/// The type is self-referential: <see cref="PreTokenizers"/> holds nested instances of this class.
54+
/// </summary>
55+
public class TokenizerPreTokenizer
56+
{
57+
/// <summary>
58+
/// The pre-tokenizer type (ByteLevel, Split, Sequence, Metaspace, etc.).
/// Read from the <c>type</c> key. Defaults to an empty string.
59+
/// </summary>
60+
[JsonPropertyName("type")]
61+
public string Type { get; set; } = string.Empty;
62+
63+
/// <summary>
64+
/// For Split type: the pattern to split on.
/// Read from the <c>pattern</c> key. <c>null</c> until a value is assigned.
65+
/// </summary>
66+
[JsonPropertyName("pattern")]
67+
public TokenizerSplitPattern? Pattern { get; set; }
68+
69+
/// <summary>
70+
/// For Sequence type: the list of nested pre-tokenizers.
/// Read from the <c>pretokenizers</c> key. <c>null</c> until a value is assigned.
71+
/// </summary>
72+
[JsonPropertyName("pretokenizers")]
73+
public IReadOnlyList<TokenizerPreTokenizer>? PreTokenizers { get; set; }
74+
}
75+
76+
/// <summary>
77+
/// A split pattern — either a Regex or a String.
/// Both properties are nullable; nothing in this class enforces that exactly one is set.
/// Unlike the other models in this file, the JSON keys here are capitalized.
78+
/// </summary>
79+
public class TokenizerSplitPattern
80+
{
81+
/// <summary>
82+
/// Regex pattern variant.
/// Read from the <c>Regex</c> key. <c>null</c> until a value is assigned.
83+
/// </summary>
84+
[JsonPropertyName("Regex")]
85+
public string? Regex { get; set; }
86+
87+
/// <summary>
88+
/// String pattern variant.
/// Read from the <c>String</c> key. <c>null</c> until a value is assigned.
89+
/// </summary>
90+
[JsonPropertyName("String")]
91+
public string? String { get; set; }
92+
}
93+
94+
/// <summary>
95+
/// The BPE model definition.
96+
/// </summary>
97+
public class TokenizerModel
98+
{
99+
/// <summary>
/// Model type name read from the <c>type</c> key. Defaults to an empty string.
100+
/// </summary>
101+
[JsonPropertyName("type")]
102+
public string Type { get; set; } = string.Empty;
103+
104+
/// <summary>
/// Mapping read from the <c>vocab</c> key: string keys to integer values
/// (presumably token text to token id; confirm against consumers).
/// <c>null</c> until a value is assigned.
105+
/// </summary>
106+
[JsonPropertyName("vocab")]
// CA2227 ("collection properties should be read only") is suppressed so this
// collection keeps a public setter, presumably so the deserializer can assign it.
107+
#pragma warning disable CA2227
108+
public Dictionary<string, int>? Vocab { get; set; }
109+
#pragma warning restore CA2227
110+
111+
/// <summary>
/// Strings read from the <c>merges</c> key. <c>null</c> until a value is assigned.
112+
/// </summary>
// NOTE(review): every element is typed as a string. Some tokenizer.json files may
// store each merge as a two-element array instead; confirm whether that shape
// needs to be supported, since it would not bind to this property type.
113+
[JsonPropertyName("merges")]
114+
public IReadOnlyList<string>? Merges { get; set; }
115+
}

0 commit comments

Comments
 (0)