Skip to content

Commit 6413a0a

Browse files
committed
benchmark: Added Microsoft.ML.Tokenizers.
1 parent 4b1c5db commit 6413a0a

File tree

6 files changed

+107
-40
lines changed

6 files changed

+107
-40
lines changed

README.md

Lines changed: 41 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -33,45 +33,51 @@ You can view the reports for each version [here](benchmarks)
3333
<!--BENCHMARKS_START-->
3434
```
3535
36-
BenchmarkDotNet v0.13.12, macOS Sonoma 14.4.1 (23E224) [Darwin 23.4.0]
36+
BenchmarkDotNet v0.14.0, macOS Sequoia 15.1 (24B83) [Darwin 24.1.0]
3737
Apple M1 Pro, 1 CPU, 10 logical and 10 physical cores
38-
.NET SDK 8.0.204
39-
[Host] : .NET 8.0.4 (8.0.424.16909), Arm64 RyuJIT AdvSIMD
40-
DefaultJob : .NET 8.0.4 (8.0.424.16909), Arm64 RyuJIT AdvSIMD
38+
.NET SDK 9.0.100
39+
[Host] : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
40+
DefaultJob : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
4141
4242
4343
```
44-
| Method | Categories | Data | Mean | Median | Ratio | Gen0 | Gen1 | Gen2 | Allocated | Alloc Ratio |
45-
|--------------------------- |------------ |-------------------- |-------------:|-------------:|------:|---------:|--------:|-------:|----------:|------------:|
46-
| **SharpTokenV2_0_1_** | **CountTokens** | **1. (...)57. [19866]** | **632,817.1 ns** | **632,257.2 ns** | **1.00** | **2.9297** | **-** | **-** | **20115 B** | **1.00** |
47-
| TiktokenSharpV1_0_9_ | CountTokens | 1. (...)57. [19866] | 463,840.3 ns | 458,851.3 ns | 0.74 | 64.4531 | 3.4180 | - | 404649 B | 20.12 |
48-
| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 801,796.0 ns | 806,271.8 ns | 1.27 | 247.0703 | 98.6328 | 0.9766 | 1547675 B | 76.94 |
49-
| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 319,697.2 ns | 319,475.1 ns | 0.50 | 49.3164 | - | - | 309449 B | 15.38 |
50-
| | | | | | | | | | | |
51-
| **SharpTokenV2_0_1_** | **CountTokens** | **Hello, World!** | **478.1 ns** | **478.1 ns** | **1.00** | **0.0401** | **-** | **-** | **256 B** | **1.00** |
52-
| TiktokenSharpV1_0_9_ | CountTokens | Hello, World! | 275.2 ns | 275.1 ns | 0.58 | 0.0505 | - | - | 320 B | 1.25 |
53-
| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 498.1 ns | 497.4 ns | 1.04 | 0.2356 | - | - | 1480 B | 5.78 |
54-
| Tiktoken_ | CountTokens | Hello, World! | 212.9 ns | 212.8 ns | 0.45 | 0.0420 | - | - | 264 B | 1.03 |
55-
| | | | | | | | | | | |
56-
| **SharpTokenV2_0_1_** | **CountTokens** | **King(...)edy. [275]** | **6,652.5 ns** | **6,651.9 ns** | **1.00** | **0.0763** | **-** | **-** | **520 B** | **1.00** |
57-
| TiktokenSharpV1_0_9_ | CountTokens | King(...)edy. [275] | 4,774.2 ns | 4,781.1 ns | 0.72 | 0.8011 | - | - | 5064 B | 9.74 |
58-
| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,261.6 ns | 7,241.6 ns | 1.09 | 3.0899 | 0.1450 | 0.0076 | 19344 B | 37.20 |
59-
| Tiktoken_ | CountTokens | King(...)edy. [275] | 3,216.1 ns | 3,189.9 ns | 0.49 | 0.6447 | - | - | 4064 B | 7.82 |
60-
| | | | | | | | | | | |
61-
| **SharpTokenV2_0_1_Encode** | **Encode** | **1. (...)57. [19866]** | **613,700.9 ns** | **612,821.4 ns** | **1.00** | **2.9297** | **-** | **-** | **20115 B** | **1.00** |
62-
| TiktokenSharpV1_0_9_Encode | Encode | 1. (...)57. [19866] | 444,436.3 ns | 444,298.4 ns | 0.72 | 64.4531 | 3.4180 | - | 404649 B | 20.12 |
63-
| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 773,882.5 ns | 774,314.3 ns | 1.26 | 246.0938 | 85.9375 | - | 1547673 B | 76.94 |
64-
| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 335,482.3 ns | 333,936.4 ns | 0.55 | 59.5703 | 2.4414 | - | 375601 B | 18.67 |
65-
| | | | | | | | | | | |
66-
| **SharpTokenV2_0_1_Encode** | **Encode** | **Hello, World!** | **443.7 ns** | **436.8 ns** | **1.00** | **0.0405** | **-** | **-** | **256 B** | **1.00** |
67-
| TiktokenSharpV1_0_9_Encode | Encode | Hello, World! | 300.4 ns | 299.4 ns | 0.67 | 0.0505 | - | - | 320 B | 1.25 |
68-
| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 504.7 ns | 498.5 ns | 1.15 | 0.2356 | 0.0010 | - | 1480 B | 5.78 |
69-
| Tiktoken_Encode | Encode | Hello, World! | 262.4 ns | 262.6 ns | 0.58 | 0.1030 | - | - | 648 B | 2.53 |
70-
| | | | | | | | | | | |
71-
| **SharpTokenV2_0_1_Encode** | **Encode** | **King(...)edy. [275]** | **6,784.3 ns** | **6,714.1 ns** | **1.00** | **0.0763** | **-** | **-** | **520 B** | **1.00** |
72-
| TiktokenSharpV1_0_9_Encode | Encode | King(...)edy. [275] | 4,691.2 ns | 4,690.7 ns | 0.69 | 0.8011 | - | - | 5064 B | 9.74 |
73-
| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,287.9 ns | 7,290.9 ns | 1.08 | 3.0823 | 0.1373 | - | 19344 B | 37.20 |
74-
| Tiktoken_Encode | Encode | King(...)edy. [275] | 3,606.2 ns | 3,607.4 ns | 0.53 | 0.7973 | - | - | 5024 B | 9.66 |
44+
| Method | Categories | Data | Mean | Ratio | Gen0 | Gen1 | Allocated | Alloc Ratio |
45+
|---------------------------------- |------------ |-------------------- |-------------:|------:|---------:|--------:|----------:|------------:|
46+
| **SharpTokenV2_0_3_** | **CountTokens** | **1. (...)57. [19866]** | **567,130.0 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
47+
| TiktokenSharpV1_1_5_ | CountTokens | 1. (...)57. [19866] | 483,976.7 ns | 0.85 | 64.4531 | 5.8594 | 404648 B | 20.12 |
48+
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | 1. (...)57. [19866] | 427,733.2 ns | 0.75 | - | - | 297 B | 0.01 |
49+
| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 773,467.5 ns | 1.36 | 246.0938 | 83.9844 | 1547675 B | 76.94 |
50+
| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 271,564.3 ns | 0.48 | 23.4375 | - | 148313 B | 7.37 |
51+
| | | | | | | | | |
52+
| **SharpTokenV2_0_3_** | **CountTokens** | **Hello, World!** | **380.0 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
53+
| TiktokenSharpV1_1_5_ | CountTokens | Hello, World! | 263.8 ns | 0.69 | 0.0505 | - | 320 B | 1.25 |
54+
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | Hello, World! | 305.7 ns | 0.80 | 0.0153 | - | 96 B | 0.38 |
55+
| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 509.6 ns | 1.34 | 0.2356 | 0.0010 | 1480 B | 5.78 |
56+
| Tiktoken_ | CountTokens | Hello, World! | 175.7 ns | 0.46 | 0.0191 | - | 120 B | 0.47 |
57+
| | | | | | | | | |
58+
| **SharpTokenV2_0_3_** | **CountTokens** | **King(...)edy. [275]** | **5,990.7 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
59+
| TiktokenSharpV1_1_5_ | CountTokens | King(...)edy. [275] | 4,516.5 ns | 0.75 | 0.8011 | - | 5064 B | 9.74 |
60+
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | King(...)edy. [275] | 3,871.2 ns | 0.65 | 0.0153 | - | 96 B | 0.18 |
61+
| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,465.8 ns | 1.25 | 3.0823 | 0.1373 | 19344 B | 37.20 |
62+
| Tiktoken_ | CountTokens | King(...)edy. [275] | 2,744.5 ns | 0.46 | 0.3128 | - | 1976 B | 3.80 |
63+
| | | | | | | | | |
64+
| **SharpTokenV2_0_3_Encode** | **Encode** | **1. (...)57. [19866]** | **568,150.3 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
65+
| TiktokenSharpV1_1_5_Encode | Encode | 1. (...)57. [19866] | 444,972.1 ns | 0.78 | 64.4531 | 5.8594 | 404649 B | 20.12 |
66+
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | 1. (...)57. [19866] | 410,970.9 ns | 0.72 | 10.2539 | 0.4883 | 66137 B | 3.29 |
67+
| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 770,068.9 ns | 1.36 | 246.0938 | 90.8203 | 1547675 B | 76.94 |
68+
| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 290,030.9 ns | 0.51 | 33.6914 | 1.4648 | 214465 B | 10.66 |
69+
| | | | | | | | | |
70+
| **SharpTokenV2_0_3_Encode** | **Encode** | **Hello, World!** | **381.2 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
71+
| TiktokenSharpV1_1_5_Encode | Encode | Hello, World! | 260.2 ns | 0.68 | 0.0505 | - | 320 B | 1.25 |
72+
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | Hello, World! | 325.1 ns | 0.85 | 0.0267 | - | 168 B | 0.66 |
73+
| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 511.6 ns | 1.34 | 0.2356 | - | 1480 B | 5.78 |
74+
| Tiktoken_Encode | Encode | Hello, World! | 241.4 ns | 0.63 | 0.0801 | - | 504 B | 1.97 |
75+
| | | | | | | | | |
76+
| **SharpTokenV2_0_3_Encode** | **Encode** | **King(...)edy. [275]** | **5,957.3 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
77+
| TiktokenSharpV1_1_5_Encode | Encode | King(...)edy. [275] | 4,523.8 ns | 0.76 | 0.8011 | - | 5064 B | 9.74 |
78+
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | King(...)edy. [275] | 4,069.8 ns | 0.68 | 0.1144 | - | 744 B | 1.43 |
79+
| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,207.8 ns | 1.21 | 3.0823 | 0.1373 | 19344 B | 37.20 |
80+
| Tiktoken_Encode | Encode | King(...)edy. [275] | 2,945.7 ns | 0.49 | 0.4654 | - | 2936 B | 5.65 |
7581

7682
<!--BENCHMARKS_END-->
7783

benchmarks/2.2.0.0_encode.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
```
2+
3+
BenchmarkDotNet v0.14.0, macOS Sequoia 15.1 (24B83) [Darwin 24.1.0]
4+
Apple M1 Pro, 1 CPU, 10 logical and 10 physical cores
5+
.NET SDK 9.0.100
6+
[Host] : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
7+
DefaultJob : .NET 9.0.0 (9.0.24.52809), Arm64 RyuJIT AdvSIMD
8+
9+
10+
```
11+
| Method | Categories | Data | Mean | Ratio | Gen0 | Gen1 | Allocated | Alloc Ratio |
12+
|---------------------------------- |------------ |-------------------- |-------------:|------:|---------:|--------:|----------:|------------:|
13+
| **SharpTokenV2_0_3_** | **CountTokens** | **1. (...)57. [19866]** | **567,130.0 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
14+
| TiktokenSharpV1_1_5_ | CountTokens | 1. (...)57. [19866] | 483,976.7 ns | 0.85 | 64.4531 | 5.8594 | 404648 B | 20.12 |
15+
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | 1. (...)57. [19866] | 427,733.2 ns | 0.75 | - | - | 297 B | 0.01 |
16+
| TokenizerLibV1_3_3_ | CountTokens | 1. (...)57. [19866] | 773,467.5 ns | 1.36 | 246.0938 | 83.9844 | 1547675 B | 76.94 |
17+
| Tiktoken_ | CountTokens | 1. (...)57. [19866] | 271,564.3 ns | 0.48 | 23.4375 | - | 148313 B | 7.37 |
18+
| | | | | | | | | |
19+
| **SharpTokenV2_0_3_** | **CountTokens** | **Hello, World!** | **380.0 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
20+
| TiktokenSharpV1_1_5_ | CountTokens | Hello, World! | 263.8 ns | 0.69 | 0.0505 | - | 320 B | 1.25 |
21+
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | Hello, World! | 305.7 ns | 0.80 | 0.0153 | - | 96 B | 0.38 |
22+
| TokenizerLibV1_3_3_ | CountTokens | Hello, World! | 509.6 ns | 1.34 | 0.2356 | 0.0010 | 1480 B | 5.78 |
23+
| Tiktoken_ | CountTokens | Hello, World! | 175.7 ns | 0.46 | 0.0191 | - | 120 B | 0.47 |
24+
| | | | | | | | | |
25+
| **SharpTokenV2_0_3_** | **CountTokens** | **King(...)edy. [275]** | **5,990.7 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
26+
| TiktokenSharpV1_1_5_ | CountTokens | King(...)edy. [275] | 4,516.5 ns | 0.75 | 0.8011 | - | 5064 B | 9.74 |
27+
| MicrosoftMLTokenizerV1_0_0_ | CountTokens | King(...)edy. [275] | 3,871.2 ns | 0.65 | 0.0153 | - | 96 B | 0.18 |
28+
| TokenizerLibV1_3_3_ | CountTokens | King(...)edy. [275] | 7,465.8 ns | 1.25 | 3.0823 | 0.1373 | 19344 B | 37.20 |
29+
| Tiktoken_ | CountTokens | King(...)edy. [275] | 2,744.5 ns | 0.46 | 0.3128 | - | 1976 B | 3.80 |
30+
| | | | | | | | | |
31+
| **SharpTokenV2_0_3_Encode** | **Encode** | **1. (...)57. [19866]** | **568,150.3 ns** | **1.00** | **2.9297** | **-** | **20115 B** | **1.00** |
32+
| TiktokenSharpV1_1_5_Encode | Encode | 1. (...)57. [19866] | 444,972.1 ns | 0.78 | 64.4531 | 5.8594 | 404649 B | 20.12 |
33+
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | 1. (...)57. [19866] | 410,970.9 ns | 0.72 | 10.2539 | 0.4883 | 66137 B | 3.29 |
34+
| TokenizerLibV1_3_3_Encode | Encode | 1. (...)57. [19866] | 770,068.9 ns | 1.36 | 246.0938 | 90.8203 | 1547675 B | 76.94 |
35+
| Tiktoken_Encode | Encode | 1. (...)57. [19866] | 290,030.9 ns | 0.51 | 33.6914 | 1.4648 | 214465 B | 10.66 |
36+
| | | | | | | | | |
37+
| **SharpTokenV2_0_3_Encode** | **Encode** | **Hello, World!** | **381.2 ns** | **1.00** | **0.0405** | **-** | **256 B** | **1.00** |
38+
| TiktokenSharpV1_1_5_Encode | Encode | Hello, World! | 260.2 ns | 0.68 | 0.0505 | - | 320 B | 1.25 |
39+
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | Hello, World! | 325.1 ns | 0.85 | 0.0267 | - | 168 B | 0.66 |
40+
| TokenizerLibV1_3_3_Encode | Encode | Hello, World! | 511.6 ns | 1.34 | 0.2356 | - | 1480 B | 5.78 |
41+
| Tiktoken_Encode | Encode | Hello, World! | 241.4 ns | 0.63 | 0.0801 | - | 504 B | 1.97 |
42+
| | | | | | | | | |
43+
| **SharpTokenV2_0_3_Encode** | **Encode** | **King(...)edy. [275]** | **5,957.3 ns** | **1.00** | **0.0763** | **-** | **520 B** | **1.00** |
44+
| TiktokenSharpV1_1_5_Encode | Encode | King(...)edy. [275] | 4,523.8 ns | 0.76 | 0.8011 | - | 5064 B | 9.74 |
45+
| MicrosoftMLTokenizerV1_0_0_Encode | Encode | King(...)edy. [275] | 4,069.8 ns | 0.68 | 0.1144 | - | 744 B | 1.43 |
46+
| TokenizerLibV1_3_3_Encode | Encode | King(...)edy. [275] | 7,207.8 ns | 1.21 | 3.0823 | 0.1373 | 19344 B | 37.20 |
47+
| Tiktoken_Encode | Encode | King(...)edy. [275] | 2,945.7 ns | 0.49 | 0.4654 | - | 2936 B | 5.65 |

src/Directory.Packages.props

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
1313
</PackageVersion>
1414
<PackageVersion Include="Microsoft.DeepDev.TokenizerLib" Version="1.3.3" />
15+
<PackageVersion Include="Microsoft.ML.Tokenizers" Version="1.0.0" />
16+
<PackageVersion Include="Microsoft.ML.Tokenizers.Data.Cl100kBase" Version="1.0.0" />
1517
<PackageVersion Include="MSTest" Version="3.6.3" />
1618
<PackageVersion Include="PolySharp" Version="1.14.1">
1719
<PrivateAssets>all</PrivateAssets>

src/benchmarks/Tiktoken.Benchmarks/Benchmarks.cs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
using BenchmarkDotNet.Attributes;
22
using BenchmarkDotNet.Configs;
33
using Microsoft.DeepDev;
4+
using Microsoft.ML.Tokenizers;
45
using SharpToken;
56
using Tiktoken.Encodings;
67
using TiktokenSharp;
@@ -18,6 +19,7 @@ public class Benchmarks
1819
private readonly GptEncoding _sharpToken = GptEncoding.GetEncoding("cl100k_base");
1920
private readonly TikToken _tiktokenSharp = TikToken.GetEncoding("cl100k_base");
2021
private readonly Encoder _tiktoken = new(new Cl100KBase());
22+
private readonly Tokenizer _microsoftMlTiktoken = TiktokenTokenizer.CreateForModel("gpt-4");
2123
private ITokenizer? _tokenizerLib;
2224

2325
[Params(Strings.HelloWorld, Strings.KingLear, Strings.Bitcoin)]
@@ -31,11 +33,15 @@ public async Task GlobalSetup()
3133

3234
[Benchmark(Baseline = true)]
3335
[BenchmarkCategory("Encode")]
34-
public List<int> SharpTokenV2_0_1_Encode() => _sharpToken.Encode(Data);
36+
public List<int> SharpTokenV2_0_3_Encode() => _sharpToken.Encode(Data);
3537

3638
[Benchmark]
3739
[BenchmarkCategory("Encode")]
38-
public List<int> TiktokenSharpV1_0_9_Encode() => _tiktokenSharp.Encode(Data);
40+
public List<int> TiktokenSharpV1_1_5_Encode() => _tiktokenSharp.Encode(Data);
41+
42+
[Benchmark]
43+
[BenchmarkCategory("Encode")]
44+
public IReadOnlyCollection<int> MicrosoftMLTokenizerV1_0_0_Encode() => _microsoftMlTiktoken.EncodeToIds(Data);
3945

4046
[Benchmark]
4147
[BenchmarkCategory("Encode")]
@@ -48,11 +54,15 @@ public async Task GlobalSetup()
4854

4955
[Benchmark(Baseline = true)]
5056
[BenchmarkCategory("CountTokens")]
51-
public int SharpTokenV2_0_1_() => _sharpToken.Encode(Data).Count;
57+
public int SharpTokenV2_0_3_() => _sharpToken.Encode(Data).Count;
58+
59+
[Benchmark]
60+
[BenchmarkCategory("CountTokens")]
61+
public int TiktokenSharpV1_1_5_() => _tiktokenSharp.Encode(Data).Count;
5262

5363
[Benchmark]
5464
[BenchmarkCategory("CountTokens")]
55-
public int TiktokenSharpV1_0_9_() => _tiktokenSharp.Encode(Data).Count;
65+
public int MicrosoftMLTokenizerV1_0_0_() => _microsoftMlTiktoken.CountTokens(Data);
5666

5767
[Benchmark]
5868
[BenchmarkCategory("CountTokens")]

0 commit comments

Comments
 (0)