Skip to content

Commit 1679b57

Browse files
bgrainger authored and HavenDV committed
Reduce allocations in EncodingLoader.
- Avoid garbage strings from splitting on space.
- Avoid allocations from using LINQ in a hot loop.

Signed-off-by: Bradley Grainger <bgrainger@gmail.com>
1 parent f7e7404 commit 1679b57

File tree

2 files changed

+33
-3
lines changed

2 files changed

+33
-3
lines changed

src/libs/Tiktoken.Core/CoreBPE.cs

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,19 @@ public CoreBpe(
4545
Encoder = encoder;
4646
FastEncoder = Encoder
4747
.ToDictionary(
48-
static x => new string(x.Key.Select(y => (char) y).ToArray()),
48+
#if NETSTANDARD2_1_OR_GREATER || NET6_0_OR_GREATER
49+
static x =>
50+
{
51+
Span<char> chars = stackalloc char[x.Key.Length];
52+
for (var i = 0; i < x.Key.Length; i++)
53+
{
54+
chars[i] = (char)x.Key[i];
55+
}
56+
return new string(chars);
57+
},
58+
#else
59+
static x => new string(x.Key.Select(static y => (char) y).ToArray()),
60+
#endif
4961
static x => x.Value);
5062
SpecialTokensEncoder = specialTokensEncoder;
5163

src/libs/Tiktoken.Encodings.Abstractions/EncodingLoader.cs

Lines changed: 20 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ public static Dictionary<byte[], int> LoadEncodingFromManifestResource(
3030
assembly.GetManifestResourceStream(resourcePath) ??
3131
throw new InvalidOperationException("Resource not found.");
3232
using var reader = new StreamReader(stream);
33-
33+
3434
var lines = new List<string>();
3535
while (reader.ReadLine() is { } line)
3636
{
@@ -53,7 +53,11 @@ public static Dictionary<byte[], int> LoadEncodingFromLines(
5353
string name)
5454
{
5555
lines = lines ?? throw new ArgumentNullException(nameof(lines));
56-
56+
57+
#if NET7_0_OR_GREATER
58+
Span<Range> tokens = stackalloc Range[3];
59+
Span<byte> bytes = stackalloc byte[256];
60+
#endif
5761
var dictionary = new Dictionary<byte[], int>(new ByteArrayComparer());
5862
foreach (var line in lines)
5963
{
@@ -62,14 +66,28 @@ public static Dictionary<byte[], int> LoadEncodingFromLines(
6266
continue;
6367
}
6468

69+
#if NET7_0_OR_GREATER
70+
var splitCount = line.AsSpan().Split(tokens, ' ');
71+
if (splitCount != 2)
72+
{
73+
throw new FormatException($"Invalid file format: {name}");
74+
}
75+
#else
6576
var tokens = line.Split(' ');
6677
if (tokens.Length != 2)
6778
{
6879
throw new FormatException($"Invalid file format: {name}");
6980
}
81+
#endif
7082

83+
#if NET7_0_OR_GREATER
84+
Convert.TryFromBase64Chars(line.AsSpan(tokens[0]), bytes, out var bytesWritten);
85+
var tokenBytes = bytes.Slice(0, bytesWritten).ToArray();
86+
var rank = int.Parse(line.AsSpan(tokens[1]), CultureInfo.InvariantCulture);
87+
#else
7188
var tokenBytes = Convert.FromBase64String(tokens[0]);
7289
var rank = int.Parse(tokens[1], CultureInfo.InvariantCulture);
90+
#endif
7391
dictionary[tokenBytes] = rank;
7492
}
7593

0 commit comments

Comments
 (0)