11using System . Collections . Concurrent ;
2+ using System . Runtime . CompilerServices ;
23using System . Text ;
34using System . Text . RegularExpressions ;
45using Tiktoken . Core ;
@@ -86,12 +87,12 @@ public int CountTokensNative(string text)
8687 var tokens = 0 ;
8788#if NET7_0_OR_GREATER
8889 var textSpan = text . AsSpan ( ) ;
90+ Span < byte > pieceBytes = stackalloc byte [ 128 ] ;
8991#endif
9092
9193#if NET7_0_OR_GREATER
9294 foreach ( var match in Regex . EnumerateMatches ( textSpan ) )
9395 {
94- var matchValue = textSpan . Slice ( match . Index , match . Length ) . ToArray ( ) ;
9596 var fastKey = new string ( textSpan . Slice ( match . Index , match . Length ) ) ;
9697#else
9798 foreach ( Match match in Regex . Matches ( text ) )
@@ -110,7 +111,11 @@ public int CountTokensNative(string text)
110111 continue ;
111112 }
112113
114+ #if NET7_0_OR_GREATER
115+ var piece = GetUtf8Bytes ( textSpan . Slice ( match . Index , match . Length ) , pieceBytes ) ;
116+ #else
113117 var piece = System . Text . Encoding . UTF8 . GetBytes ( matchValue ) ;
118+ #endif
114119 if ( Encoder . ContainsKey ( piece ) )
115120 {
116121 tokens ++ ;
@@ -148,6 +153,7 @@ public IReadOnlyCollection<int> EncodeNative(
148153 var tokens = new List < int > ( ) ;
149154#if NET7_0_OR_GREATER
150155 var textSpan = text . AsSpan ( ) ;
156+ Span < byte > pieceBytes = stackalloc byte [ 128 ] ;
151157#endif
152158
153159 var specialTokens = new List < ( int Index , int Length ) > ( capacity : 32 ) ;
@@ -181,7 +187,6 @@ public IReadOnlyCollection<int> EncodeNative(
181187#if NET7_0_OR_GREATER
182188 foreach ( var match in Regex . EnumerateMatches ( textSpan [ start ..specialStart ] ) )
183189 {
184- var matchValue = textSpan . Slice ( match . Index , match . Length ) . ToArray ( ) ;
185190 var fastKey = new string ( textSpan . Slice ( match . Index , match . Length ) ) ;
186191#else
187192 foreach ( Match match in Regex . Matches ( text [ start ..specialStart ] ) )
@@ -199,8 +204,12 @@ public IReadOnlyCollection<int> EncodeNative(
199204 tokens . AddRange ( fastTokens ) ;
200205 continue ;
201206 }
202-
207+
208+ #if NET7_0_OR_GREATER
209+ var piece = GetUtf8Bytes ( textSpan . Slice ( match . Index , match . Length ) , pieceBytes ) ;
210+ #else
203211 var piece = System . Text . Encoding . UTF8 . GetBytes ( matchValue ) ;
212+ #endif
204213 if ( Encoder . TryGetValue ( piece , out var token ) )
205214 {
206215 tokens . Add ( token ) ;
@@ -544,4 +553,20 @@ public byte[] DecodeNative(IReadOnlyCollection<int> tokens)
544553 }
545554 return ret . ToArray ( ) ;
546555 }
556+
557+ #if NETSTANDARD2_1_OR_GREATER || NET5_0_OR_GREATER
/// <summary>
/// Encodes <paramref name="text"/> as UTF-8 and returns the result in a newly allocated array.
/// </summary>
/// <param name="text">The UTF-16 characters to encode.</param>
/// <param name="scratch">
/// Caller-supplied temporary buffer. It is used as the encoding target when the worst-case
/// output is guaranteed to fit; its contents are overwritten in that case.
/// </param>
/// <returns>A new array containing exactly the UTF-8 encoding of <paramref name="text"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static byte[] GetUtf8Bytes(ReadOnlySpan<char> text, Span<byte> scratch)
{
    // Each UTF-16 code unit encodes to at most 3 UTF-8 bytes. The capacity check divides the
    // buffer length instead of multiplying text.Length so it cannot overflow int for very
    // long inputs, and uses <= so an exact worst-case fit still takes the fast path.
    if (text.Length <= scratch.Length / 3)
    {
        var written = System.Text.Encoding.UTF8.GetBytes(text, scratch);
        return scratch.Slice(0, written).ToArray();
    }

    // Too large for the scratch buffer: size the result exactly and encode straight into it,
    // avoiding an intermediate char[] copy of the input.
    var result = new byte[System.Text.Encoding.UTF8.GetByteCount(text)];
    System.Text.Encoding.UTF8.GetBytes(text, result);
    return result;
}
571+ #endif
547572}
0 commit comments