@@ -156,21 +156,37 @@ private void CacheSpecialTokensEncoding(IReadOnlyDictionary<string, int>? specia
156156        internal  static async  ValueTask < ( Dictionary < ReadOnlyMemory < byte > ,  int > ,  Dictionary < StringSpanOrdinalKey ,  ( int  Id ,  string  Token ) > ,  Dictionary < int ,  ReadOnlyMemory < byte > > ) >  LoadTiktokenBpeAsync ( 
157157            Stream  vocabStream ,  bool  useAsync ,  CancellationToken  cancellationToken  =  default ) 
158158        { 
159-             var   encoder   =   new   Dictionary < ReadOnlyMemory < byte > ,  int > ( ReadOnlyMemoryByteComparer . Instance ) ; 
160-             var   vocab   =   new   Dictionary < StringSpanOrdinalKey ,  ( int  Id ,  string  Token ) > ( ) ; 
161-             var   decoder   =   new   Dictionary < int ,  ReadOnlyMemory < byte > > ( ) ; 
159+             Dictionary < ReadOnlyMemory < byte > ,  int >   encoder ; 
160+             Dictionary < StringSpanOrdinalKey ,  ( int  Id ,  string  Token ) >   vocab ; 
161+             Dictionary < int ,  ReadOnlyMemory < byte > >   decoder ; 
162162
163163            try 
164164            { 
165165                // Don't dispose the reader as it will dispose the underlying stream vocabStream. The caller is responsible for disposing the stream. 
166166                StreamReader  reader  =  new  StreamReader ( vocabStream ) ; 
167-                 string ?  line ; 
168-                 do 
167+                 string ?  line  =  useAsync  ?  await  Helpers . ReadLineAsync ( reader ,  cancellationToken ) . ConfigureAwait ( false )  :  reader . ReadLine ( ) ; 
168+ 
169+                 const  string  capacity  =  "Capacity: " ; 
170+                 int  suggestedCapacity  =  0 ;  // default capacity 
171+                 if  ( line  is  not null  &&  line . StartsWith ( capacity ,  StringComparison . Ordinal ) ) 
169172                { 
170-                     line  =  useAsync  ? 
171-                         await  Helpers . ReadLineAsync ( reader ,  cancellationToken ) . ConfigureAwait ( false )  : 
172-                         reader . ReadLine ( ) ; 
173-                 }  while  ( line  is  not null  &&  line . Length  ==  0 ) ; 
173+                     if  ( ! Helpers . TryParseInt32 ( line ,  capacity . Length ,  out  suggestedCapacity ) ) 
174+                     { 
175+                         throw  new  FormatException ( $ "Invalid format in the BPE vocab file stream") ; 
176+                     } 
177+ 
178+                     line  =  useAsync  ?  await  Helpers . ReadLineAsync ( reader ,  cancellationToken ) . ConfigureAwait ( false )  :  reader . ReadLine ( ) ; 
179+                 } 
180+ 
181+                 encoder  =  new  Dictionary < ReadOnlyMemory < byte > ,  int > ( suggestedCapacity ,  ReadOnlyMemoryByteComparer . Instance ) ; 
182+                 vocab  =  new  Dictionary < StringSpanOrdinalKey ,  ( int  Id ,  string  Token ) > ( suggestedCapacity ) ; 
183+                 decoder  =  new  Dictionary < int ,  ReadOnlyMemory < byte > > ( suggestedCapacity ) ; 
184+ 
185+                 // skip empty lines 
186+                 while  ( line  is  not null  &&  line . Length  ==  0 ) 
187+                 { 
188+                     line  =  useAsync  ?  await  Helpers . ReadLineAsync ( reader ,  cancellationToken ) . ConfigureAwait ( false )  :  reader . ReadLine ( ) ; 
189+                 } 
174190
175191                if  ( line  is  not null  &&  line . IndexOf ( ' ' )  <  0 ) 
176192                { 
0 commit comments