Commit 6f55525

Introducing Tiktoken Tokenizer (#6981)
* Introducing Tiktoken Tokenizer
* Address the feedback
* file renaming
1 parent 902102e commit 6f55525

29 files changed: +2,247 −66 lines

src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj

Lines changed: 9 additions & 1 deletion
@@ -2,11 +2,19 @@
   <Import Project="$(RepoRoot)eng/pkg/Pack.props" />

   <PropertyGroup>
-    <TargetFramework>netstandard2.0</TargetFramework>
+    <TargetFrameworks>netstandard2.0;net8.0</TargetFrameworks>
     <Nullable>enable</Nullable>
     <PackageDescription>Microsoft.ML.Tokenizers contains the implmentation of the tokenization used in the NLP transforms.</PackageDescription>
   </PropertyGroup>

+  <ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
+    <Compile Remove="Utils/Helpers.netcoreapp.cs" />
+  </ItemGroup>
+
+  <ItemGroup Condition="'$(TargetFramework)' != 'netstandard2.0'">
+    <Compile Remove="Utils/Helpers.netstandard.cs" />
+  </ItemGroup>
+
   <ItemGroup>
     <PackageReference Include="System.Text.Json" Version="$(SystemTextJsonVersion)" />
   </ItemGroup>
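Note on this change: the project now multi-targets netstandard2.0 and net8.0, and the two conditional ItemGroups pick exactly one of the two helper source files per target framework, so newer framework-specific APIs can live in Utils/Helpers.netcoreapp.cs while Utils/Helpers.netstandard.cs supplies fallbacks. The contents of those files are not part of this hunk; the sketch below shows the usual partial-class pattern with a hypothetical EncodeToUtf8 member invented for illustration, written as two separate files of which only one is ever compiled for a given target framework.

using System;
using System.Text;

namespace Microsoft.ML.Tokenizers
{
    // Utils/Helpers.netstandard.cs (hypothetical content) -- compiled only when
    // TargetFramework == netstandard2.0, per the first conditional ItemGroup.
    internal static partial class Helpers
    {
        internal static int EncodeToUtf8(string text, byte[] destination)
        {
            // Allocating fallback: the span-based Encoding overloads are not available here.
            byte[] bytes = Encoding.UTF8.GetBytes(text);
            Array.Copy(bytes, destination, bytes.Length);
            return bytes.Length;
        }
    }
}

namespace Microsoft.ML.Tokenizers
{
    // Utils/Helpers.netcoreapp.cs (hypothetical content) -- compiled for net8.0,
    // where the second conditional ItemGroup removes the netstandard file instead.
    internal static partial class Helpers
    {
        internal static int EncodeToUtf8(string text, byte[] destination)
            // Span-based overload avoids the intermediate byte[] allocation.
            => Encoding.UTF8.GetBytes(text.AsSpan(), destination);
    }
}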

src/Microsoft.ML.Tokenizers/Model/BPE.cs

Lines changed: 5 additions & 5 deletions
@@ -36,7 +36,7 @@ public string? UnknownToken

                if (value is null)
                {
-                    if (VocabReverse.TryGetValue(0, out string v))
+                    if (VocabReverse.TryGetValue(0, out string? v))
                    {
                        VocabReverse.Remove(0);
                        if (Vocab.TryGetValue(v, out int id))
@@ -103,7 +103,7 @@ public Bpe(string vocabFile, string? mergesFile, string? unknownToken = null, st
                VocabReverse.Add(kvp.Value, kvp.Key);
            }

-            if (unknownToken is null && VocabReverse.TryGetValue(0, out string unkToken))
+            if (unknownToken is null && VocabReverse.TryGetValue(0, out string? unkToken))
            {
                unknownToken = unkToken;
            }
@@ -187,7 +187,7 @@ public override IReadOnlyList<Token> Tokenize(string sequence)
        /// <returns>The mapped token of the Id.</returns>
        public override string? IdToToken(int id, bool skipSpecialTokens = false)
        {
-            if (VocabReverse.TryGetValue(id, out string value))
+            if (VocabReverse.TryGetValue(id, out string? value))
            {
                return value;
            }
@@ -253,7 +253,7 @@ public override string[] Save(string path, string? prefix = null)
        }

        /// Read the given files to extract the vocab and merges
-        internal static (Dictionary<string, int>?, Vec<(string, string)>) ReadFile(string? vocab, string? merges)
+        internal static (Dictionary<string, int>?, Vec<(string, string)>) ReadFile(string vocab, string? merges)
        {
            Dictionary<string, int>? dic;
            using (Stream stream = File.OpenRead(vocab))
@@ -320,7 +320,7 @@ internal static (Dictionary<string, int>?, Vec<(string, string)>) ReadFile(strin
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal string CharToString(char c)
        {
-            if (_charToString.TryGetValue(c, out string v))
+            if (_charToString.TryGetValue(c, out string? v))
            {
                return v;
            }
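Note on this change: every hunk above is the same nullable-annotation fix. With <Nullable>enable</Nullable>, Dictionary<TKey, TValue>.TryGetValue marks its out parameter [MaybeNullWhen(false)], so declaring the local as a non-nullable string warns (CS8600); declaring it as string? and only using it on the true branch is warning-free. A standalone illustration, not code from the repository:

#nullable enable
using System.Collections.Generic;

internal static class NullableTryGetValueExample
{
    private static readonly Dictionary<int, string> VocabReverse = new() { [0] = "<unk>" };

    internal static string IdToToken(int id)
    {
        // "out string value" would warn: the out argument may be null when
        // TryGetValue returns false. "out string? value" matches the annotation.
        if (VocabReverse.TryGetValue(id, out string? value))
        {
            return value; // non-null here: the compiler narrows it on the true branch
        }

        return string.Empty;
    }
}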

src/Microsoft.ML.Tokenizers/Model/BpeTrainer.cs

Lines changed: 9 additions & 4 deletions
@@ -83,7 +83,12 @@ public BpeTrainer(
            MinFrequency = minFrequency;
            VocabSize = vocabSize;
            Progress = progress;
-            SpecialTokens = new List<AddedToken>(specialTokens);
+
+            if (specialTokens is not null)
+            {
+                SpecialTokens = new List<AddedToken>(specialTokens);
+            }
+
            LimitAlphabet = limitAlphabet;
            InitialAlphabet = initialAlphabet;
            ContinuingSubwordPrefix = continuingSubwordPrefix;
@@ -172,7 +177,7 @@ private void ComputeAlphabet(Dictionary<string, int> wc, Dictionary<string, int>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal string CharToString(char c)
        {
-            if (_charToString.TryGetValue(c, out string v))
+            if (_charToString.TryGetValue(c, out string? v))
            {
                return v;
            }
@@ -259,7 +264,7 @@ internal string CharToString(char c)
            // Then update counts
            int count = counts[i];

-            if (!whereToUpdate.TryGetValue(curPair, out HashSet<int> h))
+            if (!whereToUpdate.TryGetValue(curPair, out HashSet<int>? h))
            {
                h = new HashSet<int>();
                whereToUpdate[curPair] = h;
@@ -398,7 +403,7 @@ internal string CharToString(char c)

            if (change > 0)
            {
-                if (!whereToUpdate.TryGetValue(p, out HashSet<int> h))
+                if (!whereToUpdate.TryGetValue(p, out HashSet<int>? h))
                {
                    h = new();
                    whereToUpdate[p] = h;

src/Microsoft.ML.Tokenizers/Model/Cache.cs

Lines changed: 4 additions & 4 deletions
@@ -9,7 +9,7 @@

 namespace Microsoft.ML.Tokenizers
 {
-    internal sealed class Cache<TKey, TValue>
+    internal sealed class Cache<TKey, TValue> where TKey : notnull
    {
        internal Cache() : this(Bpe.DefaultCacheCapacity) { }

@@ -39,13 +39,13 @@ internal void Clear()

        internal List<TValue> GetValues(IEnumerable<TKey> keys)
        {
-            List<TValue>? values = new();
+            List<TValue> values = new();
            _cacheLock.EnterReadLock();
            try
            {
                foreach (TKey key in keys)
                {
-                    if (Map.TryGetValue(key, out TValue value))
+                    if (Map.TryGetValue(key, out TValue? value))
                    {
                        values.Add(value);
                    }
@@ -61,7 +61,7 @@ internal List<TValue> GetValues(IEnumerable<TKey> keys)
            _cacheLock.EnterReadLock();
            try
            {
-                if (Map.TryGetValue(key, out TValue value))
+                if (Map.TryGetValue(key, out TValue? value))
                {
                    return value;
                }
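Note on this change: the new where TKey : notnull constraint matches the constraint that Dictionary<TKey, TValue> itself carries, so a dictionary keyed by the cache's TKey no longer triggers warning CS8714 under nullable reference types. A standalone sketch of the pattern; the _map field shown here is assumed for illustration, not the repository's actual declaration:

#nullable enable
using System.Collections.Generic;

// Without "where TKey : notnull", the Dictionary field below warns (CS8714)
// because Dictionary<TKey, TValue> requires a non-nullable key type.
internal sealed class MiniCache<TKey, TValue> where TKey : notnull
{
    private readonly Dictionary<TKey, TValue> _map = new();

    internal void Set(TKey key, TValue value) => _map[key] = value;

    internal TValue? GetOrDefault(TKey key)
        => _map.TryGetValue(key, out TValue? value) ? value : default;
}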

src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs

Lines changed: 6 additions & 2 deletions
@@ -429,7 +429,7 @@ private Dictionary<string, int> GetVocabulary(Stream vocabularyStream)
                using StreamReader reader = new StreamReader(mergeStream);
                while (reader.Peek() >= 0)
                {
-                    splitContents.Add(reader.ReadLine());
+                    splitContents.Add(reader.ReadLine()!);
                }
            }
            catch (Exception e)
@@ -761,7 +761,11 @@ public void AddFromStream(Stream stream)

            while (reader.Peek() >= 0)
            {
-                string line = reader.ReadLine();
+                string? line = reader.ReadLine();
+                if (line is null)
+                {
+                    continue;
+                }

                var splitLine = line.Trim().Split(' ');
                if (splitLine.Length != 2)
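Note on this change: StreamReader.ReadLine() returns string?, and the two call sites resolve the resulting nullable warning differently. The first uses the null-forgiving operator (!) because Peek() >= 0 guarantees another line; the second adds an explicit null check and skips. A standalone illustration of the two styles, not the repository's code:

#nullable enable
using System.Collections.Generic;
using System.IO;

internal static class ReadLineNullHandling
{
    // Style 1: assert non-null with '!'; Peek() >= 0 implies a line is available.
    internal static List<string> ReadAllAsserted(StreamReader reader)
    {
        var lines = new List<string>();
        while (reader.Peek() >= 0)
        {
            lines.Add(reader.ReadLine()!);
        }
        return lines;
    }

    // Style 2: check explicitly and skip the (theoretically unreachable) null case.
    internal static List<string> ReadAllChecked(StreamReader reader)
    {
        var lines = new List<string>();
        while (reader.Peek() >= 0)
        {
            string? line = reader.ReadLine();
            if (line is null)
            {
                continue;
            }
            lines.Add(line);
        }
        return lines;
    }
}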

src/Microsoft.ML.Tokenizers/Model/Model.cs

Lines changed: 38 additions & 0 deletions
@@ -20,13 +20,51 @@ public abstract class Model
        /// <returns>The list of tokens generated from the sequence tokenization.</returns>
        public abstract IReadOnlyList<Token> Tokenize(string sequence);

+        /// <summary>
+        /// Tokenize a split sequence string to a list of tokens.
+        /// </summary>
+        /// <param name="sequence">The text to tokenize.</param>
+        /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
+        /// <returns>The list of tokens generated from the sequence tokenization.</returns>
+        public virtual IReadOnlyList<Token> Tokenize(string sequence, bool isSpecialToken) => Tokenize(sequence);
+
+        /// <summary>
+        /// Tokenize a split sequence string to a list of Ids and add them to the accumulatedIds list.
+        /// </summary>
+        /// <param name="sequence">The sequence to split.</param>
+        /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
+        /// <param name="accumulatedIds">The list of accumulated tokenized Ids.</param>
+        /// <returns>True if the operation succeeded, false otherwise.</returns>
+        public virtual bool TokenizeToIds(string sequence, bool isSpecialToken, IList<int> accumulatedIds)
+        {
+            if (accumulatedIds is null)
+            {
+                throw new ArgumentNullException(nameof(accumulatedIds));
+            }
+
+            var tokens = Tokenize(sequence);
+            foreach (var token in tokens)
+            {
+                accumulatedIds.Add(token.Id);
+            }
+            return true;
+        }
+
        /// <summary>
        /// Map the token to tokenized Id.
        /// </summary>
        /// <param name="token">The token to map to the Id.</param>
        /// <returns>The mapped Id of the token.</returns>
        public abstract int? TokenToId(string token);

+        /// <summary>
+        /// Map the token to tokenized id with the option to skip the special tokens.
+        /// </summary>
+        /// <param name="token">The token to map to Id</param>
+        /// <param name="skipSpecialTokens">Indicate if want to skip the special tokens during the encoding.</param>
+        /// <returns>The mapped Id of the token.</returns>
+        public virtual int? TokenToId(string token, bool skipSpecialTokens) => TokenToId(token);
+
        /// <summary>
        /// Map the tokenized Id to the token.
        /// </summary>
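Note on this change: the new virtual members let a Model implementation provide special-token-aware overloads and a TokenizeToIds fast path that appends ids straight into a caller-supplied list instead of materializing Token objects, while the base implementations simply delegate to the existing abstract methods. Below is a hedged sketch of that pattern using stand-in types, since a real subclass would also have to implement the rest of Model's abstract surface, which is not shown in this hunk:

using System;
using System.Collections.Generic;

// Stand-ins for illustration only; not the library's Token/Model types.
public sealed record ToyToken(int Id, string Value);

public abstract class ToyModel
{
    public abstract IReadOnlyList<ToyToken> Tokenize(string sequence);

    // Mirrors the new virtual: by default, tokenize and copy the ids over.
    public virtual bool TokenizeToIds(string sequence, bool isSpecialToken, IList<int> accumulatedIds)
    {
        if (accumulatedIds is null)
        {
            throw new ArgumentNullException(nameof(accumulatedIds));
        }

        foreach (ToyToken token in Tokenize(sequence))
        {
            accumulatedIds.Add(token.Id);
        }
        return true;
    }
}

public sealed class WhitespaceToyModel : ToyModel
{
    public override IReadOnlyList<ToyToken> Tokenize(string sequence)
    {
        var tokens = new List<ToyToken>();
        foreach (string piece in sequence.Split(' ', StringSplitOptions.RemoveEmptyEntries))
        {
            tokens.Add(new ToyToken(piece.Length, piece)); // toy id: the piece length
        }
        return tokens;
    }

    // Fast path enabled by the new virtual: append ids without allocating tokens.
    public override bool TokenizeToIds(string sequence, bool isSpecialToken, IList<int> accumulatedIds)
    {
        foreach (string piece in sequence.Split(' ', StringSplitOptions.RemoveEmptyEntries))
        {
            accumulatedIds.Add(piece.Length);
        }
        return true;
    }
}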
