Skip to content

Commit 9be4112

Browse files
committed
code edits
1 parent a5e763a commit 9be4112

File tree

4 files changed

+27
-26
lines changed

4 files changed

+27
-26
lines changed

docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,29 +12,28 @@ public static void Run()
1212
private static void BasicUsage()
1313
{
1414
// <BpeBasic>
15-
// Create a BPE tokenizer using Tiktoken
16-
// BPE (Byte Pair Encoding) is the underlying algorithm used by many tokenizers
15+
// Create a BPE tokenizer using Tiktoken.
1716
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
1817

1918
string text = "Hello, how are you doing today?";
2019

21-
// Encode text to token IDs
20+
// Encode text to token IDs.
2221
IReadOnlyList<int> ids = tokenizer.EncodeToIds(text);
2322
Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
2423

25-
// Count tokens
24+
// Count tokens.
2625
int tokenCount = tokenizer.CountTokens(text);
2726
Console.WriteLine($"Token count: {tokenCount}");
2827

29-
// Get detailed token information
28+
// Get detailed token information.
3029
IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens(text, out string? normalizedString);
3130
Console.WriteLine("Tokens:");
32-
foreach (var token in tokens)
31+
foreach (EncodedToken token in tokens)
3332
{
3433
Console.WriteLine($" ID: {token.Id}, Value: '{token.Value}'");
3534
}
3635

37-
// Decode tokens back to text
36+
// Decode tokens back to text.
3837
string? decoded = tokenizer.Decode(ids);
3938
Console.WriteLine($"Decoded: {decoded}");
4039
// </BpeBasic>

docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,26 +16,26 @@ public static async Task RunAsync()
1616
private static async Task BasicUsageAsync()
1717
{
1818
// <LlamaBasic>
19-
// Open a stream to the remote Llama tokenizer model data file
19+
// Open a stream to the remote Llama tokenizer model data file.
2020
using HttpClient httpClient = new();
2121
const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model";
2222
using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl);
2323

24-
// Create the Llama tokenizer using the remote stream
24+
// Create the Llama tokenizer using the remote stream.
2525
Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream);
2626

2727
string input = "Hello, world!";
2828

29-
// Encode text to token IDs
29+
// Encode text to token IDs.
3030
IReadOnlyList<int> ids = llamaTokenizer.EncodeToIds(input);
3131
Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
3232
// Output: Token IDs: 1, 15043, 29892, 3186, 29991
3333

34-
// Count the tokens
34+
// Count the tokens.
3535
Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}");
3636
// Output: Tokens: 5
3737

38-
// Decode token IDs back to text
38+
// Decode token IDs back to text.
3939
string? decoded = llamaTokenizer.Decode(ids);
4040
Console.WriteLine($"Decoded: {decoded}");
4141
// Output: Decoded: Hello, world!
@@ -44,20 +44,20 @@ private static async Task BasicUsageAsync()
4444

4545
private static void AdvancedOptions()
4646
{
47-
// For demonstration purposes, we'll use a mock tokenizer
48-
// In real scenarios, you would initialize this properly
47+
// For demonstration purposes, we'll use a mock tokenizer.
48+
// In real scenarios, you would initialize this properly.
4949
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
50-
50+
5151
// <LlamaAdvanced>
5252
ReadOnlySpan<char> textSpan = "Hello World".AsSpan();
5353

54-
// Bypass normalization during encoding
54+
// Bypass normalization during encoding.
5555
IReadOnlyList<int> ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false);
5656

57-
// Bypass pretokenization during encoding
57+
// Bypass pretokenization during encoding.
5858
ids = tokenizer.EncodeToIds(textSpan, considerPreTokenization: false);
5959

60-
// Bypass both normalization and pretokenization
60+
// Bypass both normalization and pretokenization.
6161
ids = tokenizer.EncodeToIds(textSpan, considerNormalization: false, considerPreTokenization: false);
6262
// </LlamaAdvanced>
6363
}

docs/ai/how-to/snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,21 +13,21 @@ public static void Run()
1313
private static void BasicUsage()
1414
{
1515
// <TiktokenBasic>
16-
// Initialize the tokenizer for the gpt-4o model
16+
// Initialize the tokenizer for the gpt-4o model.
1717
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
1818

1919
string source = "Text tokenization is the process of splitting a string into a list of tokens.";
2020

21-
// Count the tokens in the text
21+
// Count the tokens in the text.
2222
Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}");
2323
// Output: Tokens: 16
2424

25-
// Encode text to token IDs
25+
// Encode text to token IDs.
2626
IReadOnlyList<int> ids = tokenizer.EncodeToIds(source);
2727
Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
2828
// Output: Token IDs: 1199, 4037, 2065, 374, 279, 1920, 315, 45473, 264, 925, 1139, 264, 1160, 315, 11460, 13
2929

30-
// Decode token IDs back to text
30+
// Decode token IDs back to text.
3131
string? decoded = tokenizer.Decode(ids);
3232
Console.WriteLine($"Decoded: {decoded}");
3333
// Output: Decoded: Text tokenization is the process of splitting a string into a list of tokens.
@@ -41,15 +41,15 @@ private static void TrimText()
4141

4242
string source = "Text tokenization is the process of splitting a string into a list of tokens.";
4343

44-
// Get the last 5 tokens from the text
44+
// Get the last 5 tokens from the text.
4545
var trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string? processedText, out _);
4646
if (processedText is not null)
4747
{
4848
Console.WriteLine($"Last 5 tokens: {processedText.Substring(trimIndex)}");
4949
// Output: Last 5 tokens: a list of tokens.
5050
}
5151

52-
// Get the first 5 tokens from the text
52+
// Get the first 5 tokens from the text.
5353
trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _);
5454
if (processedText is not null)
5555
{

docs/ai/how-to/use-tokenizers.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,14 @@ The tokenizer supports advanced encoding options, such as controlling normalizat
6262

6363
## Use BPE tokenizer
6464

65-
Byte-pair encoding (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. The following example demonstrates BPE tokenization:
65+
*Byte-pair encoding* (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. BPE was initially developed as an algorithm to compress texts, and then used by OpenAI for tokenization when it pretrained the GPT model. The following example demonstrates BPE tokenization:
6666

6767
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs" id="BpeBasic":::
6868

6969
The library also provides specialized tokenizers like <xref:Microsoft.ML.Tokenizers.BpeTokenizer> and <xref:Microsoft.ML.Tokenizers.EnglishRobertaTokenizer> that you can configure with custom vocabularies for specific models.
7070

71+
For more information about BPE, see [Byte-pair encoding tokenization](https://huggingface.co/learn/llm-course/chapter6/5).
72+
7173
## Common tokenizer operations
7274

7375
All tokenizers in the library implement the <xref:Microsoft.ML.Tokenizers.Tokenizer> base class. The following table shows the available methods.
@@ -81,7 +83,7 @@ All tokenizers in the library implement the <xref:Microsoft.ML.Tokenizers.Tokeni
8183
| <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCount*> | Finds the character index for a specific token count from the start |
8284
| <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCountFromEnd*> | Finds the character index for a specific token count from the end |
8385

84-
## Migration from other libraries
86+
## Migrate from other libraries
8587

8688
If you're currently using `DeepDev.TokenizerLib` or `SharpToken`, consider migrating to Microsoft.ML.Tokenizers. The library has been enhanced to cover scenarios from those libraries and provides better performance and support. For migration guidance, see the [migration guide](https://github.com/dotnet/machinelearning/blob/main/docs/code/microsoft-ml-tokenizers-migration-guide.md).
8789

0 commit comments

Comments (0)