Some tweaks to the Microsoft.ML.Tokenizers PACKAGE.md (dotnet#7360)

stephentoub · web-flow · commit 86bdd730d4ed · 2025-01-13T11:00:11.000-07:00
diff --git a/src/Microsoft.ML.Tokenizers/PACKAGE.md b/src/Microsoft.ML.Tokenizers/PACKAGE.md
@@ -1,6 +1,6 @@
 ## About
 
-Microsoft.ML.Tokenizers supports various the implementation of the tokenization used in the NLP transforms.
+Microsoft.ML.Tokenizers provides an abstraction for tokenizers as well as implementations of common tokenization algorithms.
 
 ## Key Features
 
@@ -15,28 +15,28 @@ Microsoft.ML.Tokenizers supports various the implementation of the tokenization
 
 ```c#
 using Microsoft.ML.Tokenizers;
-using System.Net.Http;
 using System.IO;
+using System.Net.Http;
 
 //
 // Using Tiktoken Tokenizer
 //
 
-// initialize the tokenizer for `gpt-4` model
-Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
+// Initialize the tokenizer for the `gpt-4o` model. This instance should be cached for all subsequent use.
+Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
 
 string source = "Text tokenization is the process of splitting a string into a list of tokens.";
 
 Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}");
-// print: Tokens: 16
+// prints: Tokens: 16
 
 var trimIndex = tokenizer.GetIndexByTokenCountFromEnd(source, 5, out string processedText, out _);
 Console.WriteLine($"5 tokens from end: {processedText.Substring(trimIndex)}");
-// 5 tokens from end:  a list of tokens.
+// prints: 5 tokens from end:  a list of tokens.
 
 trimIndex = tokenizer.GetIndexByTokenCount(source, 5, out processedText, out _);
 Console.WriteLine($"5 tokens from start: {processedText.Substring(0, trimIndex)}");
-// 5 tokens from start: Text tokenization is the
+// prints: 5 tokens from start: Text tokenization is the
 
 IReadOnlyList<int> ids = tokenizer.EncodeToIds(source);
 Console.WriteLine(string.Join(", ", ids));
@@ -46,20 +46,21 @@ Console.WriteLine(string.Join(", ", ids));
 // Using Llama Tokenizer
 //
 
-// Open stream of remote Llama tokenizer model data file
+// Open a stream to the remote Llama tokenizer model data file.
 using HttpClient httpClient = new();
 const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model";
 using Stream remoteStream = await httpClient.GetStreamAsync(modelUrl);
 
-// Create the Llama tokenizer using the remote stream
+// Create the Llama tokenizer using the remote stream. This tokenizer instance should be cached for all subsequent use.
 Tokenizer llamaTokenizer = LlamaTokenizer.Create(remoteStream);
+
 string input = "Hello, world!";
 ids = llamaTokenizer.EncodeToIds(input);
 Console.WriteLine(string.Join(", ", ids));
 // prints: 1, 15043, 29892, 3186, 29991
 
 Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(input)}");
-// print: Tokens: 5
+// prints: Tokens: 5
 ```
 
 ## Main Types