## About

Microsoft.ML.Tokenizers provides implementations of the tokenization algorithms used with NLP transformer models.

## Key Features

* Extensible tokenizer architecture that allows specializing the Normalizer, PreTokenizer, Model/Encoder, and Decoder components
* BPE (byte pair encoding) model
* English RoBERTa model
* Tiktoken model

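Beyond the Tiktoken helper shown below, a tokenizer can be composed from the other bundled models. A minimal sketch, assuming locally available BPE vocabulary and merges files (the file names here are hypothetical, and the `Bpe` constructor signature may vary across preview versions of the package):

```csharp
using Microsoft.ML.Tokenizers;

// Hypothetical local data files for a trained BPE model;
// the package does not ship them.
Tokenizer bpeTokenizer = new Tokenizer(new Bpe("vocab.json", "merges.txt"));

IReadOnlyList<int> ids = bpeTokenizer.EncodeToIds("Hello world");
```
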
## How to Use

```csharp
using Microsoft.ML.Tokenizers;

// Initialize the tokenizer for the `gpt-4` model, downloading the data files.
Tokenizer tokenizer = await Tiktoken.CreateByModelNameAsync("gpt-4");

string source = "Text tokenization is the process of splitting a string into a list of tokens.";

Console.WriteLine($"Tokens: {tokenizer.CountTokens(source)}");
// prints: Tokens: 16

var trimIndex = tokenizer.LastIndexOfTokenCount(source, 5, out string processedText, out _);
Console.WriteLine($"5 tokens from end: {processedText.Substring(trimIndex)}");
// prints: 5 tokens from end: a list of tokens.

trimIndex = tokenizer.IndexOfTokenCount(source, 5, out processedText, out _);
Console.WriteLine($"5 tokens from start: {processedText.Substring(0, trimIndex)}");
// prints: 5 tokens from start: Text tokenization is the

IReadOnlyList<int> ids = tokenizer.EncodeToIds(source);
Console.WriteLine(string.Join(", ", ids));
// prints: 1199, 4037, 2065, 374, 279, 1920, 315, 45473, 264, 925, 1139, 264, 1160, 315, 11460, 13
```
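
Continuing the example above, the encoded ids can be turned back into text. A short sketch, assuming the `Tokenizer.Decode` overload that accepts a sequence of token ids:

```csharp
// Round-trip the ids produced by EncodeToIds back to a string.
string? roundTripped = tokenizer.Decode(ids);
Console.WriteLine(roundTripped);
```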

## Main Types

The main types provided by this library are:

* `Microsoft.ML.Tokenizers.Tokenizer`
* `Microsoft.ML.Tokenizers.Bpe`
* `Microsoft.ML.Tokenizers.EnglishRoberta`
* `Microsoft.ML.Tokenizers.Tiktoken`
* `Microsoft.ML.Tokenizers.TokenizerDecoder`
* `Microsoft.ML.Tokenizers.Normalizer`
* `Microsoft.ML.Tokenizers.PreTokenizer`

## Additional Documentation

* [Conceptual documentation](TODO)
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)

## Related Packages

<!-- The related packages associated with this package -->

## Feedback & Contributing

Microsoft.ML.Tokenizers is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).