Skip to content

Commit 68f2cbf

Browse files
diberrygewarrenIEvangelistBillWagner
authored
Scan images - DO NOT MERGE (#363)
* first attempt * Filtering works * Update cleanrepo/CleanRepo/Repo.cs Co-authored-by: Genevieve Warren <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: Genevieve Warren <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: Genevieve Warren <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: Genevieve Warren <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: Genevieve Warren <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: Genevieve Warren <[email protected]> * changes as requested * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: David Pine <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: David Pine <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: David Pine <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: David Pine <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: David Pine <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: David Pine <[email protected]> * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: David Pine <[email protected]> * Fixed warnings - removed comments * Moved OCR functionality to bottom of menu, and all together. * Instructions * Fix output file logic * Update cleanrepo/CleanRepo/Program.cs Co-authored-by: Bill Wagner <[email protected]> --------- Co-authored-by: Genevieve Warren <[email protected]> Co-authored-by: David Pine <[email protected]> Co-authored-by: Bill Wagner <[email protected]>
1 parent a440136 commit 68f2cbf

File tree

5 files changed

+323
-8
lines changed

5 files changed

+323
-8
lines changed

cleanrepo/CleanRepo.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
</ItemGroup>
1717
<ItemGroup>
1818
<PackageReference Include="CommandLineParser" Version="2.9.1" />
19+
<PackageReference Include="Tesseract" Version="5.2.0" />
1920
<PackageReference Include="Microsoft.Build" Version="17.10.4" />
2021
</ItemGroup>
2122
</Project>

cleanrepo/Options.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,19 @@ class Options
5353
[Option("remove-hops", HelpText = "Clean redirection JSON file by replacing targets that are themselves redirected (daisy chains).")]
5454
public bool RemoveRedirectHops { get; set; }
5555

56+
[Option("catalog-images-with-text", Default = false, HelpText = "Map images to the markdown/YAML files that reference them, with all text found in images. Must set --ocr-model-directory path.")]
57+
public bool CatalogImagesWithText { get; set; }
58+
59+
[Option("filter-images-for-text", Default = false, HelpText = "Filter images for text. Must set --ocr-model-directory and --filter-text-json-file paths.")]
60+
public bool FilterImagesForText { get; set; }
61+
62+
[Option("ocr-model-directory", HelpText = "Directory that contains the OCR (Tesseract) models for image scanning.")]
63+
public string? OcrModelDirectory { get; set; }
64+
65+
[Option("filter-text-json-file", HelpText = "JSON file of array of strings to filter OCR results with.")]
66+
public string? FilterTextJsonFile { get; set; }
67+
68+
5669
//[Option("format-redirects", Required = false, HelpText = "Format the redirection JSON file by deserializing and then serializing with pretty printing.")]
5770
//public bool FormatRedirectsFile { get; set; }
5871

cleanrepo/Program.cs

Lines changed: 188 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44
using System.Data;
55
using System.Diagnostics;
66
using System.Text;
7+
using System.Text.Json;
78
using System.Text.RegularExpressions;
9+
using Tesseract;
10+
using static System.Net.Mime.MediaTypeNames;
811

912
namespace CleanRepo;
1013

@@ -26,13 +29,52 @@ static void Main(string[] args)
2629
// }
2730
//
2831
// ... to avoid hardcoded values in DEBUG preprocessor directives like this:
29-
args = new[] { "--orphaned-snippets", "--snippets-directory=c:\\users\\gewarren\\testrepo3\\snippets", "--xml-source=true" };
32+
args = new[] {
33+
"--help"
34+
};
3035
//args = new[] { "--orphaned-snippets", "--relative-links", "--remove-hops", "--replace-redirects", "--orphaned-includes", "--orphaned-articles", "--orphaned-images",
3136
//"--articles-directory=c:\\users\\gewarren\\docs\\docs\\fundamentals", "--media-directory=c:\\users\\gewarren\\docs\\docs\\core",
3237
//"--includes-directory=c:\\users\\gewarren\\docs\\includes", "--snippets-directory=c:\\users\\gewarren\\docs\\samples\\snippets\\csharp\\vs_snippets_clr",
3338
//"--docfx-directory=c:\\users\\gewarren\\docs", "--url-base-path=/dotnet", "--delete=false"};
3439
#endif
3540

41+
/*
42+
43+
Display help screen
44+
45+
args = new[] {
46+
"--help"
47+
*/
48+
49+
50+
/*
51+
52+
Catalog images with text
53+
54+
args = new[] {
55+
"--catalog-images-with-text",
56+
"--url-base-path=/azure/developer/javascript",
57+
"--ocr-model-directory=c:\\Users\\diberry\\repos\\temp\\tesseract\\tessdata_fast",
58+
"--articles-directory=c:\\Users\\diberry\\repos\\writing\\docs\\azure-dev-docs-pr-2\\articles",
59+
"--media-directory=c:\\Users\\diberry\\repos\\writing\\docs\\azure-dev-docs-pr-2\\articles\\javascript\\media"};
60+
*/
61+
62+
/*
63+
64+
Filter images for text
65+
66+
args = new[] {
67+
"--filter-images-for-text",
68+
"--filter-text-json-file=c:\\Users\\diberry\\repos\\filter-text.json",
69+
"--url-base-path=/azure/developer/javascript",
70+
"--ocr-model-directory=c:\\Users\\diberry\\repos\\temp\\tesseract\\tessdata_fast",
71+
"--articles-directory=c:\\Users\\diberry\\repos\\writing\\docs\\azure-dev-docs-pr-2\\articles",
72+
"--media-directory=c:\\Users\\diberry\\repos\\writing\\docs\\azure-dev-docs-pr-2\\articles\\javascript\\media"};
73+
*/
74+
75+
76+
77+
3678
Parser.Default.ParseArguments<Options>(args).WithParsed(RunOptions);
3779
}
3880

@@ -46,6 +88,8 @@ options.FindOrphanedIncludes is false &&
4688
options.FindOrphanedSnippets is false &&
4789
options.ReplaceRedirectTargets is false &&
4890
options.ReplaceWithRelativeLinks is false &&
91+
options.CatalogImagesWithText is false &&
92+
options.FilterImagesForText is false &&
4993
options.RemoveRedirectHops is false)
5094
{
5195
Console.WriteLine("\nYou didn't specify which function to perform. To see options, use 'CleanRepo.exe -?'.");
@@ -175,8 +219,10 @@ options.ReplaceWithRelativeLinks is false &&
175219
}
176220

177221
// Catalog images
178-
if (options.CatalogImages)
222+
if (options.CatalogImages || options.CatalogImagesWithText || options.FilterImagesForText)
179223
{
224+
225+
180226
if (string.IsNullOrEmpty(options.MediaDirectory))
181227
{
182228
Console.WriteLine("\nEnter the path to the directory where you want to catalog media files:\n");
@@ -194,6 +240,11 @@ options.ReplaceWithRelativeLinks is false &&
194240
Console.WriteLine($"'{options.MediaDirectory}' is not a child of the docfx.json file's directory '{docFxRepo.DocFxDirectory}'.");
195241
return;
196242
}
243+
if (options.CatalogImagesWithText && string.IsNullOrEmpty(options.OcrModelDirectory))
244+
{
245+
Console.WriteLine($"'--ocr-model-directory' directory was not provided.");
246+
return;
247+
}
197248

198249
// Add regex to find image refs similar to 'social_image_url: "/dotnet/media/logo.png"'
199250
// This is done here (dynamically) because it relies on knowing the base path URL.
@@ -203,11 +254,87 @@ options.ReplaceWithRelativeLinks is false &&
203254
if (docFxRepo._imageRefs is null)
204255
docFxRepo._imageRefs = GetMediaFiles(options.MediaDirectory);
205256

206-
Console.WriteLine($"\nCataloging the images in the '{options.MediaDirectory}' directory...\n");
257+
Console.WriteLine($"\nCataloging '{docFxRepo._imageRefs.Count}' images (recursively) in the '{options.MediaDirectory}' directory...\n");
258+
259+
if (options.CatalogImagesWithText && !string.IsNullOrEmpty(options.OcrModelDirectory))
260+
{
261+
// Extract hash keys from the dictionary
262+
List<string> mediaFilesList = docFxRepo._imageRefs.Keys.ToList();
263+
264+
// Pass hash keys to ScanMediaFiles
265+
docFxRepo._ocrRefs = ScanMediaFiles(mediaFilesList, options.OcrModelDirectory);
266+
267+
268+
269+
docFxRepo.OutputImageReferences(true);
270+
}
271+
else if (options.FilterImagesForText && !string.IsNullOrEmpty(options.OcrModelDirectory))
272+
{
273+
if (string.IsNullOrEmpty(options.FilterTextJsonFile))
274+
{
275+
Console.WriteLine($"\nThe filterTextJsonFile can't be empty when requesting FilterImagesForText.");
276+
return;
277+
}
278+
if (!File.Exists(options.FilterTextJsonFile))
279+
{
280+
Console.WriteLine($"\nThe filterTextJsonFile '{options.FilterTextJsonFile}' doesn't exist.");
281+
return;
282+
}
283+
284+
List<string> searchTerms = [];
285+
try
286+
{
287+
string jsonContent = File.ReadAllText(options.FilterTextJsonFile);
288+
289+
searchTerms = JsonSerializer.Deserialize<List<string>>(jsonContent) ?? new List<string>();
290+
}
291+
catch (IOException ioEx)
292+
{
293+
Console.WriteLine($"\nIO error reading '{options.FilterTextJsonFile}': {ioEx.Message}");
294+
}
295+
catch (UnauthorizedAccessException uaEx)
296+
{
297+
Console.WriteLine($"\nAccess error reading '{options.FilterTextJsonFile}': {uaEx.Message}");
298+
}
299+
catch (JsonException jsonEx)
300+
{
301+
Console.WriteLine($"\nError deserializing '{options.FilterTextJsonFile}': {jsonEx.Message}");
302+
}
303+
catch (Exception ex) // Fallback for any other unexpected exceptions
304+
{
305+
Console.WriteLine($"\nUnexpected error: {ex.Message}");
306+
return;
307+
}
308+
if (searchTerms.Count == 0)
309+
{
310+
Console.WriteLine($"\nNo search terms found in '{options.FilterTextJsonFile}'.");
311+
return;
312+
}
313+
314+
// Extract hash keys from the dictionary
315+
List<string> mediaFilesList = docFxRepo._imageRefs.Keys.ToList();
316+
317+
if(mediaFilesList.Count==0)
318+
{
319+
Console.WriteLine($"\nNo media files found.");
320+
}
321+
// Pass hash keys to ScanMediaFiles
322+
Dictionary<string, string> unfilteredResults = ScanMediaFiles(mediaFilesList, options.OcrModelDirectory);
323+
324+
// Filter results
325+
docFxRepo._ocrFilteredRefs = FilterMediaFiles(unfilteredResults, searchTerms);
326+
327+
docFxRepo.OutputImageReferences(true, true);
328+
}
329+
else
330+
{
331+
docFxRepo.OutputImageReferences();
332+
}
333+
207334

208-
docFxRepo.OutputImageReferences();
209335
}
210336

337+
211338
// Find orphaned include-type files
212339
if (options.FindOrphanedIncludes)
213340
{
@@ -1576,7 +1703,7 @@ private static Dictionary<string, List<string>> GetMediaFiles(string mediaDirect
15761703

15771704
Dictionary<string, List<string>> mediaFiles = new(StringComparer.InvariantCultureIgnoreCase);
15781705

1579-
string[] fileExtensions = ["*.png", "*.jpg", "*.gif", "*.svg"];
1706+
string[] fileExtensions = [ "*.png", "*.jpg", "*.gif", "*.svg" ]; // Correctly initialize the array
15801707

15811708
foreach (string extension in fileExtensions)
15821709
{
@@ -1591,6 +1718,62 @@ private static Dictionary<string, List<string>> GetMediaFiles(string mediaDirect
15911718

15921719
return mediaFiles;
15931720
}
1721+
/// <summary>
/// Runs OCR (Tesseract, English model) over the given image files and returns
/// the text that was recognized in each one.
/// </summary>
/// <param name="imageFilePaths">Paths of the image files to scan. May be null or empty.</param>
/// <param name="ocrModelDirectory">Directory that contains the Tesseract model files.</param>
/// <returns>
/// A dictionary (case-insensitive keys) that maps each image file path to the text found
/// in that image. Images that can't be loaded map to an empty string. The dictionary is
/// empty when there are no files to scan.
/// </returns>
private static Dictionary<string, string> ScanMediaFiles(List<string>? imageFilePaths, string ocrModelDirectory)
{
    Dictionary<string, string> ocrDataForFiles = new(StringComparer.InvariantCultureIgnoreCase);

    if (imageFilePaths is null or { Count: 0 })
    {
        Console.WriteLine("\nNo .png/.jpg/.gif/.svg files to scan!");
        return ocrDataForFiles;
    }

    // One engine is shared by all images; it's disposed when the method returns.
    using var engine = new TesseractEngine(ocrModelDirectory, "eng", EngineMode.Default);

    foreach (string imageFilePath in imageFilePaths)
    {
        // A repeated path would make Add() throw, and there's no point scanning it twice.
        if (ocrDataForFiles.ContainsKey(imageFilePath))
        {
            continue;
        }

        try
        {
            // The image and page are released at the end of each iteration.
            using var img = Pix.LoadFromFile(imageFilePath);
            using Page page = engine.Process(img);

            ocrDataForFiles.Add(imageFilePath, page.GetText());
        }
        catch (IOException ioEx)
        {
            // NOTE(review): Pix.LoadFromFile is expected to throw IOException for images it
            // can't read (for example, presumably .svg) - confirm against the Tesseract wrapper.
            // Don't let one unreadable image abort the whole scan. Record an empty result so
            // that every scanned path still has an entry in the returned dictionary.
            Console.WriteLine($"\nSkipping OCR for '{imageFilePath}': {ioEx.Message}");
            ocrDataForFiles.Add(imageFilePath, string.Empty);
        }
    }

    return ocrDataForFiles;
}
1747+
1748+
/// <summary>
/// Groups OCR results by the filter terms that occur in each image's text.
/// </summary>
/// <param name="ocrDictionary">Maps each image file path to the text that OCR found in that image.</param>
/// <param name="filterTerms">
/// Terms to search for (case-insensitive match). The list isn't modified.
/// Null, empty, and repeated terms are ignored.
/// </param>
/// <returns>
/// A dictionary keyed by filter term. Each value lists the (file path, OCR text) pairs
/// whose text contains that term. Terms that match no image are omitted.
/// </returns>
private static Dictionary<string, List<KeyValuePair<string, string>>> FilterMediaFiles(Dictionary<string, string> ocrDictionary, List<string> filterTerms)
{
    // Sort a cleaned-up copy instead of the caller's list:
    //  - a null term would make Contains() throw,
    //  - an empty term matches every image,
    //  - a repeated term would make Dictionary.Add() throw.
    List<string> sortedTerms = filterTerms
        .Where(term => !string.IsNullOrEmpty(term))
        .Distinct()
        .ToList();

    // Terms are added in sorted order. NOTE(review): Dictionary doesn't formally guarantee
    // enumeration order, so the output order relies on current runtime behavior.
    sortedTerms.Sort();

    Dictionary<string, List<KeyValuePair<string, string>>> filterTermFilesDictionary = new();

    foreach (string filterTerm in sortedTerms)
    {
        // Keep both the file path and the text for every image that mentions the term.
        List<KeyValuePair<string, string>> matchedFiles = ocrDictionary
            .Where(imageFile => imageFile.Value.Contains(filterTerm, StringComparison.OrdinalIgnoreCase))
            .ToList();

        if (matchedFiles.Count > 0)
        {
            filterTermFilesDictionary.Add(filterTerm, matchedFiles);
        }
    }

    return filterTermFilesDictionary;
}
15941777

15951778
/// <summary>
15961779
/// Gets all *.yml files recursively, starting in the ancestor directory that contains docfx.json.

cleanrepo/README.md

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@ This command-line tool helps you clean up a DocFx-based content repo. It can:
44

55
- Find and delete markdown files that aren't linked from a TOC file.
66
- Find and delete orphaned image (.png, .jpg, .gif, .svg) files.
7-
- Map images to the files that reference them.
7+
- Map images to the files that reference them and return text from those images.
88
- Find and delete orphaned "shared" markdown files (includes).
99
- Find and delete orphaned snippet (.cs, .vb, .cpp, .fs, and .xaml) files.
1010
- Find and replace links to redirected files.
1111
- Remove daisy chains (or hops) within the redirection files for the docset.
1212
- Replace site-relative links with file-relative links (includes image links).
13+
- Filter image list based on strings found in images.
1314

1415
## Usage
1516

@@ -23,6 +24,10 @@ This command-line tool helps you clean up a DocFx-based content repo. It can:
2324
| --replace-redirects | Find backlinks to redirected files and replace with new target. |
2425
| --remove-hops | Remove daisy chains within the redirection files for the docset. |
2526
| --relative-links | Replace site-relative links with file-relative links. |
27+
| --catalog-images-with-text | Map images to the markdown/YAML files that reference them, with all text found in images. Must set --ocr-model-directory path. |
28+
| --filter-images-for-text | Filter images for text. Must set --ocr-model-directory and --filter-text-json-file paths. |
29+
| --ocr-model-directory | Directory that contains the OCR (Tesseract) models for image scanning. |
30+
| --filter-text-json-file | JSON file of array of strings to filter OCR results with. |
2631

2732
## Usage examples
2833

@@ -43,3 +48,66 @@ This command-line tool helps you clean up a DocFx-based content repo. It can:
4348
```
4449
CleanRepo.exe --orphaned-images
4550
```
51+
52+
## Image-to-text (OCR) examples
53+
54+
The image-to-text (OCR) functionality supported in the `--catalog-images-with-text` and `--filter-images-for-text` options is provided by the [Tesseract](https://www.nuget.org/packages/tesseract/) NuGet package.
55+
56+
### Get the Tesseract models
57+
58+
Decide which Tesseract models you want to use and install them on your system. Tesseract models are generated per operating system and come in a variety of sizes. You also need to download the language data files for Tesseract 4.0.0 or later from [tesseract-tessdata](https://github.com/tesseract-ocr/tessdata/). Use the `--ocr-model-directory` option to specify the directory that contains the models.
59+
60+
### Catalog images with text
61+
62+
To catalog the images with text:
63+
64+
```console
65+
CleanRepo --catalog-images-with-text \
66+
--url-base-path=/azure/developer/javascript \
67+
--articles-directory=c:\\Users\\diberry\\repos\\writing\\docs\\azure-dev-docs-pr-2\\articles \
68+
--media-directory=c:\\Users\\diberry\\repos\\writing\\docs\\azure-dev-docs-pr-2\\articles\\javascript\\media \
69+
--ocr-model-directory=c:\\Users\\diberry\\repos\\temp\\tesseract\\tessdata_fast
70+
```
71+
72+
The output file is prefixed with `ImageFiles-` and looks like:
73+
74+
```json
75+
76+
```
77+
78+
### Filter images with text
79+
80+
81+
To filter images based on an array of strings, set `--filter-text-json-file` to the path of a JSON file that contains the text to filter for:
82+
83+
```json
84+
["Azure","Microsoft"]
85+
```
86+
87+
88+
```console
89+
CleanRepo --filter-images-for-text \
90+
--filter-text-json-file=c:\\Users\\diberry\\repos\\filter-text.json \
91+
--url-base-path=/azure/developer/javascript \
92+
--ocr-model-directory=c:\\Users\\diberry\\repos\\temp\\tesseract\\tessdata_fast \
93+
--articles-directory=c:\\Users\\diberry\\repos\\writing\\docs\\azure-dev-docs-pr-2\\articles \
94+
--media-directory=c:\\Users\\diberry\\repos\\writing\\docs\\azure-dev-docs-pr-2\\articles\\javascript\\media
95+
```
96+
97+
The output file is prefixed with `FilteredOcrImageFiles-` and looks like:
98+
99+
```json
100+
{
101+
"Azure": [
102+
{
103+
"Key": "c:\\Users\\diberry\\repos\\writing\\docs\\azure-dev-docs-pr-2\\articles\\javascript\\media\\visual-studio-code-azure-resources-extension-remove-resource-group.png",
104+
"Value": "*J File Edit Selection View Go Run Terminal Help\n\nQa AZURE oo\n\n\u003E FUNCTIONS\n-v RESOURCE GROUPS\n\\ \u0026 Pay-As-You-Go-diberry Y\n|\nEdit Tags...\n\u00A3\nView Properties\nte Open in Portal\nRefresh\n90\n\n \n\n \n\n \n\n \n"
105+
}],
106+
"Microsoft": [
107+
{
108+
"Key": "c:\\Users\\diberry\\repos\\writing\\docs\\azure-dev-docs-pr-2\\articles\\javascript\\media\\azure-function-resource-group-management\\azure-portal-function-application-insights-link.png",
109+
"Value": "Function App\n\n\u00AE Overview\n\n \n\n| View Application Insights data G)\n\n \n\n \n\n \n\n \n\nActivity log Link to an Application Insights resource\n8. Access control (IAM)\n\u00A9 tes \u00A9 temepiseaieiin yt eb ise ea\n\n@ Diagnose and solve problems\n\n\u00A9 Microsoft Defender for Cloud @ totum Apptzation ihe of check that Applicaton nights OK ard the insramentaion key are removed rm your apliaton,\n\n\u0026 events (preview)\n\nFunctions O) \u00E9sarteg etiam caer toe Gorman Vier Tc home\nApplication Insights. You have the option to disable non-essential data collection, Learn more\n(A) Functions\n\u00A9 App keys\nChange your resource\nB App files\n\n \n\nDeployment\n\n= Deployment slots\n@ Deployment Center\nSettings\n\nHl Configuration\n\n\u0026\u0026 Authentication\n\n\u00AE Application insights\n\n \n"
110+
}
111+
]
112+
}
113+
```

0 commit comments

Comments
 (0)