Skip to content

Commit 1ed9e01

Browse files
chuckbeasley and Chuck Beasley authored
Performance improvements and .NET 9 support (UglyToad#1116)
* Refactor letter handling by orientation for efficiency

  Improved the processing of letters based on their text orientation by preallocating separate lists for each orientation (horizontal, rotate270, rotate180, rotate90, and other). This change reduces multiple calls to `GetWords` and minimizes enumerations and allocations, enhancing performance and readability. Each letter is now added to the appropriate list in a single iteration over the `letters` collection.

* Update target frameworks to include net9.0

  Expanded compatibility in `UglyToad.PdfPig.csproj` by adding `net9.0` to the list of target frameworks, alongside existing versions.

* Add .NET 9.0 support and refactor key components

  Updated project files for UglyToad.PdfPig to target .NET 9.0, enhancing compatibility with the latest framework features. Refactored `GetBlocks` in `DocstrumBoundingBoxes.cs` for improved input handling and performance. Significantly optimized `NearestNeighbourWordExtractor.cs` by replacing multiple lists with an array of buckets and implementing parallel processing for better efficiency. Consistent updates across `Fonts`, `Tests`, `Tokenization`, and `Tokens` project files to include .NET 9.0 support.

* Improve null checks and optimize list handling

  - Updated null check for `words` in `DocstrumBoundingBoxes.cs` for better readability and performance.
  - Changed from `ToList()` to `ToArray()` to avoid unnecessary enumeration.
  - Added `results.TrimExcess()` in `NearestNeighbourWordExtractor.cs` to optimize memory usage.

---------

Co-authored-by: Chuck Beasley <CBeasley@kilpatricktownsend.com>
1 parent 83d6fc6 commit 1ed9e01

File tree

9 files changed

+56
-34
lines changed

9 files changed

+56
-34
lines changed

src/UglyToad.PdfPig.Core/UglyToad.PdfPig.Core.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22
<PropertyGroup>
3-
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
3+
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
44
<LangVersion>12</LangVersion>
55
<Version>0.1.12-alpha001</Version>
66
<IsTestProject>False</IsTestProject>

src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,19 @@ public DocstrumBoundingBoxes(DocstrumBoundingBoxesOptions options)
4848
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
4949
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
5050
{
51-
if (words?.Any() != true)
51+
if (words is null)
5252
{
5353
return Array.Empty<TextBlock>();
5454
}
5555

56-
return GetBlocks(words.ToList(),
56+
// Avoid multiple enumeration and unnecessary ToArray() if already a list
57+
var wordList = words as IReadOnlyList<Word> ?? words.ToArray();
58+
if (wordList.Count == 0)
59+
{
60+
return Array.Empty<TextBlock>();
61+
}
62+
63+
return GetBlocks(wordList,
5764
options.WithinLineBounds, options.WithinLineMultiplier, options.WithinLineBinSize,
5865
options.BetweenLineBounds, options.BetweenLineMultiplier, options.BetweenLineBinSize,
5966
options.AngularDifferenceBounds,

src/UglyToad.PdfPig.DocumentLayoutAnalysis/UglyToad.PdfPig.DocumentLayoutAnalysis.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22
<PropertyGroup>
3-
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
3+
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
44
<LangVersion>12</LangVersion>
55
<Version>0.1.12-alpha001</Version>
66
<IsTestProject>False</IsTestProject>

src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -51,34 +51,49 @@ public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
5151

5252
if (options.GroupByOrientation)
5353
{
54-
// axis aligned
55-
List<Word> words = GetWords(
56-
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
57-
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
58-
options.Filter, options.MaxDegreeOfParallelism);
59-
60-
words.AddRange(GetWords(
61-
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
62-
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
63-
options.Filter, options.MaxDegreeOfParallelism));
54+
var buckets = new List<Letter>[5];
55+
for (int i = 0; i < buckets.Length; i++) buckets[i] = new List<Letter>();
6456

65-
words.AddRange(GetWords(
66-
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
67-
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
68-
options.Filter, options.MaxDegreeOfParallelism));
69-
70-
words.AddRange(GetWords(
71-
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
72-
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
73-
options.Filter, options.MaxDegreeOfParallelism));
57+
foreach (var l in letters)
58+
{
59+
switch (l.TextOrientation)
60+
{
61+
case TextOrientation.Horizontal: buckets[0].Add(l); break;
62+
case TextOrientation.Rotate270: buckets[1].Add(l); break;
63+
case TextOrientation.Rotate180: buckets[2].Add(l); break;
64+
case TextOrientation.Rotate90: buckets[3].Add(l); break;
65+
default: buckets[4].Add(l); break;
66+
}
67+
}
7468

75-
// not axis aligned
76-
words.AddRange(GetWords(
77-
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
78-
options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
79-
options.Filter, options.MaxDegreeOfParallelism));
69+
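// Shared result list. List<Word> is not thread-safe, so every AddRange below is guarded by lock (results); word order across buckets follows thread completion order.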
70+
var results = new List<Word>(letters.Count); // Pre-allocate for performance
8071

81-
return words;
72+
// Limit parallelism to avoid oversubscription.
73+
var parallelOptions = new System.Threading.Tasks.ParallelOptions
74+
{
75+
MaxDegreeOfParallelism = options.MaxDegreeOfParallelism > 0 ? options.MaxDegreeOfParallelism : Environment.ProcessorCount
76+
};
77+
78+
// Use partitioner for better load balancing and avoid ConcurrentBag overhead
79+
System.Threading.Tasks.Parallel.ForEach(
80+
System.Collections.Concurrent.Partitioner.Create(0, buckets.Length),
81+
parallelOptions,
82+
range =>
83+
{
84+
for (int i = range.Item1; i < range.Item2; i++)
85+
{
86+
if (buckets[i].Count == 0) continue;
87+
var measure = (i == 4) ? options.DistanceMeasure : options.DistanceMeasureAA;
88+
var words = GetWords(buckets[i], options.MaximumDistance, measure, options.FilterPivot, options.Filter, options.MaxDegreeOfParallelism);
89+
lock (results)
90+
{
91+
results.AddRange(words);
92+
}
93+
}
94+
});
95+
results.TrimExcess();
96+
return results;
8297
}
8398
else
8499
{

src/UglyToad.PdfPig.Fonts/UglyToad.PdfPig.Fonts.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22
<PropertyGroup>
3-
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
3+
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
44
<LangVersion>12</LangVersion>
55
<Version>0.1.12-alpha001</Version>
66
<IsTestProject>False</IsTestProject>

src/UglyToad.PdfPig.Tests/UglyToad.PdfPig.Tests.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22

33
<PropertyGroup>
4-
<TargetFrameworks>net471;net8.0</TargetFrameworks>
4+
<TargetFrameworks>net471;net8.0;net9.0</TargetFrameworks>
55
<IsTestProject>true</IsTestProject>
66
<IsPackable>false</IsPackable>
77
<DebugType>full</DebugType>

src/UglyToad.PdfPig.Tokenization/UglyToad.PdfPig.Tokenization.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22
<PropertyGroup>
3-
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
3+
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
44
<LangVersion>12</LangVersion>
55
<Version>0.1.12-alpha001</Version>
66
<IsTestProject>False</IsTestProject>

src/UglyToad.PdfPig.Tokens/UglyToad.PdfPig.Tokens.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22
<PropertyGroup>
3-
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
3+
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
44
<LangVersion>12</LangVersion>
55
<Version>0.1.12-alpha001</Version>
66
<IsTestProject>False</IsTestProject>

src/UglyToad.PdfPig/UglyToad.PdfPig.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22
<PropertyGroup>
3-
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
3+
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
44
<LangVersion>12</LangVersion>
55
<Version>0.1.12-alpha001</Version>
66
<IsTestProject>False</IsTestProject>

0 commit comments

Comments
 (0)