Skip to content

Commit 85fc63d

Browse files
EliotJonesBobLd
authored and committed
rework numeric tokenizer hot path
The existing numeric tokenizer involved allocations and string parsing. Since the number formats in PDF files are fairly predictable, we can improve this substantially.
1 parent 5abdfcb commit 85fc63d

File tree

2 files changed

+173
-160
lines changed

2 files changed

+173
-160
lines changed

src/UglyToad.PdfPig.Tests/Tokenization/NumericTokenizerTests.cs

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ public void FirstByteInvalid_ReturnsFalse(string s)
2929
public static IEnumerable<object[]> ValidNumberTestData => new []
3030
{
3131
new object[] {"0", 0},
32+
new object[] {"0003", 3},
3233
new object[] {"1", 1},
3334
new object[] {"2", 2},
3435
new object[] {"3", 3},
@@ -55,19 +56,29 @@ public void FirstByteInvalid_ReturnsFalse(string s)
5556
new object[] { "4.", 4},
5657
new object[] { "-.002", -0.002},
5758
new object[] { "0.0", 0},
58-
new object[] {"1.57e3", 1570}
59+
new object[] {"1.57e3", 1570},
60+
new object[] {"1.57e-3", 0.00157, 0.0000001},
61+
new object[] {"1.24e1", 12.4},
62+
new object[] { "1.457E2", 145.7 }
5963
};
6064

6165
[Theory]
6266
[MemberData(nameof(ValidNumberTestData))]
63-
public void ParsesValidNumbers(string s, double expected)
67+
public void ParsesValidNumbers(string s, double expected, double? tolerance = null)
6468
{
6569
var input = StringBytesTestConverter.Convert(s);
6670

6771
var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
6872

6973
Assert.True(result);
70-
Assert.Equal(expected, AssertNumericToken(token).Data);
74+
if (tolerance.HasValue)
75+
{
76+
Assert.Equal(expected, AssertNumericToken(token).Data, tolerance: tolerance.Value);
77+
}
78+
else
79+
{
80+
Assert.Equal(expected, AssertNumericToken(token).Data);
81+
}
7182
}
7283

7384
[Fact]
Lines changed: 159 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -1,195 +1,197 @@
1-
namespace UglyToad.PdfPig.Tokenization
1+
#nullable enable
2+
namespace UglyToad.PdfPig.Tokenization;
3+
4+
using System;
5+
using Core;
6+
using Tokens;
7+
8+
internal sealed class NumericTokenizer : ITokenizer
29
{
3-
using System;
4-
using System.Globalization;
5-
using System.Text;
6-
using Core;
7-
using Tokens;
10+
private const byte Zero = 48;
11+
private const byte Nine = 57;
12+
private const byte Negative = (byte)'-';
13+
private const byte Positive = (byte)'+';
14+
private const byte Period = (byte)'.';
15+
private const byte ExponentLower = (byte)'e';
16+
private const byte ExponentUpper = (byte)'E';
17+
18+
public bool ReadsNextByte => true;
819

9-
internal sealed class NumericTokenizer : ITokenizer
20+
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? token)
1021
{
11-
private const byte Zero = 48;
12-
private const byte Nine = 57;
22+
token = null;
1323

14-
public bool ReadsNextByte { get; } = true;
24+
var readBytes = 0;
1525

16-
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
17-
{
18-
token = null;
26+
// Everything before the decimal part.
27+
var isNegative = false;
28+
double integerPart = 0;
1929

20-
using var characters = new ValueStringBuilder(stackalloc char[32]);
30+
// Everything after the decimal point.
31+
var hasFraction = false;
32+
long fractionalPart = 0;
33+
var fractionalCount = 0;
2134

22-
var initialSymbol = currentByte is (byte)'-' or (byte)'+';
35+
// Support scientific notation in some font files.
36+
var hasExponent = false;
37+
var isExponentNegative = false;
38+
var exponentPart = 0;
2339

24-
if ((currentByte >= Zero && currentByte <= Nine) || currentByte == '.')
40+
do
41+
{
42+
var b = inputBytes.CurrentByte;
43+
if (b >= Zero && b <= Nine)
2544
{
26-
characters.Append((char)currentByte);
45+
if (hasExponent)
46+
{
47+
exponentPart = (exponentPart * 10) + (b - Zero);
48+
}
49+
else if (hasFraction)
50+
{
51+
fractionalPart = (fractionalPart * 10) + (b - Zero);
52+
fractionalCount++;
53+
}
54+
else
55+
{
56+
integerPart = (integerPart * 10) + (b - Zero);
57+
}
2758
}
28-
else if (initialSymbol)
59+
else if (b == Positive)
2960
{
30-
characters.Append((char) currentByte);
61+
// Has no impact
3162
}
32-
else
63+
else if (b == Negative)
3364
{
34-
return false;
65+
if (hasExponent)
66+
{
67+
isExponentNegative = true;
68+
}
69+
else
70+
{
71+
isNegative = true;
72+
}
3573
}
36-
37-
var previousSymbol = initialSymbol;
38-
39-
while (inputBytes.MoveNext())
74+
else if (b == Period)
4075
{
41-
var b = inputBytes.CurrentByte;
42-
43-
if (b == '+' || b == '-')
76+
if (hasExponent || hasFraction)
4477
{
45-
if (previousSymbol)
46-
{
47-
continue;
48-
}
49-
50-
characters.Append((char) b);
51-
previousSymbol = true;
78+
return false;
5279
}
53-
else if ((b >= Zero && b <= Nine) ||
54-
b == '.' ||
55-
b == 'E' ||
56-
b == 'e')
80+
81+
hasFraction = true;
82+
}
83+
else if (b == ExponentLower || b == ExponentUpper)
84+
{
85+
// Don't allow leading exponent.
86+
if (readBytes == 0)
5787
{
58-
previousSymbol = false;
59-
characters.Append((char)b);
88+
return false;
6089
}
61-
else
90+
91+
if (hasExponent)
6292
{
63-
break;
93+
return false;
6494
}
95+
96+
hasExponent = true;
6597
}
98+
else
99+
{
100+
// No valid first character.
101+
if (readBytes == 0)
102+
{
103+
return false;
104+
}
66105

67-
var str = characters.ToString();
106+
break;
107+
}
68108

69-
switch (str)
70-
{
71-
case "-1":
72-
token = NumericToken.MinusOne;
73-
return true;
74-
case "-":
75-
case ".":
76-
case "0":
77-
case "0000":
78-
token = NumericToken.Zero;
79-
return true;
80-
case "1":
81-
token = NumericToken.One;
82-
return true;
83-
case "2":
84-
token = NumericToken.Two;
85-
return true;
86-
case "3":
87-
token = NumericToken.Three;
88-
return true;
89-
case "4":
90-
token = NumericToken.Four;
91-
return true;
92-
case "5":
93-
token = NumericToken.Five;
94-
return true;
95-
case "6":
96-
token = NumericToken.Six;
97-
return true;
98-
case "7":
99-
token = NumericToken.Seven;
100-
return true;
101-
case "8":
102-
token = NumericToken.Eight;
103-
return true;
104-
case "9":
105-
token = NumericToken.Nine;
106-
return true;
107-
case "10":
108-
token = NumericToken.Ten;
109-
return true;
110-
case "11":
111-
token = NumericToken.Eleven;
112-
return true;
113-
case "12":
114-
token = NumericToken.Twelve;
115-
return true;
116-
case "13":
117-
token = NumericToken.Thirteen;
118-
return true;
119-
case "14":
120-
token = NumericToken.Fourteen;
121-
return true;
122-
case "15":
123-
token = NumericToken.Fifteen;
124-
return true;
125-
case "16":
126-
token = NumericToken.Sixteen;
127-
return true;
128-
case "17":
129-
token = NumericToken.Seventeen;
130-
return true;
131-
case "18":
132-
token = NumericToken.Eighteen;
133-
return true;
134-
case "19":
135-
token = NumericToken.Nineteen;
136-
return true;
137-
case "20":
138-
token = NumericToken.Twenty;
139-
return true;
140-
case "100":
141-
token = NumericToken.OneHundred;
142-
return true;
143-
case "500":
144-
token = NumericToken.FiveHundred;
145-
return true;
146-
case "1000":
147-
token = NumericToken.OneThousand;
148-
return true;
149-
default:
150-
if (!double.TryParse(str, NumberStyles.Any, CultureInfo.InvariantCulture, out var value))
151-
{
152-
if (TryParseInvalidNumber(str, out value))
153-
{
154-
token = new NumericToken(value);
155-
return true;
156-
}
157-
158-
return false;
159-
}
160-
161-
token = new NumericToken(value);
162-
return true;
163-
}
164-
}
109+
readBytes++;
110+
} while (inputBytes.MoveNext());
165111

166-
private static bool TryParseInvalidNumber(string numeric, out double result)
112+
if (hasExponent && !isExponentNegative)
167113
{
168-
result = 0;
114+
// Apply the multiplication before any fraction logic to avoid loss of precision.
115+
// E.g. 1.53E3 should be exactly 1,530.
169116

170-
if (!numeric.Contains("-") && !numeric.Contains("+"))
117+
// Move the whole part to the left of the decimal point.
118+
var combined = integerPart * Pow10(fractionalCount) + fractionalPart;
119+
120+
// For 1.53E3 we changed this to 153 above, 2 fractional parts, so now we are missing (3-2) 1 additional power of 10.
121+
var shift = exponentPart - fractionalCount;
122+
123+
if (shift >= 0)
124+
{
125+
integerPart = combined * Pow10(shift);
126+
}
127+
else
171128
{
172-
return false;
129+
// Still a positive exponent, but not enough to fully shift
130+
// For example 1.457E2 becomes 1,457 but shift is (2-3) -1, the outcome should be 145.7
131+
integerPart = combined / Pow10(-shift);
173132
}
174133

175-
var parts = numeric.Split(new string[] { "+", "-" }, StringSplitOptions.RemoveEmptyEntries);
134+
hasFraction = false;
135+
hasExponent = false;
136+
}
176137

177-
if (parts.Length == 0)
138+
if (hasFraction && fractionalCount > 0)
139+
{
140+
switch (fractionalCount)
178141
{
179-
return false;
142+
case 1:
143+
integerPart += fractionalPart / 10.0;
144+
break;
145+
case 2:
146+
integerPart += fractionalPart / 100.0;
147+
break;
148+
case 3:
149+
integerPart += fractionalPart / 1000.0;
150+
break;
151+
default:
152+
integerPart += fractionalPart / Math.Pow(10, fractionalCount);
153+
break;
180154
}
155+
}
181156

182-
foreach (var part in parts)
183-
{
184-
if (!double.TryParse(part, NumberStyles.Any, CultureInfo.InvariantCulture, out var partNumber))
185-
{
186-
return false;
187-
}
157+
if (hasExponent)
158+
{
159+
var signedExponent = isExponentNegative ? -exponentPart : exponentPart;
160+
integerPart *= Math.Pow(10, signedExponent);
161+
}
188162

189-
result += partNumber;
190-
}
163+
if (isNegative)
164+
{
165+
integerPart = -integerPart;
166+
}
191167

192-
return true;
168+
if (integerPart == 0)
169+
{
170+
token = NumericToken.Zero;
171+
}
172+
else
173+
{
174+
token = new NumericToken(integerPart);
193175
}
176+
177+
return true;
178+
}
179+
180+
private static double Pow10(int exp)
181+
{
182+
return exp switch
183+
{
184+
0 => 1,
185+
1 => 10,
186+
2 => 100,
187+
3 => 1000,
188+
4 => 10000,
189+
5 => 100000,
190+
6 => 1000000,
191+
7 => 10000000,
192+
8 => 100000000,
193+
9 => 1000000000,
194+
_ => Math.Pow(10, exp)
195+
};
194196
}
195-
}
197+
}

0 commit comments

Comments
 (0)