#nullable enable
namespace UglyToad.PdfPig.Tokenization;

using System;
using Core;
using Tokens;

/// <summary>
/// Tokenizer which parses a number directly from the input bytes without building an intermediate string.
/// Accepts an optional sign, an integer part, an optional fractional part and an optional
/// exponent introduced by <c>e</c> or <c>E</c>.
/// </summary>
internal sealed class NumericTokenizer : ITokenizer
{
    private const byte Zero = 48;
    private const byte Nine = 57;
    private const byte Negative = (byte)'-';
    private const byte Positive = (byte)'+';
    private const byte Period = (byte)'.';
    private const byte ExponentLower = (byte)'e';
    private const byte ExponentUpper = (byte)'E';

    // Largest fractional accumulator value for which appending one more decimal digit
    // (value * 10 + 9) is still guaranteed to fit in a long.
    private const long MaxAccumulableFraction = (long.MaxValue - 9) / 10;

    // Exponent digits saturate here so the int accumulator can never wrap around.
    // 10^10000 is already far outside the range of a double (infinity, or zero when negated).
    private const int MaxExponentMagnitude = 10000;

    // 10^0 to 10^22: every power of ten in this range is exactly representable as a double,
    // so multiplying or dividing by one of them introduces a single rounding step only.
    private static readonly double[] ExactPowersOfTen =
    {
        1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11,
        1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22
    };

    /// <summary>
    /// Always <see langword="true"/> for this tokenizer.
    /// </summary>
    public bool ReadsNextByte => true;

    /// <summary>
    /// Attempts to read a number starting at the current position of <paramref name="inputBytes"/>.
    /// </summary>
    /// <param name="currentByte">
    /// Not read directly; parsing starts from <c>inputBytes.CurrentByte</c>.
    /// NOTE(review): assumes the caller passes the same byte here - confirm against callers.
    /// </param>
    /// <param name="inputBytes">The input to read from. It is advanced past every byte consumed as part of the number.</param>
    /// <param name="token">
    /// The parsed number when the method returns <see langword="true"/>, otherwise <see langword="null"/>.
    /// </param>
    /// <returns>
    /// <see langword="false"/> if the first byte cannot start a number (including a leading exponent marker),
    /// if a second decimal point or a decimal point inside the exponent is found,
    /// or if a second exponent marker is found; otherwise <see langword="true"/>.
    /// </returns>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? token)
    {
        token = null;

        var readBytes = 0;

        // Everything before the decimal point.
        var isNegative = false;
        double integerPart = 0;

        // Everything after the decimal point.
        long fractionalPart = 0;
        var fractionalCount = 0;
        var hasFraction = false;

        // Support scientific notation in some font files.
        var hasExponent = false;
        var isExponentNegative = false;
        var exponentPart = 0;

        do
        {
            var b = inputBytes.CurrentByte;

            if (b >= Zero && b <= Nine)
            {
                var digit = b - Zero;

                if (hasExponent)
                {
                    // Saturate rather than overflow; see MaxExponentMagnitude.
                    exponentPart = Math.Min((exponentPart * 10) + digit, MaxExponentMagnitude);
                }
                else if (hasFraction)
                {
                    // Once the accumulator is close to overflowing, the remaining digits are beyond
                    // the precision of a double: they are still consumed but no longer accumulated.
                    // The accumulator never shrinks, so every later digit is skipped as well.
                    if (fractionalPart <= MaxAccumulableFraction)
                    {
                        fractionalPart = (fractionalPart * 10) + digit;
                        fractionalCount++;
                    }
                }
                else
                {
                    integerPart = (integerPart * 10) + digit;
                }
            }
            else if (b == Positive)
            {
                // Has no impact.
            }
            else if (b == Negative)
            {
                if (hasExponent)
                {
                    isExponentNegative = true;
                }
                else
                {
                    isNegative = true;
                }
            }
            else if (b == Period)
            {
                if (hasExponent || hasFraction)
                {
                    return false;
                }

                hasFraction = true;
            }
            else if (b == ExponentLower || b == ExponentUpper)
            {
                // Don't allow a leading exponent or more than one exponent marker.
                if (readBytes == 0 || hasExponent)
                {
                    return false;
                }

                hasExponent = true;
            }
            else
            {
                // No valid first character.
                if (readBytes == 0)
                {
                    return false;
                }

                break;
            }

            readBytes++;
        } while (inputBytes.MoveNext());

        double value;

        if (hasExponent && !isExponentNegative)
        {
            // Apply the multiplication before any fraction logic to avoid loss of precision.
            // E.g. 1.53E3 should be exactly 1,530.

            // Move the whole number to the left of the decimal point: 1.53 becomes 153.
            var combined = (integerPart * Pow10(fractionalCount)) + fractionalPart;

            // For 1.53E3 two of the three powers of ten were used above, so one more is still missing.
            var shift = exponentPart - fractionalCount;

            if (combined == 0)
            {
                // Zero stays zero for any exponent; this avoids 0 * Infinity = NaN for huge exponents.
                value = 0;
            }
            else if (shift >= 0)
            {
                value = combined * Pow10(shift);
            }
            else
            {
                // Still a positive exponent, but not enough to shift every fractional digit.
                // For example 1.457E2 becomes 1,457 with a shift of (2 - 3) = -1, the outcome should be 145.7.
                value = combined / Pow10(-shift);
            }
        }
        else
        {
            value = integerPart;

            if (fractionalCount > 0)
            {
                value += fractionalPart / Pow10(fractionalCount);
            }

            if (hasExponent)
            {
                // Negative exponent. Dividing by an exact power of ten is more accurate than
                // multiplying by its inexact reciprocal (3e-1 gives 0.3 rather than 0.30000000000000004).
                // Outside the exact range keep the multiplication so very small results still
                // underflow gradually instead of dividing by infinity.
                value = exponentPart < ExactPowersOfTen.Length
                    ? value / ExactPowersOfTen[exponentPart]
                    : value * Math.Pow(10, -exponentPart);
            }
        }

        if (isNegative)
        {
            value = -value;
        }

        // Also true for negative zero, so "-0" and a lone "-" share the cached zero token.
        token = value == 0 ? NumericToken.Zero : new NumericToken(value);

        return true;
    }

    /// <summary>
    /// Returns 10 raised to <paramref name="exp"/>, using an exact table lookup for 0 to 22
    /// and <see cref="Math.Pow(double, double)"/> for anything else.
    /// </summary>
    private static double Pow10(int exp)
    {
        return exp >= 0 && exp < ExactPowersOfTen.Length
            ? ExactPowersOfTen[exp]
            : Math.Pow(10, exp);
    }
}
0 commit comments