Skip to content

Commit bb9c578

Browse files
committed
Avoid a lot of seeks by making most tokenizers no longer read too far and then seek back
1 parent 29114ac commit bb9c578

File tree

17 files changed

+130
-92
lines changed

17 files changed

+130
-92
lines changed

src/UglyToad.PdfPig.Core/ReadHelper.cs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,17 @@ public static class ReadHelper
2424
/// </summary>
2525
public const byte AsciiCarriageReturn = 13;
2626

27+
/// <summary>
28+
/// The tab '\t' character.
29+
/// </summary>
30+
public const byte AsciiTab = 9;
31+
2732
private static readonly HashSet<int> EndOfNameCharacters =
2833
[
2934
' ',
3035
AsciiCarriageReturn,
3136
AsciiLineFeed,
32-
9,
37+
AsciiTab,
3338
'>',
3439
'<',
3540
'[',

src/UglyToad.PdfPig.Core/StreamInputBytes.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,17 @@ public bool IsAtEnd()
9696
/// <inheritdoc />
9797
public void Seek(long position)
9898
{
99+
var current = CurrentOffset;
100+
if (position == current)
101+
{
102+
return;
103+
}
104+
else if (peekByte.HasValue && position == current + 1)
105+
{
106+
MoveNext();
107+
return;
108+
}
109+
99110
isAtEnd = false;
100111
peekByte = null;
101112

src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1ArrayTokenizer.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ public sealed class Type1ArrayTokenizer : ITokenizer
1414
/// <inheritdoc />
1515
public bool ReadsNextByte { get; } = false;
1616

17-
private static readonly string[] Space = [" "];
17+
private static readonly char[] Space = [' '];
1818

1919
/// <inheritdoc />
2020
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)

src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,11 @@ public static Type1Font Parse(IInputBytes inputBytes, int length1, int length2)
8888
{
8989
int offset = 0;
9090

91+
while (inputBytes.Peek() is { } b && ReadHelper.IsWhitespace(b))
92+
{
93+
inputBytes.MoveNext();
94+
}
95+
9196
while (inputBytes.MoveNext())
9297
{
9398
if (inputBytes.CurrentByte == (byte)ClearToMark[offset])

src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
{
33
using System;
44
using System.Collections.Generic;
5+
using System.Diagnostics;
56
using System.Globalization;
67
using System.Text;
78
using Core;
@@ -41,35 +42,43 @@ private Type1Token ReadNextToken()
4142
do
4243
{
4344
skip = false;
44-
while (bytes.MoveNext())
45+
while (bytes.Peek() is { } b)
4546
{
46-
var b = bytes.CurrentByte;
4747
var c = (char)b;
4848

4949
switch (c)
5050
{
5151
case '%':
52+
bytes.MoveNext();
5253
comments.Add(ReadComment());
5354
break;
5455
case '(':
56+
bytes.MoveNext();
5557
return ReadString();
5658
case ')':
5759
throw new InvalidOperationException("Encountered an end of string ')' outside of string.");
5860
case '[':
61+
bytes.MoveNext();
5962
return new Type1Token(c, Type1Token.TokenType.StartArray);
6063
case ']':
64+
bytes.MoveNext();
6165
return new Type1Token(c, Type1Token.TokenType.EndArray);
6266
case '{':
67+
bytes.MoveNext();
6368
return new Type1Token(c, Type1Token.TokenType.StartProc);
6469
case '}':
70+
bytes.MoveNext();
6571
return new Type1Token(c, Type1Token.TokenType.EndProc);
6672
case '/':
6773
{
68-
var name = ReadLiteral();
74+
bytes.MoveNext();
75+
TryReadLiteral(out var name);
76+
Debug.Assert(name != null);
6977
return new Type1Token(name, Type1Token.TokenType.Literal);
7078
}
7179
case '<':
7280
{
81+
bytes.MoveNext();
7382
var following = bytes.Peek();
7483
if (following == '<')
7584
{
@@ -81,6 +90,7 @@ private Type1Token ReadNextToken()
8190
}
8291
case '>':
8392
{
93+
bytes.MoveNext();
8494
var following = bytes.Peek();
8595
if (following == '>')
8696
{
@@ -94,23 +104,24 @@ private Type1Token ReadNextToken()
94104
{
95105
if (ReadHelper.IsWhitespace(b))
96106
{
107+
bytes.MoveNext();
97108
skip = true;
98109
break;
99110
}
100111

101112
if (b == 0)
102113
{
114+
bytes.MoveNext();
103115
skip = true;
104116
break;
105117
}
106118

107-
if (TryReadNumber(c, out var number))
119+
if (TryReadNumber(out var number))
108120
{
109121
return number;
110122
}
111123

112-
var name = ReadLiteral(c);
113-
if (name == null)
124+
if (!TryReadLiteral(out var name))
114125
{
115126
throw new InvalidOperationException($"The binary portion of the type 1 font was invalid at position {bytes.CurrentOffset}.");
116127
}
@@ -197,12 +208,21 @@ char GetNext()
197208
return null;
198209
}
199210

200-
private bool TryReadNumber(char c, out Type1Token numberToken)
211+
private bool TryReadNumber(out Type1Token numberToken)
201212
{
202213
char GetNext()
203214
{
204215
bytes.MoveNext();
205-
return (char)bytes.CurrentByte;
216+
return (char)(bytes.Peek() ?? 0);
217+
}
218+
219+
char c = (char)(bytes.Peek() ?? 0);
220+
221+
if (!((c >= '0' && c <= '9') || c is '+' or '-'))
222+
{
223+
// Easy out. Not a valid number
224+
numberToken = null;
225+
return false;
206226
}
207227

208228
numberToken = null;
@@ -251,8 +271,6 @@ char GetNext()
251271
else
252272
{
253273
// integer
254-
bytes.Seek(bytes.CurrentOffset - 1);
255-
256274
numberToken = new Type1Token(sb.ToString(), Type1Token.TokenType.Integer);
257275
return true;
258276
}
@@ -309,7 +327,6 @@ char GetNext()
309327
}
310328
}
311329

312-
bytes.Seek(bytes.CurrentOffset - 1);
313330
if (radix != null)
314331
{
315332
var number = Convert.ToInt32(sb.ToString(), int.Parse(radix.ToString(), CultureInfo.InvariantCulture));
@@ -323,14 +340,9 @@ char GetNext()
323340
return true;
324341
}
325342

326-
private string ReadLiteral(char? previousCharacter = null)
343+
private bool TryReadLiteral(out string? value)
327344
{
328345
literalBuffer.Clear();
329-
if (previousCharacter.HasValue)
330-
{
331-
literalBuffer.Append(previousCharacter);
332-
}
333-
334346
do
335347
{
336348
var b = bytes.Peek();
@@ -350,8 +362,16 @@ private string ReadLiteral(char? previousCharacter = null)
350362
literalBuffer.Append(c);
351363
} while (bytes.MoveNext());
352364

353-
var literal = literalBuffer.ToString();
354-
return literal.Length == 0 ? null : literal;
365+
if (literalBuffer.Length > 0)
366+
{
367+
value = literalBuffer.ToString();
368+
return true;
369+
}
370+
else
371+
{
372+
value = null;
373+
return false;
374+
}
355375
}
356376

357377
private string ReadComment()
@@ -377,7 +397,7 @@ private Type1DataToken ReadCharString(int length)
377397
// Skip preceding space.
378398
bytes.MoveNext();
379399
// TODO: may be wrong
380-
// bytes.MoveNext();
400+
// bytes.MoveNext();
381401

382402
byte[] data = new byte[length];
383403
for (int i = 0; i < length; i++)

src/UglyToad.PdfPig.Tests/Tokenization/NumericTokenizerTests.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,10 @@ public void OnlyParsesNumberPart()
9191
Assert.True(result);
9292
Assert.Equal(135.6654, AssertNumericToken(token).Data);
9393

94-
Assert.Equal('/', (char)input.Bytes.CurrentByte);
94+
if (tokenizer.ReadsNextByte)
95+
Assert.Equal('/', (char)input.Bytes.CurrentByte);
96+
else
97+
Assert.Equal('4', (char)input.Bytes.CurrentByte);
9598
}
9699

97100
[Fact]

src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ internal sealed class ArrayTokenizer : ITokenizer
99
{
1010
private readonly bool usePdfDocEncoding;
1111

12-
public bool ReadsNextByte { get; } = false;
12+
public bool ReadsNextByte => false;
1313

1414
public ArrayTokenizer(bool usePdfDocEncoding)
1515
{

src/UglyToad.PdfPig.Tokenization/CommentTokenizer.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
internal sealed class CommentTokenizer : ITokenizer
88
{
9-
public bool ReadsNextByte { get; } = true;
9+
public bool ReadsNextByte => false;
1010

1111
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
1212
{
@@ -17,10 +17,11 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok
1717
return false;
1818
}
1919

20-
using var builder = new ValueStringBuilder();
20+
using var builder = new ValueStringBuilder(stackalloc char[32]);
2121

22-
while (inputBytes.MoveNext() && !ReadHelper.IsEndOfLine(inputBytes.CurrentByte))
22+
while (inputBytes.Peek() is { } c && !ReadHelper.IsEndOfLine(c))
2323
{
24+
inputBytes.MoveNext();
2425
builder.Append((char) inputBytes.CurrentByte);
2526
}
2627

src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ internal class DictionaryTokenizer : ITokenizer
1111
private readonly IReadOnlyList<NameToken> requiredKeys;
1212
private readonly bool useLenientParsing;
1313

14-
public bool ReadsNextByte { get; } = false;
14+
public bool ReadsNextByte => false;
1515

1616
/// <summary>
1717
/// Create a new <see cref="DictionaryTokenizer"/>.

src/UglyToad.PdfPig.Tokenization/EndOfLineTokenizer.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
public sealed class EndOfLineTokenizer : ITokenizer
1010
{
1111
/// <inheritdoc />
12-
public bool ReadsNextByte { get; } = false;
12+
public bool ReadsNextByte => false;
1313

1414
/// <inheritdoc />
1515
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)

0 commit comments

Comments
 (0)