Skip to content

Commit 7c9019b

Browse files
committed
Optimize the FirstPassParser to just fetch a final chunk before doing things char-by-char backwards
1 parent bb9c578 commit 7c9019b

File tree

3 files changed

+48
-4
lines changed

3 files changed

+48
-4
lines changed

src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -395,9 +395,10 @@ private string ReadComment()
395395
private Type1DataToken ReadCharString(int length)
396396
{
397397
// Skip preceding space.
398-
bytes.MoveNext();
399-
// TODO: may be wrong
400-
// bytes.MoveNext();
398+
if (bytes.Peek() is { } ws && ReadHelper.IsWhitespace(ws))
399+
{
400+
bytes.MoveNext();
401+
}
401402

402403
byte[] data = new byte[length];
403404
for (int i = 0; i < length; i++)

src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.StartXref.cs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,56 @@ internal static partial class FirstPassParser
1010
{
1111
private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
1212

13+
public const long EndOfFileBufferSize = 1024;
14+
1315
public static StartXRefLocation GetFirstCrossReferenceOffset(
1416
IInputBytes bytes,
1517
ISeekableTokenScanner scanner,
1618
ILog log)
1719
{
20+
// We used to read backward through the file, but this is quite expensive for streams that directly wrap OS files.
21+
// Instead we fetch the last 1024 bytes of the file and do a memory search, as a cheap first attempt. This is significantly faster
22+
// in practice when no in-process caching of the file is involved.
23+
//
24+
// If that fails (which should never happen in practice) we fall back to the old method of reading backwards.
1825
var fileLength = bytes.Length;
26+
{
27+
var fetchFrom = Math.Max(bytes.Length - EndOfFileBufferSize, 0L);
28+
29+
bytes.Seek(fetchFrom);
30+
31+
Span<byte> byteBuffer = new byte[bytes.Length - fetchFrom]; // TODO: Maybe use PoolArray?
32+
33+
int n = bytes.Read(byteBuffer);
34+
35+
if (n == byteBuffer.Length)
36+
{
37+
int lx = byteBuffer.LastIndexOf("startxref"u8);
38+
39+
if (lx < 0)
40+
{
41+
// As in the old code, we also try a mangled version of the keyword ("startref").
42+
lx = byteBuffer.LastIndexOf("startref"u8);
43+
}
44+
45+
if (lx >= 0)
46+
{
47+
scanner.Seek(fetchFrom + lx);
48+
49+
if (scanner.TryReadToken(out OperatorToken startXrefOp) && (startXrefOp.Data == "startxref" || startXrefOp.Data == "startref"))
50+
{
51+
var pos = GetNumericTokenFollowingCurrent(scanner);
52+
53+
log.Debug($"Found startxref at {pos}");
54+
55+
return new StartXRefLocation(fetchFrom + lx, pos);
56+
}
57+
}
58+
59+
}
60+
}
1961

62+
// Now fall through to the old code
2063
var buffer = new CircularByteBuffer(StartXRefBytes.Length);
2164

2265
// Start from the end of the file

src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ public static IReadOnlyDictionary<IndirectReference, long> GetObjectLocations(II
5757
{
5858
var next = bytes.Peek();
5959

60-
if (next.HasValue && next == 'n')
60+
if (next == 'n')
6161
{
6262
if (ReadHelper.IsString(bytes, "endobj"))
6363
{

0 commit comments

Comments
 (0)