Skip to content

Commit c3c477a

Browse files
ricflamsBobLd
authored and committed
Bugfix and optimize GetStartXrefPosition
The bugfix was the important part, but the optimization is pretty nice too. - Bugfix: If startxref was found so far back (e.g. at the very beginning, which can be the case for linearized PDFs) that we ended up setting actualStartOffset to 0, then the loop would exit immediately without actually searching that part. - Optimization: GetStartXrefPosition would search for startxref in the last 2048 bytes and then double that search range (looking back 4096, 8192, etc. bytes) to look for startxref until the entire file was searched. This was rather inefficient, since each step would search the same parts over and over again. This has been changed to properly search (still increasingly larger) chunks that don't overlap. On a test of 5000 PDFs, that reduced their load time by 10%. - Change: No need for the exception to say that startxref couldn't be found "in the last 2048 characters", since the entire file was searched anyway.
1 parent 4dab2ef commit c3c477a

File tree

1 file changed

+36
-30
lines changed

1 file changed

+36
-30
lines changed

src/UglyToad.PdfPig/Parser/FileStructure/FileTrailerParser.cs

Lines changed: 36 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -76,55 +76,61 @@ public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableToke
7676
return numeric.Long;
7777
}
7878

79-
private static long GetStartXrefPosition(IInputBytes bytes, int offsetFromEnd)
79+
private static long GetStartXrefPosition(IInputBytes bytes, int chunkSize)
8080
{
81-
int startXref = 0;
82-
int startXrefsCount = 0;
81+
// Initialize startpos to the end to get the loop below started
82+
var startPos = bytes.Length;
8383

84-
var index = 0;
85-
86-
var fileLength = bytes.Length;
87-
var multiple = 1;
88-
89-
var actualStartOffset = Math.Max(0, fileLength - (offsetFromEnd * multiple));
9084
do
9185
{
92-
multiple *= 2;
93-
bytes.Seek(actualStartOffset);
86+
// Make a sliding-window search region where each subsequent search will look further
87+
// back and not search in the already searched chunks. Make sure to search just beyond
88+
// the chunk to account for the possibility of startxref crossing chunk-boundaries.
89+
// The start-position is inclusive and the end-position is exclusive for the chunk.
90+
// Each search will look in an increasingly bigger chunk, doubling every time.
91+
var endPos = Math.Min(startPos + StartXRefBytes.Length, bytes.Length);
92+
startPos = Math.Max(0, endPos - chunkSize);
93+
chunkSize *= 2;
94+
95+
// Prepare to search this region; mark startXrefPos as "not found".
96+
bytes.Seek(startPos);
97+
var startXrefPos = -1L;
98+
var index = 0;
9499

95100
// Start scanning the file bytes.
96-
while (bytes.MoveNext())
101+
while (bytes.CurrentOffset < endPos && bytes.MoveNext())
97102
{
98103
if (bytes.CurrentByte == StartXRefBytes[index])
99104
{
100105
// We might be reading "startxref".
101-
index++;
106+
if (++index == StartXRefBytes.Length)
107+
{
108+
// Set this "startxref" (position from the start of the document to the first 's').
109+
startXrefPos = (int)bytes.CurrentOffset - StartXRefBytes.Length;
110+
111+
// Continue scanning to make sure we find the last startxref in case there are more
112+
// than just one, which can be the case for incrementally updated PDFs with multiple
113+
// generations of sections.
114+
index = 0;
115+
}
102116
}
103117
else
104118
{
105-
index = 0;
106-
}
107-
108-
if (index == StartXRefBytes.Length)
109-
{
110-
// Set this "startxref" (position from the start of the document to the first 's').
111-
startXref = (int)bytes.CurrentOffset - StartXRefBytes.Length;
112-
startXrefsCount++;
113-
114-
// Continue scanning in case there are further "startxref"s. Not sure if this ever happens.
119+
// Not a match for "startxref" so set index back to 0
115120
index = 0;
116121
}
117122
}
118123

119-
actualStartOffset = Math.Max(0, fileLength - (offsetFromEnd * multiple));
120-
} while (startXrefsCount == 0 && actualStartOffset > 0);
124+
// If we found a startxref then we're done.
125+
if (startXrefPos >= 0)
126+
{
127+
return startXrefPos;
128+
}
121129

122-
if (startXrefsCount == 0)
123-
{
124-
throw new PdfDocumentFormatException($"Could not find the startxref within the last {offsetFromEnd} characters.");
125-
}
130+
} while (startPos > 0); // Keep on searching until we've read from the very start.
126131

127-
return startXref;
132+
// No startxref position was found.
133+
throw new PdfDocumentFormatException($"Could not find the startxref");
128134
}
129135
}
130136
}

0 commit comments

Comments
 (0)