Skip to content

Commit 6911f31

Browse files
committed
Try to repair xref offset by looking for all startxref and fix #1040
1 parent bf7c3c0 commit 6911f31

File tree

5 files changed

+140
-21
lines changed

5 files changed

+140
-21
lines changed

src/UglyToad.PdfPig.Core/ReadHelper.cs

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,34 @@ public static bool IsString(IInputBytes bytes, string s)
189189

190190
return found;
191191
}
192-
192+
193+
194+
/// <summary>
/// Whether the given byte sequence is at this position in the input.
/// Resets to the starting offset once read.
/// </summary>
/// <param name="bytes">The input to inspect; its offset is restored via <c>Seek</c> before returning.</param>
/// <param name="s">The byte sequence expected to start at the current byte.</param>
/// <returns>
/// <see langword="true"/> if every byte of <paramref name="s"/> matches the input starting at the current byte,
/// <see langword="false"/> on the first mismatch or if the input ends before the whole sequence was compared.
/// </returns>
public static bool IsString(IInputBytes bytes, ReadOnlySpan<byte> s)
{
    var startOffset = bytes.CurrentOffset;
    var found = true;

    for (var i = 0; i < s.Length; i++)
    {
        if (bytes.CurrentByte != s[i])
        {
            found = false;
            break;
        }

        // Only advance while there is more of the sequence left to compare. If the input
        // cannot advance, the remaining bytes of the sequence cannot be present, so this is
        // not a match (previously the stale current byte was compared again).
        if (i < s.Length - 1 && !bytes.MoveNext())
        {
            found = false;
            break;
        }
    }

    // Always restore the caller's position, whether or not the sequence matched.
    bytes.Seek(startOffset);

    return found;
}
219+
193220
/// <summary>
194221
/// Read a long from the input.
195222
/// </summary>
@@ -252,14 +279,6 @@ public static int ReadInt(IInputBytes bytes)
252279
throw new PdfDocumentFormatException($"Error: Expected an integer type at offset {bytes.CurrentOffset}, instead got \'{OtherEncodings.BytesAsLatin1String(intBytes)}\'");
253280
}
254281
}
255-
256-
/// <summary>
257-
/// Whether the given character is a space.
258-
/// </summary>
259-
public static bool IsSpace(int c)
260-
{
261-
return c == ' ';
262-
}
263282

264283
/// <summary>
265284
/// Whether the given character value is a valid hex value.

src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,21 @@
77

88
public class GithubIssuesTests
99
{
10+
[Fact]
public void Issue1040()
{
    // Regression test for issue 1040: with lenient parsing enabled,
    // the first two pages of the document must both yield letters.
    var options = new ParsingOptions() { UseLenientParsing = true };
    var path = IntegrationHelpers.GetSpecificTestDocumentPath("pdfpig-issue-1040.pdf");

    using var document = PdfDocument.Open(path, options);

    var firstPage = document.GetPage(1);
    Assert.NotEmpty(firstPage.Letters);

    var secondPage = document.GetPage(2);
    Assert.NotEmpty(secondPage.Letters);
}
24+
1025
[Fact]
1126
public void Issue1013()
1227
{
Binary file not shown.

src/UglyToad.PdfPig/Parser/FileStructure/FileTrailerParser.cs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
namespace UglyToad.PdfPig.Parser.FileStructure
22
{
33
using System;
4-
using System.Collections.Generic;
54
using Core;
65
using Tokenization.Scanner;
76
using Tokens;
@@ -26,7 +25,7 @@ internal static class FileTrailerParser
2625
/// </summary>
2726
private const int EndOfFileSearchRange = 2048;
2827

29-
private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
28+
internal static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
3029

3130
public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
3231
{

src/UglyToad.PdfPig/Parser/FileStructure/XrefOffsetValidator.cs

Lines changed: 96 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,16 @@
77
using Tokenization.Scanner;
88
using Tokens;
99

10-
internal class XrefOffsetValidator
10+
internal sealed class XrefOffsetValidator
1111
{
12-
private static readonly long MinimumSearchOffset = 6;
12+
private const long MinimumSearchOffset = 6;
13+
14+
private static ReadOnlySpan<byte> XRefBytes => "xref"u8;
15+
private static ReadOnlySpan<byte> SpaceObjBytes => " obj"u8;
1316

1417
private readonly ILog log;
1518

19+
private List<long>? bfSearchStartXRefTablesOffsets;
1620
private List<long>? bfSearchXRefTablesOffsets;
1721
private List<long>? bfSearchXRefStreamsOffsets;
1822

@@ -90,16 +94,18 @@ private long BruteForceSearchForXref(long xrefOffset, ISeekableTokenScanner scan
9094

9195
BfSearchForXRefStreams(reader);
9296

93-
if (bfSearchXRefTablesOffsets != null)
97+
if (bfSearchXRefTablesOffsets != null && bfSearchXRefTablesOffsets.Count > 0)
9498
{
9599
// TODO to be optimized, this won't work in every case
96100
newOffsetTable = SearchNearestValue(bfSearchXRefTablesOffsets, xrefOffset);
97101
}
98-
if (bfSearchXRefStreamsOffsets != null)
102+
103+
if (bfSearchXRefStreamsOffsets != null && bfSearchXRefStreamsOffsets.Count > 0)
99104
{
100105
// TODO to be optimized, this won't work in every case
101106
newOffsetStream = SearchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset);
102107
}
108+
103109
// choose the nearest value
104110
if (newOffsetTable > -1 && newOffsetStream > -1)
105111
{
@@ -126,9 +132,91 @@ private long BruteForceSearchForXref(long xrefOffset, ISeekableTokenScanner scan
126132
newOffset = newOffsetStream;
127133
bfSearchXRefStreamsOffsets!.Remove(newOffsetStream);
128134
}
135+
else
136+
{
137+
log.Warn("Trying to repair xref offset by looking for all startxref.");
138+
if (TryBruteForceSearchForXrefFromStartxref(xrefOffset, scanner, reader, out long newOffsetFromStartxref))
139+
{
140+
newOffset = newOffsetFromStartxref;
141+
}
142+
}
143+
129144
return newOffset;
130145
}
131146

147+
/// <summary>
/// Attempts to repair the xref offset using the <c>startxref</c> markers found in the file:
/// picks the marker offset nearest to <paramref name="xrefOffset"/>, reads the numeric token that
/// follows it and accepts that number if an xref table keyword or an xref stream is found there.
/// </summary>
/// <param name="xrefOffset">The offset that could not be validated and for which a replacement is sought.</param>
/// <param name="scanner">Scanner used to read tokens; its position is restored before returning.</param>
/// <param name="reader">The raw input that is searched for <c>startxref</c> markers.</param>
/// <param name="newOffset">The repaired offset on success, otherwise <c>-1</c>.</param>
/// <returns><see langword="true"/> if a usable offset was found, otherwise <see langword="false"/>.</returns>
private bool TryBruteForceSearchForXrefFromStartxref(long xrefOffset, ISeekableTokenScanner scanner, IInputBytes reader, out long newOffset)
{
    // Length of the "startxref" keyword that precedes the offset value.
    const int startXRefKeywordLength = 9;

    newOffset = -1;

    BruteForceSearchForStartxref(reader);

    // Mirror the guards used for the table/stream candidate lists: nothing to pick from
    // means there is nothing to repair with.
    var candidates = bfSearchStartXRefTablesOffsets;
    if (candidates is null || candidates.Count == 0)
    {
        return false;
    }

    long newStartXRefOffset = SearchNearestValue(candidates, xrefOffset);

    // Reject a negative result as well as one past the end of the input, so we never seek
    // to a position derived from an invalid marker offset.
    if (newStartXRefOffset < 0 || newStartXRefOffset >= reader.Length)
    {
        return false;
    }

    var startOffset = scanner.CurrentPosition;

    // Skip over the keyword so the next token is the offset written after it.
    scanner.Seek(newStartXRefOffset + startXRefKeywordLength);

    long tempNewOffset = -1;
    if (scanner.MoveNext() && scanner.CurrentToken is NumericToken token)
    {
        tempNewOffset = token.Long;
    }

    // The value comes from a damaged file, so only follow it when it lies inside the input.
    if (tempNewOffset > -1 && tempNewOffset < reader.Length)
    {
        scanner.Seek(tempNewOffset);
        scanner.MoveNext();

        // Either an xref table keyword or a valid xref stream at that position is acceptable;
        // the stream check is only needed when the table check fails.
        if (ReferenceEquals(scanner.CurrentToken, OperatorToken.Xref)
            || CheckXRefStreamOffset(tempNewOffset, scanner, true))
        {
            newOffset = tempNewOffset;
        }
    }

    scanner.Seek(startOffset);

    return newOffset != -1;
}
183+
184+
/// <summary>
/// Scans the input once for every <c>startxref</c> keyword and caches the offsets found in
/// <c>bfSearchStartXRefTablesOffsets</c>. Subsequent calls return immediately.
/// The input's offset is restored before returning.
/// </summary>
/// <param name="bytes">The input to scan.</param>
private void BruteForceSearchForStartxref(IInputBytes bytes)
{
    // The search has already run; reuse the cached result.
    if (bfSearchStartXRefTablesOffsets is not null)
    {
        return;
    }

    // A pdf may contain more than one startxref entry, so collect all of them.
    var matches = new List<long>();
    bfSearchStartXRefTablesOffsets = matches;

    var keywordLength = FileTrailerParser.StartXRefBytes.Length;
    var originalOffset = bytes.CurrentOffset;

    bytes.Seek(MinimumSearchOffset);

    while (bytes.MoveNext() && !bytes.IsAtEnd())
    {
        if (!ReadHelper.IsString(bytes, FileTrailerParser.StartXRefBytes))
        {
            continue;
        }

        var matchOffset = bytes.CurrentOffset;

        // Only keep the match when the byte made current by seeking to matchOffset - 1 is
        // whitespace (presumably the byte just before the keyword - confirm against
        // IInputBytes.Seek semantics).
        bytes.Seek(matchOffset - 1);
        if (ReadHelper.IsWhitespace(bytes.CurrentByte))
        {
            matches.Add(matchOffset);
        }

        // Continue scanning after the keyword.
        bytes.Seek(matchOffset + keywordLength);
    }

    bytes.Seek(originalOffset);
}
219+
132220
private void BruteForceSearchForTables(IInputBytes bytes)
133221
{
134222
if (bfSearchXRefTablesOffsets != null)
@@ -146,7 +234,7 @@ private void BruteForceSearchForTables(IInputBytes bytes)
146234
// search for xref tables
147235
while (bytes.MoveNext() && !bytes.IsAtEnd())
148236
{
149-
if (ReadHelper.IsString(bytes, "xref"))
237+
if (ReadHelper.IsString(bytes, XRefBytes))
150238
{
151239
var newOffset = bytes.CurrentOffset;
152240

@@ -180,11 +268,9 @@ private void BfSearchForXRefStreams(IInputBytes bytes)
180268
bytes.Seek(MinimumSearchOffset);
181269

182270
// search for XRef streams
183-
var objString = " obj";
184-
185271
while (bytes.MoveNext() && !bytes.IsAtEnd())
186272
{
187-
if (!ReadHelper.IsString(bytes, "xref"))
273+
if (!ReadHelper.IsString(bytes, XRefBytes))
188274
{
189275
continue;
190276
}
@@ -209,7 +295,7 @@ private void BfSearchForXRefStreams(IInputBytes bytes)
209295

210296
for (int j = 0; j < 10; j++)
211297
{
212-
if (ReadHelper.IsString(bytes, objString))
298+
if (ReadHelper.IsString(bytes, SpaceObjBytes))
213299
{
214300
long tempOffset = currentOffset - 1;
215301

@@ -224,7 +310,7 @@ private void BfSearchForXRefStreams(IInputBytes bytes)
224310
bytes.Seek(tempOffset);
225311

226312
// is the digit preceded by a space?
227-
if (ReadHelper.IsSpace(bytes.CurrentByte))
313+
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
228314
{
229315
int length = 0;
230316
bytes.Seek(--tempOffset);

0 commit comments

Comments
 (0)