Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 28 additions & 9 deletions src/UglyToad.PdfPig.Core/ReadHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,34 @@ public static bool IsString(IInputBytes bytes, string s)

return found;
}



/// <summary>
/// Checks whether the bytes of <paramref name="s"/> occur in the input starting at its current byte.
/// The input is sought back to the offset it had on entry before the result is returned,
/// whether or not the sequence matched.
/// </summary>
/// <param name="bytes">The input to compare against.</param>
/// <param name="s">The byte sequence to look for.</param>
/// <returns><c>true</c> if every byte of <paramref name="s"/> compared equal; otherwise <c>false</c>.</returns>
public static bool IsString(IInputBytes bytes, ReadOnlySpan<byte> s)
{
    var origin = bytes.CurrentOffset;
    var matched = true;

    for (var i = 0; i < s.Length; i++)
    {
        if (bytes.CurrentByte == s[i])
        {
            // Byte agrees: advance the input and compare the next one.
            bytes.MoveNext();
            continue;
        }

        // First disagreement settles the answer.
        matched = false;
        break;
    }

    // Undo any advancing done during the comparison.
    bytes.Seek(origin);

    return matched;
}

/// <summary>
/// Read a long from the input.
/// </summary>
Expand Down Expand Up @@ -252,14 +279,6 @@ public static int ReadInt(IInputBytes bytes)
throw new PdfDocumentFormatException($"Error: Expected an integer type at offset {bytes.CurrentOffset}, instead got \'{OtherEncodings.BytesAsLatin1String(intBytes)}\'");
}
}

/// <summary>
/// Determines whether the supplied character code is the space character (<c>' '</c>).
/// </summary>
/// <param name="c">The character code to test.</param>
/// <returns><c>true</c> only when <paramref name="c"/> equals <c>' '</c>.</returns>
public static bool IsSpace(int c) => c == ' ';

/// <summary>
/// Whether the given character value is a valid hex value.
Expand Down
15 changes: 15 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,21 @@

public class GithubIssuesTests
{
[Fact]
public void Issue1040()
{
    // Regression test for GitHub issue 1040: with lenient parsing the document must open
    // and both of its first two pages must yield letters.
    var pdfPath = IntegrationHelpers.GetSpecificTestDocumentPath("pdfpig-issue-1040.pdf");
    var lenientOptions = new ParsingOptions { UseLenientParsing = true };

    using (var pdf = PdfDocument.Open(pdfPath, lenientOptions))
    {
        for (var pageNumber = 1; pageNumber <= 2; pageNumber++)
        {
            var page = pdf.GetPage(pageNumber);
            Assert.NotEmpty(page.Letters);
        }
    }
}

[Fact]
public void Issue1013()
{
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using Core;
using Tokenization.Scanner;
using Tokens;
Expand All @@ -26,7 +25,7 @@ internal static class FileTrailerParser
/// </summary>
private const int EndOfFileSearchRange = 2048;

private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
internal static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;

public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
{
Expand Down
106 changes: 96 additions & 10 deletions src/UglyToad.PdfPig/Parser/FileStructure/XrefOffsetValidator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,16 @@
using Tokenization.Scanner;
using Tokens;

internal class XrefOffsetValidator
internal sealed class XrefOffsetValidator
{
private static readonly long MinimumSearchOffset = 6;
private const long MinimumSearchOffset = 6;

private static ReadOnlySpan<byte> XRefBytes => "xref"u8;
private static ReadOnlySpan<byte> SpaceObjBytes => " obj"u8;

private readonly ILog log;

private List<long>? bfSearchStartXRefTablesOffsets;
private List<long>? bfSearchXRefTablesOffsets;
private List<long>? bfSearchXRefStreamsOffsets;

Expand Down Expand Up @@ -90,16 +94,18 @@ private long BruteForceSearchForXref(long xrefOffset, ISeekableTokenScanner scan

BfSearchForXRefStreams(reader);

if (bfSearchXRefTablesOffsets != null)
if (bfSearchXRefTablesOffsets != null && bfSearchXRefTablesOffsets.Count > 0)
{
// TODO to be optimized, this won't work in every case
newOffsetTable = SearchNearestValue(bfSearchXRefTablesOffsets, xrefOffset);
}
if (bfSearchXRefStreamsOffsets != null)

if (bfSearchXRefStreamsOffsets != null && bfSearchXRefStreamsOffsets.Count > 0)
{
// TODO to be optimized, this won't work in every case
newOffsetStream = SearchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset);
}

// choose the nearest value
if (newOffsetTable > -1 && newOffsetStream > -1)
{
Expand All @@ -126,9 +132,91 @@ private long BruteForceSearchForXref(long xrefOffset, ISeekableTokenScanner scan
newOffset = newOffsetStream;
bfSearchXRefStreamsOffsets!.Remove(newOffsetStream);
}
else
{
log.Warn("Trying to repair xref offset by looking for all startxref.");
if (TryBruteForceSearchForXrefFromStartxref(xrefOffset, scanner, reader, out long newOffsetFromStartxref))
{
newOffset = newOffsetFromStartxref;
}
}

return newOffset;
}

/// <summary>
/// Last-resort repair of an xref offset. Collects the offsets of the <c>startxref</c> keywords in
/// <paramref name="reader"/>, takes the one nearest to <paramref name="xrefOffset"/>, reads the numeric
/// token that follows the keyword and accepts that number only when an <c>xref</c> operator or an
/// xref stream is found at it.
/// </summary>
/// <param name="xrefOffset">The offset being repaired; used to pick the nearest <c>startxref</c>.</param>
/// <param name="scanner">Scanner used to read and probe the candidate offset. It is sought back to its entry position once probing is done.</param>
/// <param name="reader">The raw input that is searched for <c>startxref</c>.</param>
/// <param name="newOffset">The validated offset, or <c>-1</c> when no usable offset was found.</param>
/// <returns><c>true</c> when <paramref name="newOffset"/> was set to a validated offset; otherwise <c>false</c>.</returns>
private bool TryBruteForceSearchForXrefFromStartxref(long xrefOffset, ISeekableTokenScanner scanner, IInputBytes reader, out long newOffset)
{
    newOffset = -1;

    BruteForceSearchForStartxref(reader);

    // No startxref keyword was located, so there is nothing to pick a nearest value from.
    if (bfSearchStartXRefTablesOffsets is null || bfSearchStartXRefTablesOffsets.Count == 0)
    {
        return false;
    }

    long newStartXRefOffset = SearchNearestValue(bfSearchStartXRefTablesOffsets, xrefOffset);

    // A negative result would otherwise be turned into a bogus seek target below.
    if (newStartXRefOffset < 0 || newStartXRefOffset >= reader.Length)
    {
        return false;
    }

    var startOffset = scanner.CurrentPosition;

    // Skip the keyword itself so that the next token is the offset written after it.
    scanner.Seek(newStartXRefOffset + FileTrailerParser.StartXRefBytes.Length);

    long candidateOffset = -1;
    if (scanner.MoveNext() && scanner.CurrentToken is NumericToken token)
    {
        candidateOffset = token.Long;
    }

    // The number comes from a damaged file: only probe it when it lies inside the input.
    if (candidateOffset > -1 && candidateOffset < reader.Length)
    {
        scanner.Seek(candidateOffset);

        // Only trust CurrentToken when the scanner actually produced a token here.
        var isXrefTable = scanner.MoveNext() && ReferenceEquals(scanner.CurrentToken, OperatorToken.Xref);

        if (isXrefTable || CheckXRefStreamOffset(candidateOffset, scanner, true))
        {
            newOffset = candidateOffset;
        }
    }

    scanner.Seek(startOffset);

    return newOffset != -1;
}

/// <summary>
/// Scans the input for occurrences of the <c>startxref</c> keyword and caches their offsets in
/// <c>bfSearchStartXRefTablesOffsets</c>. An occurrence is recorded only when the byte read at the
/// offset immediately before it is whitespace. If the cache already exists the method returns at once.
/// </summary>
/// <param name="bytes">The input to scan; it is sought back to its entry offset after the scan.</param>
private void BruteForceSearchForStartxref(IInputBytes bytes)
{
    if (bfSearchStartXRefTablesOffsets != null)
    {
        return;
    }

    // a pdf may contain more than one startxref entry
    var located = new List<long>();
    bfSearchStartXRefTablesOffsets = located;

    var resumeAt = bytes.CurrentOffset;

    bytes.Seek(MinimumSearchOffset);

    // search for startxref
    while (bytes.MoveNext() && !bytes.IsAtEnd())
    {
        if (!ReadHelper.IsString(bytes, FileTrailerParser.StartXRefBytes))
        {
            continue;
        }

        var keywordOffset = bytes.CurrentOffset;

        // Inspect the byte at keywordOffset - 1: the keyword only counts when whitespace precedes it.
        bytes.Seek(keywordOffset - 1);
        if (ReadHelper.IsWhitespace(bytes.CurrentByte))
        {
            located.Add(keywordOffset);
        }

        // Continue scanning after the keyword; 9 is the length of "startxref".
        bytes.Seek(keywordOffset + 9);
    }

    bytes.Seek(resumeAt);
}

private void BruteForceSearchForTables(IInputBytes bytes)
{
if (bfSearchXRefTablesOffsets != null)
Expand All @@ -146,7 +234,7 @@ private void BruteForceSearchForTables(IInputBytes bytes)
// search for xref tables
while (bytes.MoveNext() && !bytes.IsAtEnd())
{
if (ReadHelper.IsString(bytes, "xref"))
if (ReadHelper.IsString(bytes, XRefBytes))
{
var newOffset = bytes.CurrentOffset;

Expand Down Expand Up @@ -180,11 +268,9 @@ private void BfSearchForXRefStreams(IInputBytes bytes)
bytes.Seek(MinimumSearchOffset);

// search for XRef streams
var objString = " obj";

while (bytes.MoveNext() && !bytes.IsAtEnd())
{
if (!ReadHelper.IsString(bytes, "xref"))
if (!ReadHelper.IsString(bytes, XRefBytes))
{
continue;
}
Expand All @@ -209,7 +295,7 @@ private void BfSearchForXRefStreams(IInputBytes bytes)

for (int j = 0; j < 10; j++)
{
if (ReadHelper.IsString(bytes, objString))
if (ReadHelper.IsString(bytes, SpaceObjBytes))
{
long tempOffset = currentOffset - 1;

Expand All @@ -224,7 +310,7 @@ private void BfSearchForXRefStreams(IInputBytes bytes)
bytes.Seek(tempOffset);

// is the digit preceded by a space?
if (ReadHelper.IsSpace(bytes.CurrentByte))
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
{
int length = 0;
bytes.Seek(--tempOffset);
Expand Down
Loading