Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,76 @@ public void ScansStringWithWeirdWeirdDoubleSymbolNumerics()
AssertCorrectToken<NameToken, string>(tokens[3], "F1");
AssertCorrectToken<NumericToken, double>(tokens[4], 8);
AssertCorrectToken<OperatorToken, string>(tokens[5], "Tf");
}

[Fact]
public void SkipsCommentsInStreams()
{
    // Three dictionaries, each preceded by a '%' comment line.
    const string content =
        """
        % 641 0 obj
        <<
        /Type /Encoding
        /Differences [16/quotedblleft/quotedblright 21/endash 27/ff/fi/fl/ffi 39/quoteright/parenleft/parenright 43/plus/comma/hyphen/period/slash/zero/one/two/three/four/five/six/seven/eight/nine/colon 64/at/A/B/C/D/E/F/G/H/I/J/K/L/M/N/O/P/Q/R/S/T/U/V/W/X/Y/Z/bracketleft 93/bracketright 97/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/braceleft 125/braceright 225/aacute 232/egrave/eacute 252/udieresis]
        >>
        % 315 0 obj
        <<
        /Type /Font
        /Subtype /Type1
        /BaseFont /IXNPPI+CMEX10
        /FontDescriptor 661 0 R
        /FirstChar 80
        /LastChar 88
        /Widths 644 0 R
        /ToUnicode 699 0 R
        >>
        % 306 0 obj
        <<
        /Type /Font
        /Subtype /Type1
        /BaseFont /MSNKTF+CMMI10
        /FontDescriptor 663 0 R
        /FirstChar 58
        /LastChar 119
        /Widths 651 0 R
        /ToUnicode 700 0 R
        >>
        """;

    // Scanned as stream content: the comment lines yield no tokens, leaving only the dictionaries.
    var streamTokens = ReadAllTokens(treatAsStream: true);

    Assert.Equal(3, streamTokens.Count);
    Assert.All(streamTokens, token => Assert.IsType<DictionaryToken>(token));

    // Scanned as non-stream content: every comment line is returned as a token of its own.
    var plainTokens = ReadAllTokens(treatAsStream: false);

    Assert.Equal(6, plainTokens.Count);
    Assert.Equal(3, plainTokens.OfType<CommentToken>().Count());
    Assert.Equal(3, plainTokens.OfType<DictionaryToken>().Count());

    // Runs a fresh scanner over the sample content and collects every token it produces.
    List<IToken> ReadAllTokens(bool treatAsStream)
    {
        var collected = new List<IToken>();

        var tokenScanner = new CoreTokenScanner(
            StringBytesTestConverter.Convert(content, false).Bytes,
            true,
            isStream: treatAsStream);

        while (tokenScanner.MoveNext())
        {
            collected.Add(tokenScanner.CurrentToken);
        }

        return collected;
    }
}

private static void AssertCorrectToken<T, TData>(IToken token, TData expected) where T : IDataToken<TData>
{
var cast = Assert.IsType<T>(token);
Expand Down
30 changes: 29 additions & 1 deletion src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ public class CoreTokenScanner : ISeekableTokenScanner

private bool hasBytePreRead;
private bool isInInlineImage;
/// <summary>
/// Whether the bytes being scanned are the content of a PDF stream. Assigned once in the constructor.
/// When <see langword="true"/>, a '%' byte read by <see cref="MoveNext"/> causes the scanner to discard
/// every following byte up to and including the next end-of-line character.
/// </summary>
/// <remarks>
/// '%' only identifies comments outside of PDF streams and strings, so inside a stream it should not be
/// treated as the start of a comment.
/// PDFBox skips all of a line following a comment character inside streams, see:
/// https://github.com/apache/pdfbox/blob/0e1c42dace1c3a2631d5309f662de5628b80fda6/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java#L1319
/// </remarks>
private readonly bool isStream;

/// <summary>
/// Create a new <see cref="CoreTokenScanner"/> from the input.
Expand All @@ -52,7 +60,8 @@ public CoreTokenScanner(
bool usePdfDocEncoding,
ScannerScope scope = ScannerScope.None,
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null,
bool useLenientParsing = false)
bool useLenientParsing = false,
bool isStream = false)
{
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
this.usePdfDocEncoding = usePdfDocEncoding;
Expand All @@ -62,6 +71,7 @@ public CoreTokenScanner(
this.scope = scope;
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
this.useLenientParsing = useLenientParsing;
this.isStream = isStream;
}

/// <inheritdoc />
Expand Down Expand Up @@ -94,13 +104,25 @@ public bool MoveNext()
{
var endAngleBracesRead = 0;

bool isSkippingLine = false;
bool isSkippingSymbol = false;
while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext())
{
hasBytePreRead = false;
var currentByte = inputBytes.CurrentByte;
var c = (char) currentByte;

if (isSkippingLine)
{
if (ReadHelper.IsEndOfLine(c))
{
isSkippingLine = false;
continue;
}

continue;
}

ITokenizer tokenizer = null;
foreach (var customTokenizer in customTokenizers)
{
Expand All @@ -119,6 +141,12 @@ public bool MoveNext()
continue;
}

if (currentByte == (byte)'%' && isStream)
{
isSkippingLine = true;
continue;
}

// If we failed to read the symbol for whatever reason we pass over it.
if (isSkippingSymbol && c != '>')
{
Expand Down
6 changes: 5 additions & 1 deletion src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -846,7 +846,11 @@ private IReadOnlyList<ObjectToken> ParseObjectStream(StreamToken stream, long of
// Read the N integers
var bytes = new MemoryInputBytes(stream.Decode(filterProvider, this));

var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
var scanner = new CoreTokenScanner(
bytes,
true,
useLenientParsing: parsingOptions.UseLenientParsing,
isStream: true);

var objects = new List<(long, long)>();

Expand Down
Loading