diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs index 89682804e..da16e242b 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs @@ -191,8 +191,76 @@ public void ScansStringWithWeirdWeirdDoubleSymbolNumerics() AssertCorrectToken(tokens[3], "F1"); AssertCorrectToken(tokens[4], 8); AssertCorrectToken(tokens[5], "Tf"); + } + + [Fact] + public void SkipsCommentsInStreams() + { + const string content = + """ + % 641 0 obj + << + /Type /Encoding + /Differences [16/quotedblleft/quotedblright 21/endash 27/ff/fi/fl/ffi 39/quoteright/parenleft/parenright 43/plus/comma/hyphen/period/slash/zero/one/two/three/four/five/six/seven/eight/nine/colon 64/at/A/B/C/D/E/F/G/H/I/J/K/L/M/N/O/P/Q/R/S/T/U/V/W/X/Y/Z/bracketleft 93/bracketright 97/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/braceleft 125/braceright 225/aacute 232/egrave/eacute 252/udieresis] + >> + % 315 0 obj + << + /Type /Font + /Subtype /Type1 + /BaseFont /IXNPPI+CMEX10 + /FontDescriptor 661 0 R + /FirstChar 80 + /LastChar 88 + /Widths 644 0 R + /ToUnicode 699 0 R + >> + % 306 0 obj + << + /Type /Font + /Subtype /Type1 + /BaseFont /MSNKTF+CMMI10 + /FontDescriptor 663 0 R + /FirstChar 58 + /LastChar 119 + /Widths 651 0 R + /ToUnicode 700 0 R + >> + """; + + var tokens = new List(); + + var scanner = new CoreTokenScanner( + StringBytesTestConverter.Convert(content, false).Bytes, + true, + isStream: true); + + while (scanner.MoveNext()) + { + tokens.Add(scanner.CurrentToken); + } + + Assert.Equal(3, tokens.Count); + + Assert.All(tokens, x => Assert.IsType(x)); + tokens.Clear(); + + var nonStreamScanner = new CoreTokenScanner( + StringBytesTestConverter.Convert(content, false).Bytes, + true, + isStream: false); + + while (nonStreamScanner.MoveNext()) + { + tokens.Add(nonStreamScanner.CurrentToken); + } 
+ + Assert.Equal(6, tokens.Count); + + Assert.Equal(3, tokens.OfType().Count()); + Assert.Equal(3, tokens.OfType().Count()); } + private static void AssertCorrectToken(IToken token, TData expected) where T : IDataToken { var cast = Assert.IsType(token); diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs index 32040fac0..303bfe162 100644 --- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs +++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs @@ -43,6 +43,14 @@ public class CoreTokenScanner : ISeekableTokenScanner private bool hasBytePreRead; private bool isInInlineImage; + /// + /// '%' only identifies comments outside of PDF streams and strings. When this flag is true, a '%' reached by the scanning loop in MoveNext causes every following byte, through the first end-of-line character, to be skipped. + /// + /// + /// PDFBox skips all of a line following a comment character inside streams, see: + /// https://github.com/apache/pdfbox/blob/0e1c42dace1c3a2631d5309f662de5628b80fda6/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java#L1319 + /// + private readonly bool isStream; /// /// Create a new from the input. @@ -52,7 +60,8 @@ public CoreTokenScanner( bool usePdfDocEncoding, ScannerScope scope = ScannerScope.None, IReadOnlyDictionary> namedDictionaryRequiredKeys = null, - bool useLenientParsing = false) + bool useLenientParsing = false, + bool isStream = false) { this.inputBytes = inputBytes ?? 
throw new ArgumentNullException(nameof(inputBytes)); this.usePdfDocEncoding = usePdfDocEncoding; @@ -62,6 +71,7 @@ public CoreTokenScanner( this.scope = scope; this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys; this.useLenientParsing = useLenientParsing; + this.isStream = isStream; } /// @@ -94,6 +104,7 @@ public bool MoveNext() { var endAngleBracesRead = 0; + bool isSkippingLine = false; bool isSkippingSymbol = false; while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext()) { @@ -101,6 +112,17 @@ public bool MoveNext() var currentByte = inputBytes.CurrentByte; var c = (char) currentByte; + if (isSkippingLine) + { + if (ReadHelper.IsEndOfLine(c)) + { + isSkippingLine = false; + continue; + } + + continue; + } + ITokenizer tokenizer = null; foreach (var customTokenizer in customTokenizers) { @@ -119,6 +141,12 @@ public bool MoveNext() continue; } + if (currentByte == (byte)'%' && isStream) + { + isSkippingLine = true; + continue; + } + // If we failed to read the symbol for whatever reason we pass over it. if (isSkippingSymbol && c != '>') { diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index 16f057072..4bfd802d2 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -846,7 +846,11 @@ private IReadOnlyList ParseObjectStream(StreamToken stream, long of // Read the N integers var bytes = new MemoryInputBytes(stream.Decode(filterProvider, this)); - var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing); + var scanner = new CoreTokenScanner( + bytes, + true, + useLenientParsing: parsingOptions.UseLenientParsing, + isStream: true); var objects = new List<(long, long)>();