UglyToad · BobLd · Jul 6, 2025 · Jul 5, 2025 · Jul 5, 2025
diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs
@@ -191,8 +191,76 @@ public void ScansStringWithWeirdWeirdDoubleSymbolNumerics()
             AssertCorrectToken<NameToken, string>(tokens[3], "F1");
             AssertCorrectToken<NumericToken, double>(tokens[4], 8);
             AssertCorrectToken<OperatorToken, string>(tokens[5], "Tf");
+        }
+
+        [Fact]
+        public void SkipsCommentsInStreams()
+        {
+            const string content = 
+                """
+                % 641 0 obj
+                <<
+                /Type /Encoding
+                /Differences [16/quotedblleft/quotedblright 21/endash 27/ff/fi/fl/ffi 39/quoteright/parenleft/parenright 43/plus/comma/hyphen/period/slash/zero/one/two/three/four/five/six/seven/eight/nine/colon 64/at/A/B/C/D/E/F/G/H/I/J/K/L/M/N/O/P/Q/R/S/T/U/V/W/X/Y/Z/bracketleft 93/bracketright 97/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y/z/braceleft 125/braceright 225/aacute 232/egrave/eacute 252/udieresis]
+                >>
+                % 315 0 obj
+                <<
+                /Type /Font
+                /Subtype /Type1
+                /BaseFont /IXNPPI+CMEX10
+                /FontDescriptor 661 0 R
+                /FirstChar 80
+                /LastChar 88
+                /Widths 644 0 R
+                /ToUnicode 699 0 R
+                >>
+                % 306 0 obj
+                <<
+                /Type /Font
+                /Subtype /Type1
+                /BaseFont /MSNKTF+CMMI10
+                /FontDescriptor 663 0 R
+                /FirstChar 58
+                /LastChar 119
+                /Widths 651 0 R
+                /ToUnicode 700 0 R
+                >>
+                """;
+            // The same bytes are scanned twice; only the isStream flag differs, and it decides what happens to the three '%' lines.
+            var tokens = new List<IToken>();
+            // Pass 1 - isStream: true. Each '% N 0 obj' line is expected to be skipped without producing a token.
+            var scanner = new CoreTokenScanner(
+                StringBytesTestConverter.Convert(content, false).Bytes,
+                true,
+                isStream: true);
+
+            while (scanner.MoveNext())
+            {
+                tokens.Add(scanner.CurrentToken);
+            }
+
+            Assert.Equal(3, tokens.Count); // one token per '<< ... >>' dictionary, none for the '%' lines
+
+            Assert.All(tokens, x => Assert.IsType<DictionaryToken>(x));
 
+            tokens.Clear();
+            // Pass 2 - isStream: false. The '%' lines are expected to surface as CommentToken instances.
+            var nonStreamScanner = new CoreTokenScanner(
+                StringBytesTestConverter.Convert(content, false).Bytes,
+                true,
+                isStream: false);
+
+            while (nonStreamScanner.MoveNext())
+            {
+                tokens.Add(nonStreamScanner.CurrentToken);
+            }
+
+            Assert.Equal(6, tokens.Count); // three comments plus three dictionaries
+
+            Assert.Equal(3, tokens.OfType<CommentToken>().Count());
+            Assert.Equal(3, tokens.OfType<DictionaryToken>().Count());
         }
+
         private static void AssertCorrectToken<T, TData>(IToken token, TData expected) where T : IDataToken<TData>
         {
             var cast = Assert.IsType<T>(token);

diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
@@ -43,6 +43,14 @@ public class CoreTokenScanner : ISeekableTokenScanner
 
         private bool hasBytePreRead;
         private bool isInInlineImage;
+        /// <summary>
+        /// '%' only identifies comments outside of PDF streams and strings; when this flag is set the scanner emits no comment token for '%' and instead skips the remainder of that line.
+        /// </summary>
+        /// <remarks>
+        /// PDFBox skips all of a line following a comment character inside streams, see:
+        /// https://github.com/apache/pdfbox/blob/0e1c42dace1c3a2631d5309f662de5628b80fda6/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java#L1319
+        /// </remarks>
+        private readonly bool isStream;
 
         /// <summary>
         /// Create a new <see cref="CoreTokenScanner"/> from the input.
@@ -52,7 +60,8 @@ public CoreTokenScanner(
             bool usePdfDocEncoding,
             ScannerScope scope = ScannerScope.None,
             IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null,
-            bool useLenientParsing = false)
+            bool useLenientParsing = false,
+            bool isStream = false)
         {
             this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
             this.usePdfDocEncoding = usePdfDocEncoding;
@@ -62,6 +71,7 @@ public CoreTokenScanner(
             this.scope = scope;
             this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
             this.useLenientParsing = useLenientParsing;
+            this.isStream = isStream;
         }
 
         /// <inheritdoc />
@@ -94,13 +104,25 @@ public bool MoveNext()
         {
             var endAngleBracesRead = 0;
 
+            bool isSkippingLine = false;
             bool isSkippingSymbol = false;
             while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext())
             {
                 hasBytePreRead = false;
                 var currentByte = inputBytes.CurrentByte;
                 var c = (char) currentByte;
 
+                if (isSkippingLine)
+                {
+                    if (ReadHelper.IsEndOfLine(c))
+                    {
+                        isSkippingLine = false;
+                        continue;
+                    }
+
+                    continue;
+                }
+
                 ITokenizer tokenizer = null;
                 foreach (var customTokenizer in customTokenizers)
                 {
@@ -119,6 +141,12 @@ public bool MoveNext()
                         continue;
                     }
 
+                    if (currentByte == (byte)'%' && isStream)
+                    {
+                        isSkippingLine = true;
+                        continue;
+                    }
+
                     // If we failed to read the symbol for whatever reason we pass over it.
                     if (isSkippingSymbol && c != '>')
                     {

diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -846,7 +846,11 @@ private IReadOnlyList<ObjectToken> ParseObjectStream(StreamToken stream, long of
             // Read the N integers
             var bytes = new MemoryInputBytes(stream.Decode(filterProvider, this));
 
-            var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
+            var scanner = new CoreTokenScanner(
+                bytes,
+                true,
+                useLenientParsing: parsingOptions.UseLenientParsing,
+                isStream: true);
 
             var objects = new List<(long, long)>();