Skip to content

Commit 0586713

Browse files
EliotJonesBobLd
authored and committed
Skip comments in PDF object streams #926
The file provided in issue #926 contains the following syntax in PDF object streams: ``` % 750 0 obj << >> ``` Currently we read the comment token and skip the rest; however, this producer is writing nonsense to the stream. Comment tokens are only valid outside streams in PDF files, so we align with the behavior of PDFBox here by skipping the entire line containing a comment inside a stream, which fixes parsing of this file.
1 parent 6261258 commit 0586713

File tree

2 files changed

+34
-2
lines changed

2 files changed

+34
-2
lines changed

src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,14 @@ public class CoreTokenScanner : ISeekableTokenScanner
4343

4444
private bool hasBytePreRead;
4545
private bool isInInlineImage;
46+
/// <summary>
47+
/// '%' only identifies comments outside of PDF streams and strings, inside these we should ignore it.
48+
/// </summary>
49+
/// <remarks>
50+
/// PDFBox skips all of a line following a comment character inside streams, see:
51+
/// https://github.com/apache/pdfbox/blob/0e1c42dace1c3a2631d5309f662de5628b80fda6/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java#L1319
52+
/// </remarks>
53+
private readonly bool isStream;
4654

4755
/// <summary>
4856
/// Create a new <see cref="CoreTokenScanner"/> from the input.
@@ -52,7 +60,8 @@ public CoreTokenScanner(
5260
bool usePdfDocEncoding,
5361
ScannerScope scope = ScannerScope.None,
5462
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null,
55-
bool useLenientParsing = false)
63+
bool useLenientParsing = false,
64+
bool isStream = false)
5665
{
5766
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
5867
this.usePdfDocEncoding = usePdfDocEncoding;
@@ -62,6 +71,7 @@ public CoreTokenScanner(
6271
this.scope = scope;
6372
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
6473
this.useLenientParsing = useLenientParsing;
74+
this.isStream = isStream;
6575
}
6676

6777
/// <inheritdoc />
@@ -94,13 +104,25 @@ public bool MoveNext()
94104
{
95105
var endAngleBracesRead = 0;
96106

107+
bool isSkippingLine = false;
97108
bool isSkippingSymbol = false;
98109
while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext())
99110
{
100111
hasBytePreRead = false;
101112
var currentByte = inputBytes.CurrentByte;
102113
var c = (char) currentByte;
103114

115+
if (isSkippingLine)
116+
{
117+
if (ReadHelper.IsEndOfLine(c))
118+
{
119+
isSkippingLine = false;
120+
continue;
121+
}
122+
123+
continue;
124+
}
125+
104126
ITokenizer tokenizer = null;
105127
foreach (var customTokenizer in customTokenizers)
106128
{
@@ -119,6 +141,12 @@ public bool MoveNext()
119141
continue;
120142
}
121143

144+
if (currentByte == (byte)'%' && isStream)
145+
{
146+
isSkippingLine = true;
147+
continue;
148+
}
149+
122150
// If we failed to read the symbol for whatever reason we pass over it.
123151
if (isSkippingSymbol && c != '>')
124152
{

src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -846,7 +846,11 @@ private IReadOnlyList<ObjectToken> ParseObjectStream(StreamToken stream, long of
846846
// Read the N integers
847847
var bytes = new MemoryInputBytes(stream.Decode(filterProvider, this));
848848

849-
var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
849+
var scanner = new CoreTokenScanner(
850+
bytes,
851+
true,
852+
useLenientParsing: parsingOptions.UseLenientParsing,
853+
isStream: true);
850854

851855
var objects = new List<(long, long)>();
852856

0 commit comments

Comments
 (0)