Skip to content

Commit 31658ca

Browse files
EliotJonesBobLd
authored and committed
allow reading to continue if encountering an invalid surrogate pair
While investigating the corpus at https://digitalcorpora.s3.amazonaws.com/s3_browser.html#corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/ we found that the input file 0000000.pdf contained a UTF-16 surrogate pair in an input defined as UCS-2. Parsers handle this differently: Adobe Acrobat seems to hard crash, PDF.js returns the same text we now parse, and Chrome parses the intended text (2 invalid characters and "ib exam"). We don't care too much about matching Chrome exactly, so doing the same as Firefox (which uses PDF.js) is fine here.
1 parent 1021729 commit 31658ca

File tree

1 file changed

+16
-1
lines changed
  • src/UglyToad.PdfPig/PdfFonts/Cmap

1 file changed

+16
-1
lines changed

src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,8 @@ public override string ToString()
147147

148148
public int ReadCode(IInputBytes bytes, bool useLenientParsing)
149149
{
150+
var myPosition = bytes.CurrentOffset;
151+
150152
if (hasEmptyCodespace)
151153
{
152154
var data = new byte[minCodeLength];
@@ -184,7 +186,20 @@ public int ReadCode(IInputBytes bytes, bool useLenientParsing)
184186
}
185187
}
186188

187-
throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}.");
189+
// If we encounter invalid inputs we read min bytes and convert directly to an integer.
190+
if (useLenientParsing)
191+
{
192+
bytes.Seek(myPosition);
193+
for (var i = 0; i < minCodeLength; i++)
194+
{
195+
result[i] = ReadByte(bytes, useLenientParsing);
196+
}
197+
198+
// https://github.com/apache/pdfbox/blob/f81c7c5a06126db68aa985a0e755cdbffed7d270/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java#L207
199+
return ByteArrayToInt(result.AsSpan(0, minCodeLength));
200+
}
201+
202+
throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}. Bytes: {BitConverter.ToString(result)}.");
188203
}
189204

190205
private static byte ReadByte(IInputBytes bytes, bool useLenientParsing)

0 commit comments

Comments
 (0)