Skip to content

Commit 618e0ff

Browse files
committed
fall back to times-roman as standard 14 font when lenient
If parsing in lenient mode and encountering a malformed base font name (in this case 'helveticai'), we fall back to Times-Roman as the Adobe Font Metrics file for a Standard 14 font. This aligns with the behavior of PDFBox. We also throw a more informative error in non-lenient mode. This fixes document 0000086.pdf from the corpus.
1 parent 016b754 commit 618e0ff

File tree

2 files changed

+30
-7
lines changed

2 files changed

+30
-7
lines changed

src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,11 @@ private static PdfDocument OpenDocument(
168168
pdfScanner,
169169
parsingOptions);
170170

171-
var type1Handler = new Type1FontHandler(pdfScanner, filterProvider, encodingReader);
171+
var type1Handler = new Type1FontHandler(
172+
pdfScanner,
173+
filterProvider,
174+
encodingReader,
175+
parsingOptions.UseLenientParsing);
172176

173177
var trueTypeHandler = new TrueTypeFontHandler(parsingOptions.Logger,
174178
pdfScanner,

src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type1FontHandler.cs

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,18 @@ internal class Type1FontHandler : IFontHandler
2020
private readonly IPdfTokenScanner pdfScanner;
2121
private readonly ILookupFilterProvider filterProvider;
2222
private readonly IEncodingReader encodingReader;
23+
private readonly bool isLenientParsing;
2324

24-
public Type1FontHandler(IPdfTokenScanner pdfScanner, ILookupFilterProvider filterProvider,
25-
IEncodingReader encodingReader)
25+
public Type1FontHandler(
26+
IPdfTokenScanner pdfScanner,
27+
ILookupFilterProvider filterProvider,
28+
IEncodingReader encodingReader,
29+
bool isLenientParsing)
2630
{
2731
this.pdfScanner = pdfScanner;
2832
this.filterProvider = filterProvider;
2933
this.encodingReader = encodingReader;
34+
this.isLenientParsing = isLenientParsing;
3035
}
3136

3237
public IFont Generate(DictionaryToken dictionary)
@@ -69,12 +74,26 @@ public IFont Generate(DictionaryToken dictionary)
6974
widths = [];
7075
}
7176

72-
if (!dictionary.TryGet(NameToken.FontDescriptor, out var _))
77+
if (!dictionary.TryGet(NameToken.FontDescriptor, out _))
7378
{
74-
if (dictionary.TryGet(NameToken.BaseFont, out var baseFontToken) &&
75-
DirectObjectFinder.TryGet(baseFontToken, pdfScanner, out NameToken? baseFontName))
79+
if (dictionary.TryGet(NameToken.BaseFont, pdfScanner, out NameToken? baseFontToken))
7680
{
77-
var metrics = Standard14.GetAdobeFontMetrics(baseFontName.Data);
81+
var metrics = Standard14.GetAdobeFontMetrics(baseFontToken.Data);
82+
83+
if (metrics == null)
84+
{
85+
if (isLenientParsing)
86+
{
87+
// We can support a fallback here to return content.
88+
// https://github.com/apache/pdfbox/blob/f81c7c5a06126db68aa985a0e755cdbffed7d270/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/FontMapperImpl.java#L304
89+
metrics = Standard14.GetAdobeFontMetrics(Standard14Font.TimesRoman);
90+
}
91+
else
92+
{
93+
throw new PdfDocumentFormatException(
94+
$"Type 1 Standard 14 font with name {baseFontToken} requested, this is an invalid name.");
95+
}
96+
}
7897

7998
var overrideEncoding = encodingReader.Read(dictionary);
8099

0 commit comments

Comments
 (0)