Skip to content

Commit 1815e7d

Browse files
fix: Robustify parser v2 (#49)
* avoiding fatals

  Signed-off-by: Peter Staar <[email protected]>

* ran entire DLN-v1 with only two errors

  Signed-off-by: Peter Staar <[email protected]>

* added a few more safeguards

  Signed-off-by: Peter Staar <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
1 parent 09224e4 commit 1815e7d

File tree

10 files changed

+130
-28
lines changed

10 files changed

+130
-28
lines changed

src/v2/pdf_resources/page_font.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -688,7 +688,7 @@ namespace pdflib
688688
else
689689
{
690690
subtype=NULL_TYPE;
691-
LOG_S(FATAL) << "could not find subtype in font: " << json_font.dump(2);
691+
LOG_S(ERROR) << "could not find subtype in font: " << json_font.dump(2);
692692
}
693693
}
694694

@@ -1050,15 +1050,21 @@ namespace pdflib
10501050

10511051
if(values.size()!=(lchar-fchar+1))
10521052
{
1053-
LOG_S(FATAL) << "values.size()!=(lchar-fchar+1) -> "
1053+
LOG_S(ERROR) << "values.size()!=(lchar-fchar+1) -> "
10541054
<< values.size() << "!=" << lchar << "-" << fchar << "+1";
10551055
}
10561056

10571057
int cnt=0;
10581058
for(int ind=fchar; ind<=lchar; ind++)
10591059
{
1060+
if(cnt>=values.size())
1061+
{
1062+
LOG_S(ERROR) << "going out of bounds with " << cnt << " >= " << values.size();
1063+
continue;
1064+
}
1065+
10601066
numb_to_widths[ind] = values[cnt++];
1061-
LOG_S(INFO) << "index: " << ind << " -> width: " << numb_to_widths.at(ind);
1067+
//LOG_S(INFO) << "index: " << ind << " -> width: " << numb_to_widths.at(ind);
10621068
}
10631069
}
10641070

src/v2/pdf_resources/page_font/base_fonts.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,15 @@ namespace pdflib
4343

4444
private:
4545

46+
bool initialized;
47+
4648
std::set<std::string> core_14_fonts;
4749

4850
std::map<std::string, base_font_type> name_to_basefont;
4951
};
5052

51-
base_fonts::base_fonts()
53+
base_fonts::base_fonts():
54+
initialized(false)
5255
{}
5356

5457
base_fonts::~base_fonts()
@@ -159,6 +162,12 @@ namespace pdflib
159162
template<typename glyphs_type>
160163
void base_fonts::initialise(std::string dirname, glyphs_type& glyphs)
161164
{
165+
if(initialized)
166+
{
167+
LOG_S(WARNING) << "skipping base_fonts::initialise, already initialized ...";
168+
return;
169+
}
170+
162171
std::vector<std::string> standard = utils::filesystem::list_files(dirname+"/standard");
163172
std::sort(standard.begin(), standard.end());
164173

@@ -217,7 +226,9 @@ namespace pdflib
217226
{
218227
//LOG_S(WARNING) << "\t font-name (=" << fontname << ") already read";
219228
}
220-
}
229+
}
230+
231+
initialized = true;
221232
}
222233

223234
std::string base_fonts::read_fontname(std::string filename)

src/v2/pdf_resources/page_font/cmap.h

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ namespace pdflib
504504

505505
if(_map.count(begin+i)==1)
506506
{
507-
LOG_S(FATAL) << "overwriting number c=" << begin+i;
507+
LOG_S(ERROR) << "overwriting number c=" << begin+i;
508508
}
509509

510510
_map[begin + i] = tgt.at(i);
@@ -514,18 +514,43 @@ namespace pdflib
514514
void cmap_parser::parse_beginbfchar(std::vector<qpdf_instruction>& parameters)
515515
{
516516
LOG_S(INFO) << __FUNCTION__;
517-
assert(parameters.size()==1);
517+
//assert(parameters.size()==1);
518518

519-
char_count = parameters[0].to_int();
519+
if(parameters.size()==1)
520+
{
521+
char_count = parameters[0].to_int();
522+
}
523+
else if(parameters.size()>0)
524+
{
525+
LOG_S(WARNING) << "parameters.size()>0 for parse_beginbfchar";
526+
char_count = parameters[0].to_int();
527+
}
528+
else
529+
{
530+
LOG_S(ERROR) << "parameters.size()!=1 for parse_beginbfchar";
531+
}
520532
}
521533

522534
void cmap_parser::parse_endbfchar(std::vector<qpdf_instruction>& parameters)
523535
{
524536
LOG_S(INFO) << __FUNCTION__ << ": starting ...";
525-
assert(parameters.size()==2*char_count);
526537

538+
if(parameters.size()!=2*char_count)
539+
{
540+
LOG_S(WARNING) << "parameters.size()!=2*char_count -> "
541+
<< "parameters: " << parameters.size() << ", "
542+
<< "char_count: " << char_count;
543+
}
544+
//assert(parameters.size()==2*char_count);
545+
527546
for(size_t i=0; i<char_count; i++)
528547
{
548+
if(2*i>=parameters.size())
549+
{
550+
LOG_S(ERROR) << "going out of bounds: skipping parse_endbfchar";
551+
continue;
552+
}
553+
529554
QPDFObjectHandle source_ = parameters[2*i+0].obj;
530555
QPDFObjectHandle target_ = parameters[2*i+1].obj;
531556

src/v2/pdf_resources/page_font/encodings.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,13 @@ namespace pdflib
2121

2222
private:
2323

24+
bool initialized;
25+
2426
std::map<font_encoding_name, font_encoding> name_to_encoding;
2527
};
2628

27-
font_encodings::font_encodings()
29+
font_encodings::font_encodings():
30+
initialized(false)
2831
{}
2932

3033
font_encodings::~font_encodings()
@@ -38,6 +41,12 @@ namespace pdflib
3841
template<typename glyphs_type>
3942
void font_encodings::initialise(std::string dirname, glyphs_type& glyphs)
4043
{
44+
if(initialized)
45+
{
46+
LOG_S(WARNING) << "skipping font_encodings::initialise, already initialized ...";
47+
return;
48+
}
49+
4150
std::vector<std::pair<font_encoding_name, std::string> > items = {
4251
{STANDARD, "std.dat"},
4352
{MACROMAN, "macroman.dat"},
@@ -50,6 +59,8 @@ namespace pdflib
5059
font_encoding& encoding = name_to_encoding[item.first];
5160
encoding.initialise(item.first, dirname+"/"+item.second, glyphs);
5261
}
62+
63+
initialized = true;
5364
}
5465

5566
}

src/v2/pdf_resources/page_font/font_cids.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ namespace pdflib
3333

3434
private:
3535

36+
bool initialized;
3637
std::string directory;
3738

3839
std::map<std::string, int> ro_2_sup;
@@ -45,7 +46,8 @@ namespace pdflib
4546
std::map<std::string, font_cid> cids;
4647
};
4748

48-
font_cids::font_cids()
49+
font_cids::font_cids():
50+
initialized(false)
4951
{}
5052

5153
font_cids::~font_cids()
@@ -84,7 +86,13 @@ namespace pdflib
8486

8587
void font_cids::initialise(std::string dirname)
8688
{
87-
LOG_S(INFO) << __FUNCTION__;
89+
if(initialized)
90+
{
91+
LOG_S(WARNING) << "skipping font_cids::initialise, already initialized ...";
92+
return;
93+
}
94+
95+
LOG_S(INFO) << "initialise font_cids";
8896

8997
directory = dirname;
9098
directory += (directory.back()=='/'? "" : "/");
@@ -126,6 +134,8 @@ namespace pdflib
126134
cmap_2_filename[file] = cdir+"/"+file;
127135
}
128136
}
137+
138+
initialized = true;
129139
}
130140

131141
bool font_cids::decode_cmap_resource(std::string cmap_name)

src/v2/pdf_resources/page_font/glyphs.h

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ namespace pdflib
2828
std::string operator[](std::string key);
2929

3030
void initialise(std::string dirname);
31-
31+
3232
private:
3333

3434
void read_file_hex(std::string filename);
@@ -39,13 +39,16 @@ namespace pdflib
3939

4040
private:
4141

42+
bool initialized;
43+
4244
std::set<std::string> unknown_glyphs;
4345

4446
std::map<std::string, std::string> name_to_code;
4547
std::map<std::string, std::string> name_to_utf8;
4648
};
4749

48-
font_glyphs::font_glyphs()
50+
font_glyphs::font_glyphs():
51+
initialized(false)
4952
{}
5053

5154
font_glyphs::~font_glyphs()
@@ -103,6 +106,12 @@ namespace pdflib
103106

104107
void font_glyphs::initialise(std::string dirname)
105108
{
109+
if(initialized)
110+
{
111+
LOG_S(WARNING) << "skipping font_glyphs::initialise, already initialized ...";
112+
return;
113+
}
114+
106115
LOG_S(INFO) << "font-glyphs initialise from directory: "
107116
<< dirname;
108117

@@ -116,7 +125,7 @@ namespace pdflib
116125
"/custom/MathematicalPi/MathematicalPi.hex.dat"
117126
};
118127

119-
for(auto path : paths_hex)
128+
for(auto path:paths_hex)
120129
{
121130
std::string fpath = dirname + path;
122131
read_file_hex(fpath);
@@ -126,11 +135,13 @@ namespace pdflib
126135
"/custom/MathematicalPi/MathematicalPi.uni.dat"
127136
};
128137

129-
for(auto path : paths_uni)
138+
for(auto path:paths_uni)
130139
{
131140
std::string fpath = dirname + path;
132141
read_file_uni(fpath);
133142
}
143+
144+
initialized = true;
134145
}
135146

136147
void font_glyphs::read_file_hex(std::string filename)
@@ -165,7 +176,7 @@ namespace pdflib
165176
{
166177
name_to_utf8[key] = utils::string::hex_to_utf8(val_, 4);
167178
}
168-
else if(name_to_utf8.count(key)==1)
179+
else if(name_to_utf8.count(key)==1) // already present
169180
{
170181
LOG_S(ERROR) << "key [" << key << "] is defined twice";
171182
}

src/v2/pdf_states/grph.h

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ namespace pdflib
7272
int line_cap;
7373
int line_join;
7474

75-
int dash_phase;
75+
double dash_phase;
7676
std::vector<double> dash_array;
7777

7878
double flatness;
@@ -213,8 +213,20 @@ namespace pdflib
213213
dash_array.push_back(val);
214214
}
215215

216-
assert(instructions[1].is_integer());
217-
dash_phase = instructions[1].to_int();
216+
if(instructions[1].is_integer())
217+
{
218+
dash_phase = instructions[1].to_int();
219+
}
220+
else if(instructions[1].is_number())
221+
{
222+
dash_phase = instructions[1].to_double();
223+
}
224+
else
225+
{
226+
dash_phase = 0;
227+
LOG_S(ERROR) << "failed instructions[1] with is_integer() and is_number"
228+
<< instructions[1].unparse();
229+
}
218230
}
219231

220232
void pdf_state<GRPH>::ri(std::vector<qpdf_instruction>& instructions)
@@ -227,8 +239,16 @@ namespace pdflib
227239
//assert(instructions.size()==1);
228240
if(not verify(instructions, 1, __FUNCTION__) ) { return; }
229241

230-
assert(instructions[0].is_number());
231-
flatness = instructions[0].to_double();
242+
if(instructions[0].is_number())
243+
{
244+
flatness = instructions[0].to_double();
245+
}
246+
else
247+
{
248+
flatness = 0;
249+
LOG_S(ERROR) << "failed instructions[0].is_number(): "
250+
<< instructions[0].unparse();
251+
}
232252
}
233253

234254
void pdf_state<GRPH>::gs(std::vector<qpdf_instruction>& instructions)

src/v2/pdf_states/line.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,13 @@ namespace pdflib
518518

519519
// first close
520520
auto& line = curr_lines.back();
521+
522+
if(line.size()==0)
523+
{
524+
LOG_S(WARNING) << "applying 'h' on empty line";
525+
return;
526+
}
527+
521528
std::pair<double, double> coor = line.front();
522529

523530
line.append(coor.first, coor.second);

src/v2/pdf_states/text.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ namespace pdflib
260260
}
261261
else
262262
{
263-
LOG_S(FATAL) << "unknown page-font: " << font_name;
263+
LOG_S(ERROR) << "unknown page-font: " << font_name;
264264
}
265265
}
266266

@@ -313,8 +313,9 @@ namespace pdflib
313313
}
314314
else
315315
{
316-
LOG_S(FATAL) << "item is not a string nor a value: "
317-
<< item.unparse() << " [" << item.getTypeName() << "]";
316+
LOG_S(ERROR) << "item is not a string nor a value: "
317+
<< item.unparse() << " [" << item.getTypeName() << "]"
318+
<< " -> skipping for now ...";
318319
}
319320
}
320321
}

src/v2/qpdf/to_json.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ namespace pdflib
1616
// FIXME: add a begin time to cap the max time spent in this routine
1717
nlohmann::json to_json(QPDFObjectHandle obj, std::set<std::string> prev_objs={}, int level=0)
1818
{
19-
//const static int max_level=32;
20-
const static int max_level=128;
19+
const static int max_level=32;
20+
//const static int max_level=128;
2121

2222
LOG_S(INFO) << "to_json (level=" << level << "): " << prev_objs.size();
2323

@@ -47,7 +47,7 @@ namespace pdflib
4747

4848
if(level<max_level)
4949
{
50-
const static std::set<std::string> keys_to_be_skipped = {"/Parent", "/P", "/Annots"};
50+
const static std::set<std::string> keys_to_be_skipped = {"/Parent", "/P", "/Annots", "/B"};
5151

5252
if(obj.isDictionary())
5353
{

0 commit comments

Comments
 (0)