Skip to content

Commit bb978c2

Browse files
fix: removing asserts that break parse-v2 (#55)
Signed-off-by: Peter Staar <[email protected]>
1 parent e59c904 commit bb978c2

File tree

12 files changed

+349
-148
lines changed

12 files changed

+349
-148
lines changed

poetry.lock

Lines changed: 5 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,7 @@ pytest = "^7.4.2"
6969

7070
[tool.poetry.group.visualisation.dependencies]
7171
pillow = "^10.4.0"
72+
tqdm = "^4.67.0"
7273

7374
[tool.poetry.scripts]
7475
docling-parse = "docling_parse.run:main"

src/v2/pdf_resources/page_font.h

Lines changed: 56 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -671,11 +671,19 @@ namespace pdflib
671671
if(subtype==TYPE_0 and utils::json::has(keys_0, json_font))
672672
{
673673
auto desc_fonts = utils::json::get(keys_0, json_font);
674-
assert(desc_fonts.size()==1);
674+
//assert(desc_fonts.size()==1);
675675

676-
desc_font = desc_fonts[0];
677-
678-
LOG_S(INFO) << "found the descendant font";// << desc_font.dump(2);
676+
if(desc_fonts.size()==1)
677+
{
678+
LOG_S(INFO) << "found the descendant font";// << desc_font.dump(2);
679+
desc_font = desc_fonts[0];
680+
}
681+
else
682+
{
683+
std::string message = "no descendant font!";
684+
LOG_S(ERROR) << message;
685+
throw std::logic_error(message);
686+
}
679687
}
680688
else if(subtype==TYPE_0)
681689
{
@@ -777,7 +785,7 @@ namespace pdflib
777785
}
778786
else if(utils::json::has(keys_1, json_font))
779787
{
780-
assert(subtype==TYPE_3);
788+
//assert(subtype==TYPE_3);
781789

782790
auto result = utils::json::get(keys_1, json_font);
783791

@@ -788,7 +796,7 @@ namespace pdflib
788796
}
789797
else if(utils::json::has(keys_1, desc_font))
790798
{
791-
assert(subtype==TYPE_3);
799+
//assert(subtype==TYPE_3);
792800

793801
auto result = utils::json::get(keys_1, desc_font);
794802

@@ -1102,8 +1110,8 @@ namespace pdflib
11021110
{
11031111
//LOG_S(INFO) << l << "\t" << ws[l].is_number() << "\t beg: " << ws[l].dump();
11041112

1105-
assert(l<ws.size());
1106-
1113+
//assert(l<ws.size());
1114+
11071115
beg = ws[l].get<int>();
11081116
l += 1;
11091117

@@ -1116,13 +1124,26 @@ namespace pdflib
11161124
{
11171125
//LOG_S(INFO) << l << "\t" << ws[l].is_number() << "\t end: " << ws[l].dump();
11181126

1119-
assert(l<ws.size());
1127+
//assert(l<ws.size());
1128+
1129+
if(l>=ws.size())
1130+
{
1131+
LOG_S(WARNING) << "index " << l << " is out of bounds " << ws.size();
1132+
continue;
1133+
}
1134+
11201135
end = ws[l].get<int>();
11211136
l += 1;
11221137

11231138
//LOG_S(INFO) << l << "\t" << ws[l].is_number() << "\t w: " << ws[l].dump();
11241139

1125-
assert(l<ws.size());
1140+
//assert(l<ws.size());
1141+
if(l>=ws.size())
1142+
{
1143+
LOG_S(WARNING) << "index " << l << " is out of bounds " << ws.size();
1144+
continue;
1145+
}
1146+
11261147
double w = ws[l].get<double>();
11271148
l += 1;
11281149

@@ -1136,7 +1157,13 @@ namespace pdflib
11361157
{
11371158
//LOG_S(INFO) << l << "\t" << ws[l].is_number() << "\t widths: " << ws[l].dump();
11381159

1139-
assert(l<ws.size());
1160+
//assert(l<ws.size());
1161+
if(l>=ws.size())
1162+
{
1163+
LOG_S(WARNING) << "index " << l << " is out of bounds " << ws.size();
1164+
continue;
1165+
}
1166+
11401167
std::vector<double> w = ws[l].get<std::vector<double> >();
11411168
l += 1;
11421169

@@ -1180,8 +1207,15 @@ namespace pdflib
11801207
}
11811208

11821209
auto qpdf_obj = qpdf_font.getKey("/ToUnicode");
1183-
assert(qpdf_obj.isStream());
1210+
//assert(qpdf_obj.isStream());
11841211

1212+
if(not qpdf_obj.isStream())
1213+
{
1214+
std::string message = "not qpdf_obj.isStream()";
1215+
LOG_S(ERROR) << message;
1216+
throw std::logic_error(message);
1217+
}
1218+
11851219
std::vector<qpdf_instruction> stream;
11861220

11871221
// decode the stream
@@ -1396,7 +1430,7 @@ namespace pdflib
13961430
else if(name_to_descr.count(name)==1 and
13971431
cmap_numb_to_char.count(numb)==0)
13981432
{
1399-
assert(subtype==TYPE_3);
1433+
//assert(subtype==TYPE_3);
14001434
14011435
LOG_S(WARNING) << "could not resolve the character (name="<<name
14021436
<<", numb="<<numb<<") for TYPE_3 font:" << font_name;
@@ -1462,7 +1496,7 @@ namespace pdflib
14621496

14631497
if(utils::json::has(keys, json_font))
14641498
{
1465-
assert(subtype==TYPE_3);
1499+
//assert(subtype==TYPE_3);
14661500

14671501
QPDFObjectHandle qpdf_char_procs = qpdf_font.getKey(keys.front());
14681502
LOG_S(WARNING) << "found CharProcs: " << qpdf_char_procs.getTypeName();
@@ -1478,8 +1512,14 @@ namespace pdflib
14781512
QPDFObjectHandle qpdf_char_proc = qpdf_char_procs.getKey(key);
14791513
//LOG_S(INFO) << "decoding: " << key << " -> " << qpdf_char_proc.getTypeName();
14801514

1481-
assert(qpdf_char_proc.isStream());
1482-
1515+
//assert(qpdf_char_proc.isStream());
1516+
if(not qpdf_char_proc.isStream())
1517+
{
1518+
std::string message = "not qpdf_obj.isStream()";
1519+
LOG_S(ERROR) << message;
1520+
throw std::logic_error(message);
1521+
}
1522+
14831523
std::vector<qpdf_instruction> stream={};
14841524

14851525
// decode the stream

0 commit comments

Comments
 (0)