Skip to content

Commit cd15d00

Browse files
fix: Replace all the FATAL with ERROR messages in the v2 parser (#53)
* updated the visualize script
  Signed-off-by: Peter Staar <[email protected]>
* replaced all the fatals with errors
  Signed-off-by: Peter Staar <[email protected]>
* reformatted the python code
  Signed-off-by: Peter Staar <[email protected]>
---------
Signed-off-by: Peter Staar <[email protected]>
1 parent 004df07 commit cd15d00

File tree

14 files changed

+186
-69
lines changed

14 files changed

+186
-69
lines changed

docling_parse/visualize.py

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ def parse_args():
5353
help="Enable interactive mode (default: False)",
5454
)
5555

56+
# Add an optional boolean argument for printing the text of the parsed cells
57+
parser.add_argument(
58+
"--display-text",
59+
action="store_true",
60+
help="Enable interactive mode (default: False)",
61+
)
62+
5663
# Add an argument for the output directory, defaulting to "./tmp"
5764
parser.add_argument(
5865
"-o",
@@ -91,11 +98,17 @@ def parse_args():
9198
args.interactive,
9299
args.output_dir,
93100
int(args.page),
101+
args.display_text,
94102
)
95103

96104

97105
def visualise_v1(
98-
log_level: str, pdf_path: str, interactive: str, output_dir: str, page_num: int
106+
log_level: str,
107+
pdf_path: str,
108+
interactive: str,
109+
output_dir: str,
110+
page_num: int,
111+
display_text: bool,
99112
):
100113

101114
parser = pdf_parser_v1()
@@ -200,7 +213,12 @@ def visualise_v1(
200213

201214

202215
def visualise_v2(
203-
log_level: str, pdf_path: str, interactive: str, output_dir: str, page_num: int
216+
log_level: str,
217+
pdf_path: str,
218+
interactive: str,
219+
output_dir: str,
220+
page_num: int,
221+
display_text: bool,
204222
):
205223

206224
parser = pdf_parser_v2(log_level)
@@ -214,10 +232,17 @@ def visualise_v2(
214232

215233
doc = None
216234

217-
if page_num == -1:
218-
doc = parser.parse_pdf_from_key(doc_key)
219-
else:
220-
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num)
235+
try:
236+
if page_num == -1:
237+
doc = parser.parse_pdf_from_key(doc_key)
238+
else:
239+
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num)
240+
except Exception as exc:
241+
print(f"Could not parse pdf-document: {exc}")
242+
doc = None
243+
244+
if doc == None:
245+
return
221246

222247
parser.unload_document(doc_key)
223248

@@ -295,6 +320,9 @@ def visualise_v2(
295320
(x[3], H - y[3]),
296321
]
297322

323+
if display_text:
324+
print(row[cells_header.index("text")])
325+
298326
if "glyph" in row[cells_header.index("text")]:
299327
print(f" skip cell -> {row}")
300328
continue
@@ -328,12 +356,12 @@ def visualise_v2(
328356

329357
def main():
330358

331-
log_level, version, pdf, interactive, output_dir, page = parse_args()
359+
log_level, version, pdf, interactive, output_dir, page, display_text = parse_args()
332360

333361
if version == "v1":
334-
visualise_v1(log_level, pdf, interactive, output_dir, page)
362+
visualise_v1(log_level, pdf, interactive, output_dir, page, display_text)
335363
elif version == "v2":
336-
visualise_v2(log_level, pdf, interactive, output_dir, page)
364+
visualise_v2(log_level, pdf, interactive, output_dir, page, display_text)
337365
else:
338366
return -1
339367

src/v2/enums.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ namespace pdflib
3636
else if (name=="CID_FONT_TYPE_2" or name=="/CIDFontType2") { return CID_FONT_TYPE_2; }
3737
else
3838
{
39-
LOG_S(FATAL) << "unknown subtype " << name;
39+
LOG_S(ERROR) << "unknown subtype " << name;
4040
return NULL_TYPE;
4141
}
4242
}
@@ -59,7 +59,7 @@ namespace pdflib
5959

6060
default:
6161
{
62-
LOG_S(FATAL) << "encountered a NULL_ENCODING";
62+
LOG_S(ERROR) << "encountered a NULL_ENCODING";
6363
return "NULL_ENCODING";
6464
}
6565
}
@@ -90,7 +90,7 @@ namespace pdflib
9090
else if(name=="CMAP_RESOURCES" ) { return CMAP_RESOURCES; }
9191
else
9292
{
93-
LOG_S(FATAL) << __FILE__ << ":" << __LINE__ << " --> unknown encoding " << name;
93+
LOG_S(ERROR) << __FILE__ << ":" << __LINE__ << " --> unknown encoding " << name;
9494
return NULL_ENCODING;
9595
}
9696
}
@@ -109,7 +109,7 @@ namespace pdflib
109109

110110
default:
111111
{
112-
LOG_S(FATAL) << "encountered a NULL_ENCODING";
112+
LOG_S(ERROR) << "encountered a NULL_ENCODING";
113113
return "NULL_ENCODING";
114114
}
115115
}

src/v2/pdf_decoders/stream.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,11 @@ namespace pdflib
253253
{
254254
if(stack.size()==0)
255255
{
256-
LOG_S(FATAL) << "stack-size is zero!";
256+
std::stringstream message;
257+
message << "stack-size is zero in " << __FILE__ << ":" << __LINE__;
258+
259+
LOG_S(ERROR) << message.str();
260+
throw std::logic_error(message.str());
257261
}
258262

259263
pdf_state<GLOBAL>& state = stack.back();

src/v2/pdf_resources/page_dimension.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ namespace pdflib
110110
}
111111
else
112112
{
113-
LOG_S(FATAL) << "The page is missing the required '/MediaBox'";
113+
LOG_S(ERROR) << "The page is missing the required '/MediaBox'";
114114
}
115115

116116
if(json_resources.count("/CropBox"))
@@ -173,8 +173,12 @@ namespace pdflib
173173
}
174174
else
175175
{
176-
LOG_S(FATAL) << "could not find the page-dimensions: "
177-
<< json_resources.dump(4);
176+
std::stringstream ss;
177+
ss << "could not find the page-dimensions: "
178+
<< json_resources.dump(4);
179+
180+
LOG_S(ERROR) << ss.str();
181+
throw std::logic_error(ss.str());
178182
}
179183
}
180184

src/v2/pdf_resources/page_font.h

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,9 @@ namespace pdflib
180180
}
181181
else
182182
{
183-
LOG_S(FATAL) << "no existing pdf_resources_dir: "
184-
<< pdf_resources_dir;
183+
std::string message = "no existing pdf_resources_dir: " + pdf_resources_dir;
184+
LOG_S(ERROR) << message;
185+
throw std::logic_error(message);
185186
}
186187

187188
utils::timer timer;
@@ -1148,7 +1149,11 @@ namespace pdflib
11481149
}
11491150
else
11501151
{
1151-
LOG_S(FATAL) << "unknown type in " << __FUNCTION__;
1152+
std::stringstream message;
1153+
message << "unknown type in " << __FUNCTION__;
1154+
1155+
LOG_S(ERROR) << message.str();
1156+
throw std::logic_error(message.str());
11521157
}
11531158
}
11541159
}
@@ -1166,7 +1171,12 @@ namespace pdflib
11661171
if(not qpdf_font.hasKey("/ToUnicode"))
11671172
{
11681173
auto tmp = to_json(qpdf_font);
1169-
LOG_S(FATAL) << "qpdf-font: " << tmp.dump();
1174+
1175+
std::stringstream ss;
1176+
ss << "qpdf-font: " << tmp.dump();
1177+
1178+
LOG_S(ERROR) << ss.str();
1179+
throw std::logic_error(ss.str());
11701180
}
11711181

11721182
auto qpdf_obj = qpdf_font.getKey("/ToUnicode");

src/v2/pdf_resources/page_font/base_font.h

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,15 @@ namespace pdflib
157157
return bbox[3];
158158
}
159159

160-
LOG_S(FATAL) << "properties does not have key 'Ascender': "
161-
<< properties.dump(2);
162-
160+
{
161+
std::stringstream ss;
162+
ss << "properties does not have key 'Ascender': "
163+
<< properties.dump(2);
164+
165+
LOG_S(ERROR) << ss.str();
166+
throw std::logic_error(ss.str());
167+
}
168+
163169
return -1.;
164170
}
165171

@@ -179,9 +185,15 @@ namespace pdflib
179185
return bbox[1];
180186
}
181187

182-
LOG_S(FATAL) << "properties does not have key 'Descender': "
183-
<< properties.dump(2);
188+
{
189+
std::stringstream ss;
190+
ss << "properties does not have key 'Descender': "
191+
<< properties.dump(2);
184192

193+
LOG_S(ERROR) << ss.str();
194+
throw std::logic_error(ss.str());
195+
}
196+
185197
return -1.;
186198
}
187199

@@ -194,9 +206,15 @@ namespace pdflib
194206
return properties["FontBBox"].get<std::array<double, 4> >();
195207
}
196208

197-
LOG_S(FATAL) << "properties does not have key 'FontBBox': "
198-
<< properties.dump(2);
199-
209+
{
210+
std::stringstream ss;
211+
ss << "properties does not have key 'FontBBox': "
212+
<< properties.dump(2);
213+
214+
LOG_S(ERROR) << ss.str();
215+
throw std::logic_error(ss.str());
216+
}
217+
200218
return {0.0, 0.0, 0.0, 0.0};
201219
}
202220

src/v2/pdf_resources/page_font/base_fonts.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,6 @@ namespace pdflib
121121
{
122122
if(norm_name.find(itr->first)!=std::string::npos)
123123
{
124-
//return itr->first;
125-
126124
// we have to be careful that "Helvetica" is not returned for Helvetica-Bold!
127125
if(result.size()<(itr->first).size())
128126
{
@@ -136,8 +134,8 @@ namespace pdflib
136134
return result;
137135
}
138136

139-
LOG_S(FATAL) << "unkown " << font_name << "[norm_name=" << norm_name << "]";
140-
137+
LOG_S(ERROR) << "unkown " << font_name << "[norm_name=" << norm_name << "]";
138+
141139
return "Unknown";
142140
}
143141

@@ -269,7 +267,11 @@ namespace pdflib
269267

270268
if(fontname=="unknown")
271269
{
272-
LOG_S(FATAL) << "no FontName found in " << filename;
270+
std::stringstream ss;
271+
ss << "no FontName found in " << filename;
272+
273+
LOG_S(ERROR) << ss.str();
274+
throw std::logic_error(ss.str());
273275
}
274276

275277
return fontname;

src/v2/pdf_resources/page_font/cmap.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,7 @@ namespace pdflib
434434

435435
if(_map.count(begin+i)==1)
436436
{
437-
LOG_S(FATAL) << "overwriting number c=" << begin+i;
437+
LOG_S(WARNING) << "overwriting number c=" << begin+i;
438438
}
439439

440440
_map[begin + i] = tmp;

src/v2/pdf_resources/page_font/font_cid.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ namespace pdflib
8787
if(file.fail())
8888
{
8989
LOG_S(ERROR) << "filename does not exists: " << filename;
90-
LOG_S(FATAL) << "unknown data-file!";
90+
91+
9192
}
9293

9394
bool cmap=false;
@@ -143,8 +144,11 @@ namespace pdflib
143144

144145
if(file.fail())
145146
{
146-
LOG_S(ERROR) << "filename does not exists: " << filename;
147-
LOG_S(FATAL) << "unknown data-file!";
147+
std::stringstream ss;
148+
ss << "filename does not exists: " << filename;
149+
150+
LOG_S(ERROR) << ss.str();
151+
throw std::logic_error(ss.str());
148152
}
149153

150154
std::vector<int> col_inds = {};
@@ -246,7 +250,7 @@ namespace pdflib
246250
}
247251
else
248252
{
249-
LOG_S(FATAL) << "we should never arrive here!";
253+
LOG_S(ERROR) << "all options exhausted for " << __FUNCTION__;
250254
}
251255
}
252256

src/v2/pdf_resources/page_font/glyphs.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,11 @@ namespace pdflib
152152

153153
if(file.fail())
154154
{
155-
LOG_S(ERROR) << "filename does not exists: " << filename;
156-
LOG_S(FATAL) << "unknown data-file!";
155+
std::stringstream ss;
156+
ss << "filename does not exists: " << filename;
157+
158+
LOG_S(ERROR) << ss.str();
159+
throw std::logic_error(ss.str());
157160
}
158161

159162
std::string line;
@@ -218,8 +221,11 @@ namespace pdflib
218221

219222
if(file.fail())
220223
{
221-
LOG_S(ERROR) << "filename does not exists: " << filename;
222-
LOG_S(FATAL) << "unknown data-file!";
224+
std::stringstream ss;
225+
ss << "filename does not exists: " << filename;
226+
227+
LOG_S(ERROR) << ss.str();
228+
throw std::logic_error(ss.str());
223229
}
224230

225231
std::string line;

0 commit comments

Comments
 (0)