Skip to content

Commit c3a6b03

Browse files
fix: resolve more assert errors (#16)
* added eval scripts Signed-off-by: Peter Staar <[email protected]> * fix errors in PMC Signed-off-by: Peter Staar <[email protected]> * reformat code Signed-off-by: Peter Staar <[email protected]> --------- Signed-off-by: Peter Staar <[email protected]>
1 parent e842657 commit c3a6b03

File tree

3 files changed

+179
-235
lines changed

3 files changed

+179
-235
lines changed

docling_parse/eval.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import argparse
2+
import glob
3+
import io
4+
import os
5+
6+
from tabulate import tabulate
7+
8+
# from docling_parse.docling_parse import pdf_parser
9+
import docling_parse
10+
from docling_parse import pdf_parser
11+
12+
13+
def main():
    """Parse every PDF in a directory page by page and print a result table.

    Command-line options:
        -l / --log-level  integer log level passed to the PDF parser (default 2).
        -d / --pdfdir     directory searched (non-recursively) for ``*.pdf`` files.
        -m / --max-docs   optional upper bound on the number of documents to run.

    Each document is loaded, parsed one page at a time, and unloaded again.
    A document counts as failed when it cannot be loaded, when any page result
    lacks the "pages" key, or when the parser raises. A failure in one
    document does not stop the evaluation of the remaining ones.
    """
    arg_parser = argparse.ArgumentParser(description="Process a PDF file.")

    # Verbosity of the underlying parser
    arg_parser.add_argument(
        "-l",
        "--log-level",
        type=int,
        required=False,
        default=2,
        help="log-level 1,2,3,4",
    )

    # Directory that contains the PDF files to evaluate
    arg_parser.add_argument(
        "-d",
        "--pdfdir",
        type=str,
        help="Path to the directory with PDF files",
        required=True,
    )

    # Optional cap on the number of documents
    arg_parser.add_argument(
        "-m",
        "--max-docs",
        type=int,
        required=False,
        default=None,
        help="max number of documents to run on",
    )

    args = arg_parser.parse_args()
    print(f"The provided PDF path is: {args.pdfdir}")

    # A regular file would pass an existence check but can never match the
    # "*.pdf" glob below, so require an actual directory.
    if not os.path.isdir(args.pdfdir):
        print(f"Error: The directory {args.pdfdir} does not exist.")
        return

    parser = docling_parse.pdf_parser()
    parser.set_loglevel(args.log_level)

    overview = []

    doc_files = sorted(glob.glob(os.path.join(args.pdfdir, "*.pdf")))
    if args.max_docs is not None:
        doc_files = doc_files[0 : args.max_docs]

    for doc_file in doc_files:
        print(doc_file)

        doc_key = f"key={doc_file}"  # unique document key (eg hash, UUID, etc)

        num_pages = 0
        page = None  # stays None until the page loop starts; used in error output
        loaded = False
        failed = False

        try:
            # Load the document; a falsy result is treated as a load failure
            loaded = parser.load_document(doc_key, doc_file)

            if not loaded:
                print(f"ERROR: document {doc_file} could not be loaded")
                failed = True
            else:
                # Get number of pages
                num_pages = parser.number_of_pages(doc_key)
                print("#-pages: ", num_pages)

                # Parse page by page to minimize memory footprint
                for page in range(0, num_pages):
                    json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)

                    if "pages" not in json_doc:  # page could not get parsed
                        print(f"ERROR: page {page} is not parsed ... ")
                        failed = True
                    else:
                        print(f"SUCCESS: page {page} is parsed ... ")
        except Exception as e:
            # The failure may happen before any page was reached
            where = f"page {page}" if page is not None else "document"
            print(f"ERROR: {where} is not parsed: {e}")
            failed = True
        finally:
            # Unload the document even when parsing raised
            if loaded:
                parser.unload_document(doc_key)

        overview.append([doc_file, (not failed), num_pages])

    print(tabulate(overview, headers=["filename", "success", "#-pages"]))


if __name__ == "__main__":
    main()

src/proj_folders/pdf_library/core/object/cmap.h

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,8 +200,15 @@ namespace pdf_lib
200200
auto itr = src.begin();
201201
c = utf8::next(itr, src.end());
202202
}
203-
assert(_range.first<=c and c<=_range.second);
203+
//assert(_range.first<=c and c<=_range.second);
204204

205+
if(c<_range.first or _range.second<c)
206+
{
207+
logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__
208+
<< "\tchar-index " << c << " for " << tgt
209+
<< " is out of range [" << _range.first << ", " << _range.second << "]";
210+
}
211+
205212
_map[c] = tgt;
206213

207214
return *this;
@@ -239,8 +246,15 @@ namespace pdf_lib
239246
{
240247
for(uint32_t i = 0; i < end - begin + 1; i++)
241248
{
242-
assert(_range.first<=begin+i and begin+i<=_range.second);
249+
//assert(_range.first<=begin+i and begin+i<=_range.second);
243250

251+
if(begin+i<_range.first or _range.second<begin+i)
252+
{
253+
logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__
254+
<< "\tchar-index " << begin+i //<< " for " << tgt.at(i)
255+
<< " is out of range [" << _range.first << ", " << _range.second << "]";
256+
}
257+
244258
try
245259
{
246260
std::string tmp(16, 0);
@@ -254,6 +268,9 @@ namespace pdf_lib
254268
}
255269
catch(...)
256270
{
271+
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
272+
<< "could not determine char-value for cmap at index " << (begin+i);
273+
257274
_map[begin + i] = "UNICODE<"+std::to_string(begin+i)+">";
258275
}
259276
}
@@ -262,8 +279,15 @@ namespace pdf_lib
262279
{
263280
for(uint32_t i = 0; i < end - begin + 1; i++)
264281
{
265-
assert(_range.first<=begin+i and begin+i<=_range.second);
282+
//assert(_range.first<=begin+i and begin+i<=_range.second);
266283

284+
if(begin+i<_range.first or _range.second<begin+i)
285+
{
286+
logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__
287+
<< "\tchar-index " << begin+i //<< " for " << tgt.at(i)
288+
<< " is out of range [" << _range.first << ", " << _range.second << "]";
289+
}
290+
267291
try
268292
{
269293
std::string tmp(128, 0);
@@ -280,6 +304,8 @@ namespace pdf_lib
280304
}
281305
catch(...)
282306
{
307+
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
308+
<< "could not determine char-value for cmap at index " << (begin+i);
283309
_map[begin + i] = "UNICODE<"+std::to_string(begin+i)+">";
284310
}
285311

@@ -336,8 +362,15 @@ namespace pdf_lib
336362
//if(begin + i>255)
337363
//std::cout << src_begin << "\t" << tgt.at(i) << "\t" << __FUNCTION__ << "\n" ;
338364

339-
assert(_range.first<=begin+i and begin+i<=_range.second);
365+
//assert(_range.first<=begin+i and begin+i<=_range.second);
340366

367+
if(begin+i<_range.first or _range.second<begin+i)
368+
{
369+
logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__
370+
<< "\tchar-index " << begin+i << " for " << tgt.at(i)
371+
<< " is out of range [" << _range.first << ", " << _range.second << "]";
372+
}
373+
341374
_map[begin + i] = tgt.at(i);
342375

343376
//std::cout << __FUNCTION__ << "-2\t[" << src_begin << ":" << src_end

0 commit comments

Comments
 (0)