Skip to content

Commit 400fcb3

Browse files
feat: deal with qpdf errors on a page by page basis (#11)
* adding load/unload from key Signed-off-by: Peter Staar <[email protected]> * updated tests Signed-off-by: Peter Staar <[email protected]> * all fixed, still need to clean all commented out code Signed-off-by: Peter Staar <[email protected]> * ran pre-commit hooks Signed-off-by: Peter Staar <[email protected]> * allow more tabulate versions Signed-off-by: Michele Dolfi <[email protected]> * renamed some key functions Signed-off-by: Peter Staar <[email protected]> * ran pre-commit hooks (2) Signed-off-by: Peter Staar <[email protected]> * added log-level to run Signed-off-by: Peter Staar <[email protected]> * Added try-catch to avoid poisonous pdf-pages (2) Signed-off-by: Peter Staar <[email protected]> * updated the readme Signed-off-by: Peter Staar <[email protected]> * removed multi-threading build Signed-off-by: Peter Staar <[email protected]> --------- Signed-off-by: Peter Staar <[email protected]> Signed-off-by: Michele Dolfi <[email protected]> Co-authored-by: Michele Dolfi <[email protected]>
1 parent ecb2da8 commit 400fcb3

File tree

4 files changed

+67
-9
lines changed

4 files changed

+67
-9
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ for page in range(0, num_pages):
5858
# No need to unload a specific page
5959
json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)
6060

61+
if "pages" not in json_doc: # page could not get parsed
62+
continue
63+
6164
# parsed page is the first one!
6265
json_page = json_doc["pages"][0]
6366

build.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def build_local(multi_threaded=True):
3939

4040
cmd = f"cmake --build {BUILD_DIR} --target install"
4141
if multi_threaded:
42-
cmd += " -j"
42+
cmd += " -j 4"
4343
success = run(cmd, cwd=ROOT_DIR)
4444
if not success:
4545
raise RuntimeError("Error building.")

docling_parse/run.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,16 @@ def main():
1313
# Create the argument parser
1414
parser = argparse.ArgumentParser(description="Process a PDF file.")
1515

16+
# Add an argument for the log level
17+
parser.add_argument(
18+
"-l",
19+
"--log-level",
20+
type=int,
21+
required=False,
22+
default=2,
23+
help="log-level 1,2,3,4",
24+
)
25+
1626
# Add an argument for the path to the PDF file
1727
parser.add_argument(
1828
"-p", "--pdf", type=str, help="Path to the PDF file", required=True
@@ -30,6 +40,7 @@ def main():
3040
# Print the path to the PDF file (or add your processing logic here)
3141

3242
parser = docling_parse.pdf_parser()
43+
parser.set_loglevel(args.log_level)
3344

3445
doc_file = args.pdf # filename
3546
doc_key = f"key={args.pdf}" # unique document key (eg hash, UUID, etc)
@@ -43,6 +54,10 @@ def main():
4354
# Parse page by page to minimize memory footprint
4455
for page in range(0, num_pages):
4556
json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)
57+
58+
if "pages" not in json_doc: # page could not get parsed
59+
continue
60+
4661
json_page = json_doc["pages"][0]
4762

4863
page_dimensions = [

src/proj_folders/pdf_interface/parser.h

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -691,7 +691,23 @@ namespace pdf_lib
691691
auto& parser = loaded_parsers.at(key);
692692

693693
doc->resize_pages(0);
694-
parser->process_all();
694+
try
695+
{
696+
parser->process_all();
697+
}
698+
catch (const std::exception& e) // Catch standard exceptions
699+
{
700+
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
701+
<< " error with process_page_from_document: "
702+
<< e.what();
703+
return false;
704+
}
705+
catch (...) // Catch any other types of exceptions
706+
{
707+
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
708+
<< " unknown error with process_page_from_document";
709+
return false;
710+
}
695711

696712
try
697713
{
@@ -725,8 +741,24 @@ namespace pdf_lib
725741

726742
auto& doc = loaded_documents.at(key);
727743
auto& parser = loaded_parsers.at(key);
728-
729-
parser->process_page_from_document(page);
744+
745+
try
746+
{
747+
parser->process_page_from_document(page);
748+
}
749+
catch (const std::exception& e) // Catch standard exceptions
750+
{
751+
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
752+
<< " error with process_page_from_document: "
753+
<< e.what();
754+
return false;
755+
}
756+
catch (...)
757+
{
758+
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
759+
<< " error with process_page_from_document";
760+
return false;
761+
}
730762

731763
try
732764
{
@@ -739,7 +771,7 @@ namespace pdf_lib
739771
catch (...)
740772
{
741773
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
742-
<< "\t ERROR in conversion pdf_lib::core::DOCUMENT --> container !!\n";
774+
<< " ERROR in conversion pdf_lib::core::DOCUMENT --> container !!\n";
743775
return false;
744776
}
745777

@@ -755,10 +787,18 @@ namespace pdf_lib
755787

756788
pdf_lib::core::object<pdf_lib::core::DOCUMENT> doc;
757789

758-
{
759-
pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> parser(doc);
760-
parser.load_document(filename).process_all();
761-
}
790+
try
791+
{
792+
pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> parser(doc);
793+
parser.load_document(filename).process_all();
794+
}
795+
catch (const std::exception& e) // Catch standard exceptions
796+
{
797+
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
798+
<< " error with process_page_from_document: "
799+
<< e.what();
800+
return false;
801+
}
762802

763803
try
764804
{

0 commit comments

Comments
 (0)