Skip to content

Commit 92e02ec

Browse files
feat: read page by page (#7)
* first working version to parse page-by-page Signed-off-by: Peter Staar <[email protected]> * added the read page-by-page using bytesio Signed-off-by: Peter Staar <[email protected]> --------- Signed-off-by: Peter Staar <[email protected]>
1 parent db951c8 commit 92e02ec

File tree

13 files changed

+6307
-45
lines changed

13 files changed

+6307
-45
lines changed

app/pybind_parse.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,24 @@ PYBIND11_MODULE(docling_parse, m) {
1313

1414
pybind11::class_<docling::docling_parser>(m, "pdf_parser")
1515
.def(pybind11::init())
16+
1617
.def("set_loglevel", &docling::docling_parser::set_loglevel)
18+
19+
.def("unload_documents", &docling::docling_parser::unload_documents)
20+
1721
.def("find_cells",
1822
pybind11::overload_cast<std::string>(&docling::docling_parser::find_cells),
19-
"parse pdf-document from path into json")
23+
"parse pdf-document from path into json")
24+
2025
.def("find_cells_from_bytesio",
2126
&docling::docling_parser::find_cells_from_bytesio,
22-
"A function to read a BytesIO object");
27+
"parse pdf-document from a BytesIO object")
28+
29+
.def("find_cells_on_page",
30+
&docling::docling_parser::find_cells_on_page,
31+
"parse specific page in pdf-document from path into json")
32+
33+
.def("find_cells_from_bytesio_on_page",
34+
&docling::docling_parser::find_cells_from_bytesio_on_page,
35+
"parse pdf-document from a BytesIO object for a specific page");
2336
}

src/proj_folders/pdf_interface/parser.h

Lines changed: 215 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ namespace pdf_lib
2828
interface();
2929
~interface();
3030

31+
void clear();
32+
3133
int query(std::string input_file);
3234

3335
//private:
@@ -36,12 +38,25 @@ namespace pdf_lib
3638

3739
void parse(container_lib::container& input);
3840

39-
bool parse_pdf_page(std::string id,
40-
container_lib::container& raw_page);
41+
bool load_document(std::string filename);
42+
bool unload_document(std::string filename);
4143

44+
std::string load_document(const char* buffer, std::size_t size);
45+
46+
bool unload_documents();
47+
48+
bool parse_pdf_page(std::string filename,
49+
container_lib::container& raw_page);
50+
51+
bool parse_pdf_page(std::string filename, int page,
52+
container_lib::container& raw_page);
53+
4254
bool parse_pdf_page(const char* buffer, std::size_t size,
4355
container_lib::container& raw_page);
4456

57+
bool parse_pdf_page(const char* buffer, std::size_t size, int page,
58+
container_lib::container& raw_page);
59+
4560
bool clean_raw_page(container_lib::container& raw_page);
4661

4762
void clean_pages(container_lib::container& raw_doc);
@@ -77,6 +92,9 @@ namespace pdf_lib
7792

7893
std::vector<parser_task> tasks;
7994
std::vector<ocr_merge_task> ocr_merge_tasks;
95+
96+
std::map<std::string, std::shared_ptr<pdf_lib::core::object<pdf_lib::core::DOCUMENT> > > loaded_documents;
97+
std::map<std::string, std::shared_ptr<pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> > > loaded_parsers;
8098
};
8199

82100
interface<PARSER>::interface():
@@ -95,6 +113,12 @@ namespace pdf_lib
95113
core::object<core::FONT>::finalize();
96114
}
97115

116+
/// Drop every cached document/parser pair held by this interface.
///
/// Parsers are released before documents: each cached parser is constructed
/// from `*doc`, so it presumably keeps a reference to its document
/// (TODO confirm against pdf_lib::qpdf::parser). Releasing the parsers first
/// also mirrors the order in which the two maps are destroyed as members
/// (`loaded_parsers` is declared last, hence destroyed first).
void interface<PARSER>::clear()
{
  loaded_parsers.clear();
  loaded_documents.clear();
}
121+
98122
int interface<PARSER>::query(std::string input_file)
99123
{
100124
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
@@ -503,6 +527,104 @@ namespace pdf_lib
503527
return true;
504528
}
505529

530+
/// Make sure the document stored at `filename` is present in the cache.
///
/// When neither `loaded_documents` nor `loaded_parsers` has an entry for
/// `filename`, a new document object and a parser bound to it are created,
/// the parser is asked to load the file, the document is resized to the
/// parser's page count, and both are registered under `filename`.
/// Otherwise nothing is done.
///
/// @param filename  key of the cache entry; also handed to the parser's
///                  load_document().
/// @return always true; anything thrown by the parser propagates to the caller.
bool interface<PARSER>::load_document(std::string filename)
{
  logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

  typedef pdf_lib::core::object<pdf_lib::core::DOCUMENT> document_type;
  typedef pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> parser_type;

  // Only load when the key is unknown to both caches.
  const bool not_cached = (loaded_documents.count(filename)==0 and
                           loaded_parsers.count(filename)==0);
  if(not not_cached)
    {
      return true;
    }

  std::shared_ptr<document_type> document   = std::make_shared<document_type>();
  std::shared_ptr<parser_type>   pdf_parser = std::make_shared<parser_type>(*document);

  pdf_parser->load_document(filename);
  document->resize_pages(pdf_parser->number_of_pages());

  loaded_documents[filename] = document;
  loaded_parsers[filename]   = pdf_parser;

  return true;
}
552+
553+
/// Remove the cached document/parser pair registered under `filename`.
///
/// Unknown keys are ignored: `std::map::erase(key)` is a no-op when the key is
/// absent, so no prior lookup is needed.
///
/// The parser is released before its document because the parser is
/// constructed from `*doc` and presumably keeps a reference to it
/// (TODO confirm against pdf_lib::qpdf::parser).
///
/// @param filename  cache key, i.e. the path given to load_document() or the
///                  key returned by the buffer overload of load_document().
/// @return always true.
bool interface<PARSER>::unload_document(std::string filename)
{
  logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

  loaded_parsers.erase(filename);
  loaded_documents.erase(filename);

  return true;
}
569+
570+
/// Remove every cached document/parser pair.
///
/// Parsers are released before documents: each parser is constructed from
/// `*doc`, so it presumably keeps a reference to its document
/// (TODO confirm against pdf_lib::qpdf::parser). This is also the order in
/// which the two member maps are destroyed with the interface.
///
/// @return always true.
bool interface<PARSER>::unload_documents()
{
  loaded_parsers.clear();
  loaded_documents.clear();

  return true;
}
577+
578+
/// Parse a single page of the PDF at `filename` into `raw_page`.
///
/// The document is loaded lazily through load_document() and stays cached, so
/// consecutive calls for different pages of the same file reuse the parser.
/// After the page has been written to `raw_page` it is deleted from the cached
/// document again -- also when the conversion fails, so that a failed call
/// does not leave a processed page behind in the cache.
///
/// @param filename  path of the PDF; also the cache key.
/// @param page      page to process, passed unchanged to
///                  process_page_from_document() (index base not visible
///                  here -- TODO confirm).
/// @param raw_page  output container; cleared on entry.
/// @return true on success, false when writing the container or releasing the
///         page throws. Exceptions raised while loading the document or
///         processing the page propagate to the caller.
bool interface<PARSER>::parse_pdf_page(std::string filename, int page,
                                       container_lib::container &raw_page)
{
  logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

  raw_page.clear();

  // lazy loading
  load_document(filename);

  auto& doc    = loaded_documents.at(filename);
  auto& parser = loaded_parsers.at(filename);

  parser->process_page_from_document(page);

  bool success = true;

  try
    {
      pdf_lib::core::writer writer;
      writer.execute(*doc, raw_page);
    }
  catch (...)
    {
      logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
                                       << "\t ERROR in conversion pdf_lib::core::DOCUMENT --> container !!\n";
      success = false;
    }

  // Clean-up must run on the error path too, otherwise the processed page
  // stays in the cached document.
  try
    {
      doc->delete_page(page);
    }
  catch (...)
    {
      logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
                                       << "\t ERROR while releasing page from pdf_lib::core::DOCUMENT !!\n";
      success = false;
    }

  return success;
}
627+
506628
bool interface<PARSER>::parse_pdf_page(const char* buffer, std::size_t size,
507629
container_lib::container &raw_page)
508630
{
@@ -541,15 +663,80 @@ namespace pdf_lib
541663
return true;
542664
}
543665

666+
/// Make sure the PDF held in `buffer` is present in the cache and return its
/// cache key.
///
/// The key is derived from the buffer length and a 64-bit FNV-1a digest of the
/// buffer contents. Keying on the length alone would hand out the cached
/// document of a different buffer that merely has the same size. A digest
/// collision is still theoretically possible, but no longer a matter of two
/// files sharing a byte count. Computing the digest reads the whole buffer on
/// every call.
///
/// When the key is unknown to both caches, a new document object and a parser
/// bound to it are created, the parser loads the buffer, the document is
/// resized to the parser's page count, and both are registered under the key.
///
/// @param buffer  start of the PDF bytes; not dereferenced when `size` is 0.
/// @param size    number of bytes in `buffer`.
/// @return the cache key, usable with `loaded_documents`, `loaded_parsers`
///         and unload_document(). Anything thrown by the parser propagates.
std::string interface<PARSER>::load_document(const char* buffer, std::size_t size)
{
  logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

  // FNV-1a (64-bit) over the buffer contents.
  unsigned long long digest = 0xcbf29ce484222325ULL;
  for(std::size_t i=0; i<size; i++)
    {
      digest ^= static_cast<unsigned char>(buffer[i]);
      digest *= 0x100000001b3ULL;
    }

  // size + content digest disambiguate documents
  std::string key = "document-"+std::to_string(size)+"-"+std::to_string(digest);

  if(loaded_documents.count(key)==0 and
     loaded_parsers.count(key)==0)
    {
      auto doc = std::make_shared<pdf_lib::core::object<pdf_lib::core::DOCUMENT> >();
      auto parser = std::make_shared<pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> >(*doc);

      std::string desc = "parsing document buffer via BytesIO";
      parser->load_buffer(desc.c_str(), buffer, size);

      doc->resize_pages(parser->number_of_pages());

      loaded_documents[key] = doc;
      loaded_parsers[key] = parser;
    }

  return key;
}
690+
691+
/// Parse a single page of the PDF held in `buffer` into `raw_page`.
///
/// The document is loaded lazily through the buffer overload of
/// load_document() and stays cached under the key that call returns. After
/// the page has been written to `raw_page` it is deleted from the cached
/// document again, as the filename overload of this function does. Without
/// that step every processed page would stay in the cached document across
/// calls.
///
/// @param buffer    start of the PDF bytes.
/// @param size      number of bytes in `buffer`.
/// @param page      page to process, passed unchanged to
///                  process_page_from_document() (index base not visible
///                  here -- TODO confirm).
/// @param raw_page  output container; cleared on entry.
/// @return true on success, false when writing the container or releasing the
///         page throws. Exceptions raised while loading the document or
///         processing the page propagate to the caller.
bool interface<PARSER>::parse_pdf_page(const char* buffer, std::size_t size, int page,
                                       container_lib::container &raw_page)
{
  logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

  raw_page.clear();

  // lazy loading
  std::string key = load_document(buffer, size);

  auto& doc    = loaded_documents.at(key);
  auto& parser = loaded_parsers.at(key);

  parser->process_page_from_document(page);

  bool success = true;

  try
    {
      pdf_lib::core::writer writer;
      writer.execute(*doc, raw_page);
    }
  catch (...)
    {
      logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
                                       << "\t ERROR in conversion pdf_lib::core::DOCUMENT --> container !!\n";
      success = false;
    }

  // Release the processed page on both the success and the error path.
  try
    {
      doc->delete_page(page);
    }
  catch (...)
    {
      logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
                                       << "\t ERROR while releasing page from pdf_lib::core::DOCUMENT !!\n";
      success = false;
    }

  return success;
}
731+
544732
bool interface<PARSER>::clean_raw_page(container_lib::container& raw_page)
545733
{
546734
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
547735

548736
typedef float scalar_type;
549737

550-
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
551-
<< "#-cells: " << raw_page["pages"][0]["cells"].get_size();
552-
738+
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
739+
//<< "#-cells: " << raw_page["pages"][0]["cells"].get_size();
553740

554741
post_processor<BUILD_OVERVIEW, scalar_type> overview;
555742
{
@@ -623,22 +810,30 @@ namespace pdf_lib
623810
{
624811
container_lib::container &page = pages[k];
625812

626-
{
627-
post_processor<REMOVE_EMPTY_CELLS, scalar_type> post_processor;
628-
post_processor.execute(page);
629-
}
630-
631-
{
632-
post_processor<REMOVE_DUPLICATE_CELLS, scalar_type> post_processor;
633-
post_processor.execute(page);
634-
}
635-
636-
{
637-
post_processor<REMOVE_OUTLIER_CELLS, scalar_type> post_processor;
638-
post_processor.execute(page);
639-
}
813+
if(page.has(core::keys<core::PAGE>::cells()))
814+
{
815+
{
816+
post_processor<REMOVE_EMPTY_CELLS, scalar_type> post_processor;
817+
post_processor.execute(page);
818+
}
819+
820+
{
821+
post_processor<REMOVE_DUPLICATE_CELLS, scalar_type> post_processor;
822+
post_processor.execute(page);
823+
}
824+
825+
{
826+
post_processor<REMOVE_OUTLIER_CELLS, scalar_type> post_processor;
827+
post_processor.execute(page);
828+
}
829+
830+
}
831+
else
832+
{
833+
logging_lib::warn("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
834+
<< "skipping page: no cells";
835+
}
640836
}
641-
642837
}
643838

644839
bool interface<PARSER>::is_acceptable(container_lib::container& raw_doc, container_lib::container& page_stats)
@@ -664,7 +859,6 @@ namespace pdf_lib
664859

665860
container_lib::container& v_paths = page[core::keys<core::PAGE>::vertical_lines()];
666861
container_lib::container& h_paths = page[core::keys<core::PAGE>::horizontal_lines()];
667-
668862

669863
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << " raw-doc:\n"
670864
<< "\t#-cells: " << cells.get_size() << "\n"

0 commit comments

Comments
 (0)