@@ -28,6 +28,8 @@ namespace pdf_lib
2828 interface ();
2929 ~interface ();
3030
31+ void clear ();
32+
3133 int query (std::string input_file);
3234
3335 // private:
@@ -36,12 +38,25 @@ namespace pdf_lib
3638
3739 void parse (container_lib::container& input);
3840
39- bool parse_pdf_page (std::string id,
40- container_lib::container& raw_page );
41+ bool load_document (std::string filename);
42+ bool unload_document (std::string filename );
4143
44+ std::string load_document (const char * buffer, std::size_t size);
45+
46+ bool unload_documents ();
47+
48+ bool parse_pdf_page (std::string filename,
49+ container_lib::container& raw_page);
50+
51+ bool parse_pdf_page (std::string filename, int page,
52+ container_lib::container& raw_page);
53+
4254 bool parse_pdf_page (const char * buffer, std::size_t size,
4355 container_lib::container& raw_page);
4456
57+ bool parse_pdf_page (const char * buffer, std::size_t size, int page,
58+ container_lib::container& raw_page);
59+
4560 bool clean_raw_page (container_lib::container& raw_page);
4661
4762 void clean_pages (container_lib::container& raw_doc);
@@ -77,6 +92,9 @@ namespace pdf_lib
7792
7893 std::vector<parser_task> tasks;
7994 std::vector<ocr_merge_task> ocr_merge_tasks;
95+
96+ std::map<std::string, std::shared_ptr<pdf_lib::core::object<pdf_lib::core::DOCUMENT> > > loaded_documents;
97+ std::map<std::string, std::shared_ptr<pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> > > loaded_parsers;
8098 };
8199
82100 interface<PARSER>::interface():
@@ -95,6 +113,12 @@ namespace pdf_lib
95113 core::object<core::FONT>::finalize ();
96114 }
97115
116+ void interface<PARSER>::clear()
117+ {
118+ loaded_documents.clear ();
119+ loaded_parsers.clear ();
120+ }
121+
98122 int interface<PARSER>::query(std::string input_file)
99123 {
100124 logging_lib::info (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t " << __FUNCTION__;
@@ -503,6 +527,104 @@ namespace pdf_lib
503527 return true ;
504528 }
505529
530+ bool interface<PARSER>::load_document(std::string filename)
531+ {
532+ logging_lib::info (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t " << __FUNCTION__;
533+
534+ if (loaded_documents.count (filename)==0 and
535+ loaded_parsers.count (filename)==0 )
536+ {
537+ auto doc = std::make_shared<pdf_lib::core::object<pdf_lib::core::DOCUMENT> >();
538+ auto parser = std::make_shared<pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> >(*doc);
539+
540+ // pdf_lib::core::object<pdf_lib::core::DOCUMENT> doc;
541+ // pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> parser(doc);
542+
543+ parser->load_document (filename);
544+ doc->resize_pages (parser->number_of_pages ());
545+
546+ loaded_documents[filename] = doc;
547+ loaded_parsers[filename] = parser;
548+ }
549+
550+ return true ;
551+ }
552+
553+ bool interface<PARSER>::unload_document(std::string filename)
554+ {
555+ logging_lib::info (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t " << __FUNCTION__;
556+
557+ if (loaded_documents.count (filename))
558+ {
559+ loaded_documents.erase (filename);
560+ }
561+
562+ if (loaded_parsers.count (filename))
563+ {
564+ loaded_parsers.erase (filename);
565+ }
566+
567+ return true ;
568+ }
569+
570+ bool interface<PARSER>::unload_documents()
571+ {
572+ loaded_documents.clear ();
573+ loaded_parsers.clear ();
574+
575+ return true ;
576+ }
577+
578+ bool interface<PARSER>::parse_pdf_page(std::string filename, int page,
579+ container_lib::container &raw_page)
580+ {
581+ logging_lib::info (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t " << __FUNCTION__;
582+
583+ raw_page.clear ();
584+
585+ /*
586+ pdf_lib::core::object<pdf_lib::core::DOCUMENT> doc;
587+
588+ {
589+ pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> parser(doc);
590+ parser.load_document(filename).process_page(page);
591+ }
592+ */
593+
594+ // lazy loading
595+ load_document (filename);
596+
597+ // clean the document
598+ {
599+ // auto doc = std::make_shared<pdf_lib::core::object<pdf_lib::core::DOCUMENT> >();
600+ // loaded_documents[filename] = doc;
601+ }
602+
603+ auto & doc = loaded_documents.at (filename);
604+ auto & parser = loaded_parsers.at (filename);
605+
606+ // parser->set_object(*doc);
607+
608+ parser->process_page_from_document (page);
609+
610+ try
611+ {
612+ pdf_lib::core::writer writer;
613+ writer.execute (*doc, raw_page);
614+
615+ // do clean-up
616+ doc->delete_page (page);
617+ }
618+ catch (...)
619+ {
620+ logging_lib::error (" pdf-parser" ) << __FILE__ << " :" << __LINE__
621+ << " \t ERROR in conversion pdf_lib::core::DOCUMENT --> container !!\n " ;
622+ return false ;
623+ }
624+
625+ return true ;
626+ }
627+
506628 bool interface<PARSER>::parse_pdf_page(const char * buffer, std::size_t size,
507629 container_lib::container &raw_page)
508630 {
@@ -541,15 +663,80 @@ namespace pdf_lib
541663 return true ;
542664 }
543665
666+ std::string interface<PARSER>::load_document(const char * buffer, std::size_t size)
667+ {
668+ logging_lib::info (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t " << __FUNCTION__;
669+
670+ // we are using the buffer as a way to disambiguate doc's. Is not completely fool-proof
671+ std::string key = " document-" +std::to_string (size);
672+
673+ if (loaded_documents.count (key)==0 and
674+ loaded_parsers.count (key)==0 )
675+ {
676+ auto doc = std::make_shared<pdf_lib::core::object<pdf_lib::core::DOCUMENT> >();
677+ auto parser = std::make_shared<pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> >(*doc);
678+
679+ std::string desc = " parsing document buffer via BytesIO" ;
680+ parser->load_buffer (desc.c_str (), buffer, size);
681+
682+ doc->resize_pages (parser->number_of_pages ());
683+
684+ loaded_documents[key] = doc;
685+ loaded_parsers[key] = parser;
686+ }
687+
688+ return key;
689+ }
690+
691+ bool interface<PARSER>::parse_pdf_page(const char * buffer, std::size_t size, int page,
692+ container_lib::container &raw_page)
693+ {
694+ logging_lib::info (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t " << __FUNCTION__;
695+
696+ raw_page.clear ();
697+
698+ /*
699+ pdf_lib::core::object<pdf_lib::core::DOCUMENT> doc;
700+
701+ {
702+ std::string desc = "parsing document buffer via BytesIO";
703+
704+ pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> parser(doc);
705+ parser.load_buffer(desc.c_str(), buffer, size).process_all();
706+ }
707+ */
708+
709+ // lazy loading
710+ std::string key = load_document (buffer, size);
711+
712+ auto & doc = loaded_documents.at (key);
713+ auto & parser = loaded_parsers.at (key);
714+
715+ parser->process_page_from_document (page);
716+
717+ try
718+ {
719+ pdf_lib::core::writer writer;
720+ writer.execute (*doc, raw_page);
721+ }
722+ catch (...)
723+ {
724+ logging_lib::error (" pdf-parser" ) << __FILE__ << " :" << __LINE__
725+ << " \t ERROR in conversion pdf_lib::core::DOCUMENT --> container !!\n " ;
726+ return false ;
727+ }
728+
729+ return true ;
730+ }
731+
544732 bool interface<PARSER>::clean_raw_page(container_lib::container& raw_page)
545733 {
546734 logging_lib::info (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t " << __FUNCTION__;
547735
548736 typedef float scalar_type;
549737
550- logging_lib::info (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t "
551- << " #-cells: " << raw_page[" pages" ][0 ][" cells" ].get_size ();
552-
738+ // logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
739+ // << "#-cells: " << raw_page["pages"][0]["cells"].get_size();
553740
554741 post_processor<BUILD_OVERVIEW, scalar_type> overview;
555742 {
@@ -623,22 +810,30 @@ namespace pdf_lib
623810 {
624811 container_lib::container &page = pages[k];
625812
626- {
627- post_processor<REMOVE_EMPTY_CELLS, scalar_type> post_processor;
628- post_processor.execute (page);
629- }
630-
631- {
632- post_processor<REMOVE_DUPLICATE_CELLS, scalar_type> post_processor;
633- post_processor.execute (page);
634- }
635-
636- {
637- post_processor<REMOVE_OUTLIER_CELLS, scalar_type> post_processor;
638- post_processor.execute (page);
639- }
813+ if (page.has (core::keys<core::PAGE>::cells ()))
814+ {
815+ {
816+ post_processor<REMOVE_EMPTY_CELLS, scalar_type> post_processor;
817+ post_processor.execute (page);
818+ }
819+
820+ {
821+ post_processor<REMOVE_DUPLICATE_CELLS, scalar_type> post_processor;
822+ post_processor.execute (page);
823+ }
824+
825+ {
826+ post_processor<REMOVE_OUTLIER_CELLS, scalar_type> post_processor;
827+ post_processor.execute (page);
828+ }
829+
830+ }
831+ else
832+ {
833+ logging_lib::warn (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t "
834+ << " skipping page: no cells" ;
835+ }
640836 }
641-
642837 }
643838
644839 bool interface<PARSER>::is_acceptable(container_lib::container& raw_doc, container_lib::container& page_stats)
@@ -664,7 +859,6 @@ namespace pdf_lib
664859
665860 container_lib::container& v_paths = page[core::keys<core::PAGE>::vertical_lines ()];
666861 container_lib::container& h_paths = page[core::keys<core::PAGE>::horizontal_lines ()];
667-
668862
669863 logging_lib::info (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " raw-doc:\n "
670864 << " \t #-cells: " << cells.get_size () << " \n "
0 commit comments