@@ -26,14 +26,18 @@ namespace docling
2626
2727 docling_parser (std::string level);
2828
29- void set_loglevel (int level=0 );
3029 void set_loglevel_with_label (std::string level=" error" );
3130
3231 bool is_loaded (std::string key);
3332 std::vector<std::string> list_loaded_keys ();
3433
35- bool load_document (std::string key, std::string filename, std::optional<std::string> password);
36- bool load_document_from_bytesio (std::string key, pybind11::object bytes_io);
34+ bool load_document (std::string key,
35+ std::string filename,
36+ std::optional<std::string> password);
37+
38+ bool load_document_from_bytesio (std::string key,
39+ pybind11::object bytes_io,
40+ std::optional<std::string> password);
3741
3842 bool unload_document (std::string key);
3943 bool unload_document_pages (std::string key);
@@ -48,37 +52,10 @@ namespace docling
4852 nlohmann::json get_meta_xml (std::string key);
4953 nlohmann::json get_table_of_contents (std::string key);
5054
51- // Direct typed access to page decoder (avoids JSON serialization)
52- std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> get_page_decoder (
53- std::string key,
54- int page,
55- std::string page_boundary,
56- bool do_sanitization,
57- bool create_word_cells,
58- bool create_line_cells);
59-
60- // Config-based overload
61- std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> get_page_decoder (
62- std::string key,
55+ std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> get_page_decoder (std::string key,
6356 int page,
6457 const pdflib::decode_page_config& config);
6558
66- nlohmann::json sanitize_cells (nlohmann::json& original_cells,
67- nlohmann::json& page_dim,
68- nlohmann::json& page_shapes,
69- double horizontal_cell_tolerance,
70- bool enforce_same_font,
71- double space_width_factor_for_merge, // =1.5,
72- double space_width_factor_for_merge_with_space); // =0.33);
73-
74- nlohmann::json sanitize_cells_in_bbox (nlohmann::json& page,
75- std::array<double , 4 > bbox,
76- double cell_overlap,
77- double horizontal_cell_tolerance,
78- bool enforce_same_font,
79- double space_width_factor_for_merge, // =1.5,
80- double space_width_factor_for_merge_with_space); // =0.33);
81-
8259 private:
8360
8461 bool verify_page_boundary (std::string page_boundary);
@@ -140,30 +117,6 @@ namespace docling
140117 pdflib::pdf_resource<pdflib::PAGE_FONT>::initialise (data, timings);
141118 }
142119
143- void docling_parser::set_loglevel (int level)
144- {
145- if (level>=3 )
146- {
147- loguru::g_stderr_verbosity = loguru::Verbosity_INFO;
148- }
149- else if (level==2 )
150- {
151- loguru::g_stderr_verbosity = loguru::Verbosity_WARNING;
152- }
153- else if (level==1 )
154- {
155- loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
156- }
157- else if (level==0 )
158- {
159- loguru::g_stderr_verbosity = loguru::Verbosity_FATAL;
160- }
161- else
162- {
163- loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
164- }
165- }
166-
167120 void docling_parser::set_loglevel_with_label (std::string level)
168121 {
169122 if (level==" info" )
@@ -206,11 +159,13 @@ namespace docling
206159 return (key2doc.count (key)==1 );
207160 }
208161
209- bool docling_parser::load_document (std::string key, std::string filename, std::optional<std::string> password)
162+ bool docling_parser::load_document (std::string key,
163+ std::string filename,
164+ std::optional<std::string> password)
210165 {
211166#ifdef _WIN32
212167 // Convert UTF-8 string to UTF-16 wstring
213- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t >> converter;
168+ std::wstring_convert<std::codecvt_utf8_utf16<wchar_t > > converter;
214169 std::wstring wide_filename = converter.from_bytes (filename);
215170 std::filesystem::path path_filename (wide_filename);
216171#else
@@ -229,7 +184,9 @@ namespace docling
229184 return false ;
230185 }
231186
232- bool docling_parser::load_document_from_bytesio (std::string key, pybind11::object bytes_io)
187+ bool docling_parser::load_document_from_bytesio (std::string key,
188+ pybind11::object bytes_io,
189+ std::optional<std::string> password)
233190 {
234191 // logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
235192 LOG_S (INFO) << __FILE__ << " :" << __LINE__ << " \t " << __FUNCTION__;
@@ -252,7 +209,7 @@ namespace docling
252209 try
253210 {
254211 key2doc[key] = std::make_shared<decoder_type>();
255- std::optional<std::string> password = std::nullopt ;
212+ // std::optional<std::string> password = std::nullopt;
256213 std::string description = " parsing of " + key + " from bytesio" ;
257214 key2doc.at (key)->process_document_from_bytesio (data_str, password, description);
258215
@@ -382,25 +339,7 @@ namespace docling
382339 return (itr->second )->get_table_of_contents ();
383340 }
384341
385- std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> docling_parser::get_page_decoder (
386- std::string key,
387- int page,
388- std::string page_boundary,
389- bool do_sanitization,
390- bool create_word_cells,
391- bool create_line_cells)
392- {
393- pdflib::decode_page_config config;
394- config.page_boundary = page_boundary;
395- config.do_sanitization = do_sanitization;
396- config.create_word_cells = create_word_cells;
397- config.create_line_cells = create_line_cells;
398-
399- return get_page_decoder (key, page, config);
400- }
401-
402- std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> docling_parser::get_page_decoder (
403- std::string key,
342+ std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> docling_parser::get_page_decoder (std::string key,
404343 int page,
405344 const pdflib::decode_page_config& config)
406345 {
@@ -417,110 +356,6 @@ namespace docling
417356 return decoder->decode_page (page, config);
418357 }
419358
420- nlohmann::json docling_parser::sanitize_cells (nlohmann::json& json_cells,
421- nlohmann::json& json_dim,
422- nlohmann::json& json_shapes,
423- double horizontal_cell_tolerance,
424- bool enforce_same_font,
425- double space_width_factor_for_merge, // =1.5,
426- double space_width_factor_for_merge_with_space) // =0.33);
427- {
428- pdflib::page_item<pdflib::PAGE_DIMENSION> dim;
429- dim.init_from (json_dim);
430-
431- pdflib::page_item<pdflib::PAGE_SHAPES> shapes;
432- shapes.init_from (json_shapes);
433-
434- pdflib::page_item<pdflib::PAGE_CELLS> cells;
435- cells.init_from (json_cells);
436-
437- pdflib::page_item_sanitator<pdflib::PAGE_CELLS> sanitizer;// (dim, shapes);
438- sanitizer.sanitize_bbox (cells, horizontal_cell_tolerance, enforce_same_font,
439- space_width_factor_for_merge,
440- space_width_factor_for_merge_with_space);
441-
442- sanitizer.sanitize_text (cells);
443-
444- return cells.get ();
445- }
446-
// Sanitize only those cells of a page that overlap a given bounding box.
//
//   page          - page JSON; reads page["original"]["dimension"],
//                   page["original"]["shapes"] and
//                   page["original"]["cells"]["data"]
//   bbox          - box to select cells with, read as {x0, y0, x1, y1}
//   cell_overlap  - threshold for the value returned by
//                   utils::values::compute_overlap; a cell is kept when its
//                   overlap is greater than cell_overlap - 1e-3
//   horizontal_cell_tolerance, enforce_same_font,
//   space_width_factor_for_merge, space_width_factor_for_merge_with_space
//                 - forwarded unchanged to sanitizer.sanitize_bbox(...)
//
// Returns selected_cells.get() after sanitize_bbox and sanitize_text, or an
// empty JSON array when any of the three init_from calls fails or when no
// cell passes the overlap test.
447- nlohmann::json docling_parser::sanitize_cells_in_bbox (nlohmann::json& page,
448- std::array<double , 4 > bbox,
449- double cell_overlap,
450- double horizontal_cell_tolerance,
451- bool enforce_same_font,
452- double space_width_factor_for_merge, // =1.5,
453- double space_width_factor_for_merge_with_space) // =0.33);
454- {
455- LOG_S (INFO) << __FUNCTION__
456- << " , cell_overlap: " << cell_overlap
457- << " , horizontal_cell_tolerance: " << horizontal_cell_tolerance
458- << " , enforce_same_font: " << enforce_same_font;
459-
460- // empty array, returned on every early exit below
461- nlohmann::json sanitized_cells = nlohmann::json::array ({});
462-
// bbox layout: [0]=x0, [1]=y0, [2]=x1, [3]=y1
463- double x0 = bbox[0 ];
464- double y0 = bbox[1 ];
465-
466- double x1 = bbox[2 ];
467- double y1 = bbox[3 ];
468-
// NOTE(review): dim and shapes are only initialised here (a failure aborts
// the call); neither is used again in this function.
469- pdflib::page_item<pdflib::PAGE_DIMENSION> dim;
470- if (not dim.init_from (page[" original" ][" dimension" ]))
471- {
472- LOG_S (WARNING) << " could not init dim" ;
473- return sanitized_cells;
474- }
475-
476- pdflib::page_item<pdflib::PAGE_SHAPES> shapes;
477- if (not shapes.init_from (page[" original" ][" shapes" ]))
478- {
479- LOG_S (WARNING) << " could not init shapes" ;
480- return sanitized_cells;
481- }
482-
483- pdflib::page_item<pdflib::PAGE_CELLS> cells;
484- if (not cells.init_from (page[" original" ][" cells" ][" data" ]))
485- {
486- LOG_S (WARNING) << " could not init cells" ;
487- return sanitized_cells;
488- }
489-
490- LOG_S (INFO) << " init done ... --> #-cells: " << cells.size ();
491-
492- // get all cells with an overlap over cell_overlap
493- pdflib::page_item<pdflib::PAGE_CELLS> selected_cells;
494- for (int i=0 ; i<cells.size (); i++)
495- {
496- double overlap = utils::values::compute_overlap (cells[i].x0 , cells[i].y0 , cells[i].x1 , cells[i].y1 ,
497- x0, y0, x1, y1);
498-
// the 1e-3 slack also keeps cells whose overlap is marginally below cell_overlap
499- if (overlap>cell_overlap-1 .e -3 )
500- {
501- selected_cells.push_back (cells[i]);
502- }
503- }
504-
// nothing selected: return the empty array without running the sanitizer
505- if (selected_cells.size ()==0 )
506- {
507- return sanitized_cells;
508- }
509-
510- pdflib::page_item_sanitator<pdflib::PAGE_CELLS> sanitizer;
511- sanitizer.sanitize_bbox (selected_cells,
512- horizontal_cell_tolerance,
513- enforce_same_font,
514- space_width_factor_for_merge,
515- space_width_factor_for_merge_with_space);
516-
517- sanitizer.sanitize_text (selected_cells);
518-
519- return selected_cells.get ();
520- }
521-
522-
523-
524359}
525360
526361#endif
0 commit comments