Skip to content

Commit b65a28d

Browse files
cleaned up unused code
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
1 parent e7812a1 commit b65a28d

File tree

2 files changed

+31
-191
lines changed

2 files changed

+31
-191
lines changed

app/pybind_parse.cpp

Lines changed: 14 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -382,17 +382,16 @@ PYBIND11_MODULE(pdf_parsers, m) {
382382
List[str]: A list of keys for the currently loaded documents.)")
383383

384384
.def("load_document",
385-
[](
386-
docling::docling_parser &self,
387-
const std::string &key,
388-
const std::string &filename,
389-
std::optional<std::string>& password
390-
) -> bool {
385+
[](docling::docling_parser &self,
386+
const std::string &key,
387+
const std::string &filename,
388+
std::optional<std::string>& password
389+
) -> bool {
391390
return self.load_document(key, filename, password);
392391
},
393392
pybind11::arg("key"),
394393
pybind11::arg("filename"),
395-
pybind11::arg("password") = pybind11::none(),
394+
pybind11::arg("password") = pybind11::none(),
396395
R"(
397396
Load a document by key and filename.
398397
@@ -405,17 +404,23 @@ PYBIND11_MODULE(pdf_parsers, m) {
405404
bool: True if the document was successfully loaded, False otherwise.)")
406405

407406
.def("load_document_from_bytesio",
408-
[](docling::docling_parser &self, const std::string &key, pybind11::object bytes_io) -> bool {
409-
return self.load_document_from_bytesio(key, bytes_io);
407+
[](docling::docling_parser &self,
408+
const std::string &key,
409+
pybind11::object bytes_io,
410+
std::optional<std::string>& password
411+
) -> bool {
412+
return self.load_document_from_bytesio(key, bytes_io, password);
410413
},
411414
pybind11::arg("key"),
412415
pybind11::arg("bytes_io"),
416+
pybind11::arg("password") = pybind11::none(),
413417
R"(
414418
Load a document by key from a BytesIO-like object.
415419
416420
Parameters:
417421
key (str): The unique key to identify the document.
418422
bytes_io (Any): A BytesIO-like object containing the document data.
423+
password (str, optional): Optional password for password-protected files
419424
420425
Returns:
421426
bool: True if the document was successfully loaded, False otherwise.)")

src/pybind/docling_parser.h

Lines changed: 17 additions & 182 deletions
Original file line number | Diff line number | Diff line change
@@ -26,14 +26,18 @@ namespace docling
2626

2727
docling_parser(std::string level);
2828

29-
void set_loglevel(int level=0);
3029
void set_loglevel_with_label(std::string level="error");
3130

3231
bool is_loaded(std::string key);
3332
std::vector<std::string> list_loaded_keys();
3433

35-
bool load_document(std::string key, std::string filename, std::optional<std::string> password);
36-
bool load_document_from_bytesio(std::string key, pybind11::object bytes_io);
34+
bool load_document(std::string key,
35+
std::string filename,
36+
std::optional<std::string> password);
37+
38+
bool load_document_from_bytesio(std::string key,
39+
pybind11::object bytes_io,
40+
std::optional<std::string> password);
3741

3842
bool unload_document(std::string key);
3943
bool unload_document_pages(std::string key);
@@ -48,37 +52,10 @@ namespace docling
4852
nlohmann::json get_meta_xml(std::string key);
4953
nlohmann::json get_table_of_contents(std::string key);
5054

51-
// Direct typed access to page decoder (avoids JSON serialization)
52-
std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> get_page_decoder(
53-
std::string key,
54-
int page,
55-
std::string page_boundary,
56-
bool do_sanitization,
57-
bool create_word_cells,
58-
bool create_line_cells);
59-
60-
// Config-based overload
61-
std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> get_page_decoder(
62-
std::string key,
55+
std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> get_page_decoder(std::string key,
6356
int page,
6457
const pdflib::decode_page_config& config);
6558

66-
nlohmann::json sanitize_cells(nlohmann::json& original_cells,
67-
nlohmann::json& page_dim,
68-
nlohmann::json& page_shapes,
69-
double horizontal_cell_tolerance,
70-
bool enforce_same_font,
71-
double space_width_factor_for_merge, //=1.5,
72-
double space_width_factor_for_merge_with_space); //=0.33);
73-
74-
nlohmann::json sanitize_cells_in_bbox(nlohmann::json& page,
75-
std::array<double, 4> bbox,
76-
double cell_overlap,
77-
double horizontal_cell_tolerance,
78-
bool enforce_same_font,
79-
double space_width_factor_for_merge, //=1.5,
80-
double space_width_factor_for_merge_with_space); //=0.33);
81-
8259
private:
8360

8461
bool verify_page_boundary(std::string page_boundary);
@@ -140,30 +117,6 @@ namespace docling
140117
pdflib::pdf_resource<pdflib::PAGE_FONT>::initialise(data, timings);
141118
}
142119

143-
void docling_parser::set_loglevel(int level)
144-
{
145-
if(level>=3)
146-
{
147-
loguru::g_stderr_verbosity = loguru::Verbosity_INFO;
148-
}
149-
else if(level==2)
150-
{
151-
loguru::g_stderr_verbosity = loguru::Verbosity_WARNING;
152-
}
153-
else if(level==1)
154-
{
155-
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
156-
}
157-
else if(level==0)
158-
{
159-
loguru::g_stderr_verbosity = loguru::Verbosity_FATAL;
160-
}
161-
else
162-
{
163-
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
164-
}
165-
}
166-
167120
void docling_parser::set_loglevel_with_label(std::string level)
168121
{
169122
if(level=="info")
@@ -206,11 +159,13 @@ namespace docling
206159
return (key2doc.count(key)==1);
207160
}
208161

209-
bool docling_parser::load_document(std::string key, std::string filename, std::optional<std::string> password)
162+
bool docling_parser::load_document(std::string key,
163+
std::string filename,
164+
std::optional<std::string> password)
210165
{
211166
#ifdef _WIN32
212167
// Convert UTF-8 string to UTF-16 wstring
213-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
168+
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t> > converter;
214169
std::wstring wide_filename = converter.from_bytes(filename);
215170
std::filesystem::path path_filename(wide_filename);
216171
#else
@@ -229,7 +184,9 @@ namespace docling
229184
return false;
230185
}
231186

232-
bool docling_parser::load_document_from_bytesio(std::string key, pybind11::object bytes_io)
187+
bool docling_parser::load_document_from_bytesio(std::string key,
188+
pybind11::object bytes_io,
189+
std::optional<std::string> password)
233190
{
234191
// logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
235192
LOG_S(INFO) << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
@@ -252,7 +209,7 @@ namespace docling
252209
try
253210
{
254211
key2doc[key] = std::make_shared<decoder_type>();
255-
std::optional<std::string> password = std::nullopt;
212+
//std::optional<std::string> password = std::nullopt;
256213
std::string description = "parsing of " + key + " from bytesio";
257214
key2doc.at(key)->process_document_from_bytesio(data_str, password, description);
258215

@@ -382,25 +339,7 @@ namespace docling
382339
return (itr->second)->get_table_of_contents();
383340
}
384341

385-
std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> docling_parser::get_page_decoder(
386-
std::string key,
387-
int page,
388-
std::string page_boundary,
389-
bool do_sanitization,
390-
bool create_word_cells,
391-
bool create_line_cells)
392-
{
393-
pdflib::decode_page_config config;
394-
config.page_boundary = page_boundary;
395-
config.do_sanitization = do_sanitization;
396-
config.create_word_cells = create_word_cells;
397-
config.create_line_cells = create_line_cells;
398-
399-
return get_page_decoder(key, page, config);
400-
}
401-
402-
std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> docling_parser::get_page_decoder(
403-
std::string key,
342+
std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> docling_parser::get_page_decoder(std::string key,
404343
int page,
405344
const pdflib::decode_page_config& config)
406345
{
@@ -417,110 +356,6 @@ namespace docling
417356
return decoder->decode_page(page, config);
418357
}
419358

420-
nlohmann::json docling_parser::sanitize_cells(nlohmann::json& json_cells,
421-
nlohmann::json& json_dim,
422-
nlohmann::json& json_shapes,
423-
double horizontal_cell_tolerance,
424-
bool enforce_same_font,
425-
double space_width_factor_for_merge, //=1.5,
426-
double space_width_factor_for_merge_with_space) //=0.33);
427-
{
428-
pdflib::page_item<pdflib::PAGE_DIMENSION> dim;
429-
dim.init_from(json_dim);
430-
431-
pdflib::page_item<pdflib::PAGE_SHAPES> shapes;
432-
shapes.init_from(json_shapes);
433-
434-
pdflib::page_item<pdflib::PAGE_CELLS> cells;
435-
cells.init_from(json_cells);
436-
437-
pdflib::page_item_sanitator<pdflib::PAGE_CELLS> sanitizer;//(dim, shapes);
438-
sanitizer.sanitize_bbox(cells, horizontal_cell_tolerance, enforce_same_font,
439-
space_width_factor_for_merge,
440-
space_width_factor_for_merge_with_space);
441-
442-
sanitizer.sanitize_text(cells);
443-
444-
return cells.get();
445-
}
446-
447-
nlohmann::json docling_parser::sanitize_cells_in_bbox(nlohmann::json& page,
448-
std::array<double, 4> bbox,
449-
double cell_overlap,
450-
double horizontal_cell_tolerance,
451-
bool enforce_same_font,
452-
double space_width_factor_for_merge, //=1.5,
453-
double space_width_factor_for_merge_with_space) //=0.33);
454-
{
455-
LOG_S(INFO) << __FUNCTION__
456-
<< ", cell_overlap: " << cell_overlap
457-
<< ", horizontal_cell_tolerance: " << horizontal_cell_tolerance
458-
<< ", enforce_same_font: " << enforce_same_font;
459-
460-
// empty array
461-
nlohmann::json sanitized_cells = nlohmann::json::array({});
462-
463-
double x0 = bbox[0];
464-
double y0 = bbox[1];
465-
466-
double x1 = bbox[2];
467-
double y1 = bbox[3];
468-
469-
pdflib::page_item<pdflib::PAGE_DIMENSION> dim;
470-
if(not dim.init_from(page["original"]["dimension"]))
471-
{
472-
LOG_S(WARNING) << "could not init dim";
473-
return sanitized_cells;
474-
}
475-
476-
pdflib::page_item<pdflib::PAGE_SHAPES> shapes;
477-
if(not shapes.init_from(page["original"]["shapes"]))
478-
{
479-
LOG_S(WARNING) << "could not init shapes";
480-
return sanitized_cells;
481-
}
482-
483-
pdflib::page_item<pdflib::PAGE_CELLS> cells;
484-
if(not cells.init_from(page["original"]["cells"]["data"]))
485-
{
486-
LOG_S(WARNING) << "could not init cells";
487-
return sanitized_cells;
488-
}
489-
490-
LOG_S(INFO) << "init done ... --> #-cells: " << cells.size();
491-
492-
// get all cells with an overlap over cell_overlap
493-
pdflib::page_item<pdflib::PAGE_CELLS> selected_cells;
494-
for(int i=0; i<cells.size(); i++)
495-
{
496-
double overlap = utils::values::compute_overlap(cells[i].x0, cells[i].y0, cells[i].x1, cells[i].y1,
497-
x0, y0, x1, y1);
498-
499-
if(overlap>cell_overlap-1.e-3)
500-
{
501-
selected_cells.push_back(cells[i]);
502-
}
503-
}
504-
505-
if(selected_cells.size()==0)
506-
{
507-
return sanitized_cells;
508-
}
509-
510-
pdflib::page_item_sanitator<pdflib::PAGE_CELLS> sanitizer;
511-
sanitizer.sanitize_bbox(selected_cells,
512-
horizontal_cell_tolerance,
513-
enforce_same_font,
514-
space_width_factor_for_merge,
515-
space_width_factor_for_merge_with_space);
516-
517-
sanitizer.sanitize_text(selected_cells);
518-
519-
return selected_cells.get();
520-
}
521-
522-
523-
524359
}
525360

526361
#endif

0 commit comments

Comments
 (0)