Skip to content

Commit 0402b3f

Browse files
feat: reset to the old parameters in sanitation (#163)
Signed-off-by: Peter Staar <[email protected]>
1 parent 6195ff9 commit 0402b3f

File tree

7 files changed

+87
-86
lines changed

7 files changed

+87
-86
lines changed

app/pybind_parse.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -584,14 +584,14 @@ Sanitize table cells with specified parameters and return the processed JSON.
584584
[](docling::docling_sanitizer &self,
585585
double horizontal_cell_tolerance,
586586
bool enforce_same_font,
587-
double space_width_factor_for_merge = 1.0) -> nlohmann::json {
587+
double space_width_factor_for_merge/* = 1.0*/) -> nlohmann::json {
588588
return self.create_word_cells(horizontal_cell_tolerance,
589589
enforce_same_font,
590590
space_width_factor_for_merge);
591591
},
592-
pybind11::arg("horizontal_cell_tolerance")=1.0,
593-
pybind11::arg("enforce_same_font")=true,
594-
pybind11::arg("space_width_factor_for_merge")=0.33,
592+
pybind11::arg("horizontal_cell_tolerance"), // =1.0,
593+
pybind11::arg("enforce_same_font"), //=true,
594+
pybind11::arg("space_width_factor_for_merge"), //=0.33,
595595
R"(
596596
Create word cells
597597
@@ -607,17 +607,17 @@ Sanitize table cells with specified parameters and return the processed JSON.
607607
[](docling::docling_sanitizer &self,
608608
double horizontal_cell_tolerance,
609609
bool enforce_same_font,
610-
double space_width_factor_for_merge = 1.0,
611-
double space_width_factor_for_merge_with_space = 0.33) -> nlohmann::json {
610+
double space_width_factor_for_merge /*= 1.0*/,
611+
double space_width_factor_for_merge_with_space /*= 0.33*/) -> nlohmann::json {
612612
return self.create_line_cells(horizontal_cell_tolerance,
613613
enforce_same_font,
614614
space_width_factor_for_merge,
615615
space_width_factor_for_merge_with_space);
616616
},
617-
pybind11::arg("horizontal_cell_tolerance")=1.0,
618-
pybind11::arg("enforce_same_font")=true,
619-
pybind11::arg("space_width_factor_for_merge")=1.0,
620-
pybind11::arg("space_width_factor_for_merge_with_space")=0.33,
617+
pybind11::arg("horizontal_cell_tolerance"), //=1.0,
618+
pybind11::arg("enforce_same_font"), //=true,
619+
pybind11::arg("space_width_factor_for_merge"), //=1.0,
620+
pybind11::arg("space_width_factor_for_merge_with_space"), //=0.33,
621621
R"(
622622
Create line cells
623623

docling_parse/pdf_parser.py

Lines changed: 15 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -275,47 +275,6 @@ def _to_page_geometry(self, dimension: dict) -> PdfPageGeometry:
275275
bleed_bbox=bleed_bbox,
276276
)
277277

278-
"""
279-
def _to_cells(self, cells: dict) -> List[Union[PdfTextCell, TextCell]]:
280-
281-
assert "data" in cells, '"data" in cells'
282-
assert "header" in cells, '"header" in cells'
283-
284-
data = cells["data"]
285-
header = cells["header"]
286-
287-
result: List[Union[PdfTextCell, TextCell]] = []
288-
for ind, row in enumerate(data):
289-
rect = BoundingRectangle(
290-
r_x0=row[header.index(f"r_x0")],
291-
r_y0=row[header.index(f"r_y0")],
292-
r_x1=row[header.index(f"r_x1")],
293-
r_y1=row[header.index(f"r_y1")],
294-
r_x2=row[header.index(f"r_x2")],
295-
r_y2=row[header.index(f"r_y2")],
296-
r_x3=row[header.index(f"r_x3")],
297-
r_y3=row[header.index(f"r_y3")],
298-
)
299-
cell = PdfTextCell(
300-
rect=rect,
301-
text=row[header.index(f"text")],
302-
orig=row[header.index(f"text")],
303-
font_key=row[header.index(f"font-key")],
304-
font_name=row[header.index(f"font-name")],
305-
widget=row[header.index(f"widget")],
306-
text_direction=(
307-
TextDirection.LEFT_TO_RIGHT
308-
if row[header.index(f"left_to_right")]
309-
else TextDirection.RIGHT_TO_LEFT
310-
),
311-
index=ind,
312-
rendering_mode=row[header.index(f"rendering-mode")],
313-
)
314-
result.append(cell)
315-
316-
return result
317-
"""
318-
319278
def _to_cells(self, cells: dict) -> List[Union[PdfTextCell, TextCell]]:
320279
assert "data" in cells, '"data" in cells'
321280
assert "header" in cells, '"header" in cells'
@@ -481,14 +440,22 @@ def _to_segmented_page(
481440

482441
if create_words and ("word_cells" in page):
483442
segmented_page.word_cells = self._to_cells(page["word_cells"])
484-
elif create_words:
443+
segmented_page.has_words = len(segmented_page.word_cells) > 0
444+
elif keep_chars:
445+
logging.warning(
446+
"`words` will be created for segmented_page in an inefficient way!"
447+
)
485448
self._create_word_cells(segmented_page, enforce_same_font=enforce_same_font)
486449
else:
487450
logging.warning("No `words` will be created for segmented_page")
488451

489-
if create_textlines and ("word_cells" in page):
452+
if create_textlines and ("line_cells" in page):
490453
segmented_page.textline_cells = self._to_cells(page["line_cells"])
491-
elif create_textlines:
454+
segmented_page.has_lines = len(segmented_page.textline_cells) > 0
455+
elif keep_chars:
456+
logging.warning(
457+
"`text_lines` will be created for segmented_page in an inefficient way!"
458+
)
492459
self._create_textline_cells(
493460
segmented_page, enforce_same_font=enforce_same_font
494461
)
@@ -501,11 +468,11 @@ def _create_word_cells(
501468
self,
502469
segmented_page: SegmentedPdfPage,
503470
*,
471+
horizontal_cell_tolerance: float = 1.0,
504472
space_width_factor_for_merge: float = 0.33,
505473
enforce_same_font: bool = True,
506474
_loglevel: str = "fatal",
507475
):
508-
509476
if len(segmented_page.word_cells) > 0:
510477
return
511478

@@ -523,6 +490,7 @@ def _create_word_cells(
523490

524491
# data = sanitizer.create_word_cells(space_width_factor_for_merge=0.33)
525492
data = sanitizer.create_word_cells(
493+
horizontal_cell_tolerance=horizontal_cell_tolerance,
526494
space_width_factor_for_merge=space_width_factor_for_merge,
527495
enforce_same_font=enforce_same_font,
528496
)
@@ -538,6 +506,7 @@ def _create_textline_cells(
538506
self,
539507
segmented_page: SegmentedPdfPage,
540508
*,
509+
horizontal_cell_tolerance: float = 1.0,
541510
space_width_factor_for_merge: float = 1.0,
542511
space_width_factor_for_merge_with_space: float = 0.33,
543512
enforce_same_font: bool = True,
@@ -564,6 +533,7 @@ def _create_textline_cells(
564533

565534
# data = sanitizer.create_line_cells()
566535
data = sanitizer.create_line_cells(
536+
horizontal_cell_tolerance=horizontal_cell_tolerance,
567537
space_width_factor_for_merge=space_width_factor_for_merge,
568538
space_width_factor_for_merge_with_space=space_width_factor_for_merge_with_space,
569539
enforce_same_font=enforce_same_font,

docling_parse/performance_timings.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def test_performance_pdf_parse_py(
6666
create_textlines: bool = True,
6767
enforce_same_font: bool = True,
6868
lazy: bool = True,
69-
loglevel: str = "fatal",
69+
loglevel: str = "error",
7070
) -> list[DocumentTiming]:
7171

7272
pdf_docs = sorted(glob.glob(str(ifolder)))
@@ -110,10 +110,12 @@ def test_performance_pdf_parse_py(
110110
timing.num_pages += 1
111111

112112
elapsed = datetime.now() - start_1_time
113-
# timing.page_times.append(elapsed.total_seconds())
113+
timing.page_times.append(elapsed.total_seconds())
114114

115115
start_1_time = datetime.now()
116116

117+
break
118+
117119
elapsed = datetime.now() - start_0_time
118120
timing.total_time = elapsed.total_seconds()
119121

@@ -132,6 +134,7 @@ def main():
132134
for _ in timings:
133135
print(_)
134136

137+
# Optimized
135138
timings = test_performance_pdf_parse_py(
136139
ifolder=ifolder,
137140
keep_chars=False,
@@ -145,6 +148,7 @@ def main():
145148
for _ in timings:
146149
print(_)
147150

151+
# Optimized for time, not memory
148152
timings = test_performance_pdf_parse_py(
149153
ifolder=ifolder,
150154
keep_chars=True,
@@ -158,6 +162,20 @@ def main():
158162
for _ in timings:
159163
print(_)
160164

165+
# Original ...
166+
timings = test_performance_pdf_parse_py(
167+
ifolder=ifolder,
168+
keep_chars=True,
169+
keep_lines=True,
170+
keep_bitmaps=True,
171+
create_words=False,
172+
create_textlines=False,
173+
enforce_same_font=True,
174+
)
175+
176+
for _ in timings:
177+
print(_)
178+
161179

162180
if __name__ == "__main__":
163181
main()

src/pybind/docling_sanitizer.h

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,14 @@ namespace docling
2020

2121
bool set_char_cells(nlohmann::json& data);
2222

23-
//bool set_char_cells(pdflib::pdf_resource<pdflib::PAGE_CELLS>& char_cells_) { char_cells = char_cells_; }
24-
25-
//nlohmann::json to_records(std::string label);
26-
27-
nlohmann::json create_word_cells(double horizontal_cell_tolerance=1.00,
28-
bool enforce_same_font=true,
29-
double space_width_factor_for_merge=0.05);
30-
31-
nlohmann::json create_line_cells(double horizontal_cell_tolerance=1.00,
32-
bool enforce_same_font=true,
33-
double space_width_factor_for_merge=1.00,
34-
double space_width_factor_for_merge_with_space=0.33);
23+
nlohmann::json create_word_cells(double horizontal_cell_tolerance, //=1.00,
24+
bool enforce_same_font, //=true,
25+
double space_width_factor_for_merge); //=0.05);
26+
27+
nlohmann::json create_line_cells(double horizontal_cell_tolerance, //=1.00,
28+
bool enforce_same_font, //=true,
29+
double space_width_factor_for_merge, //=1.00,
30+
double space_width_factor_for_merge_with_space); //=0.33);
3531

3632
private:
3733

@@ -287,9 +283,9 @@ namespace docling
287283
*/
288284

289285
word_cells = cell_sanitizer.create_word_cells(char_cells,
290-
horizontal_cell_tolerance,
291-
enforce_same_font,
292-
space_width_factor_for_merge);
286+
horizontal_cell_tolerance,
287+
enforce_same_font,
288+
space_width_factor_for_merge);
293289

294290
return cell_sanitizer.to_records(word_cells);
295291
}
@@ -326,8 +322,6 @@ namespace docling
326322

327323
return cell_sanitizer.to_records(line_cells);
328324
}
329-
330-
331325

332326
}
333327

src/v2/pdf_decoders/document.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,16 +310,32 @@ namespace pdflib
310310
if(create_word_cells)
311311
{
312312
LOG_S(INFO) << "creating word-cells in `original` (2)";
313+
314+
double horizontal_cell_tolerance=1.00;
315+
bool enforce_same_font=true;
316+
double space_width_factor_for_merge=0.33;
313317

314-
pdf_resource<PAGE_CELLS> word_cells = sanitizer.create_word_cells(page_decoder.get_page_cells());
318+
pdf_resource<PAGE_CELLS> word_cells = sanitizer.create_word_cells(page_decoder.get_page_cells(),
319+
horizontal_cell_tolerance,
320+
enforce_same_font,
321+
space_width_factor_for_merge);
315322
page["original"]["word_cells"] = word_cells.get();
316323
}
317324

318325
if(create_line_cells)
319326
{
320327
LOG_S(INFO) << "creating line-cells in `original` (2)";
328+
329+
double horizontal_cell_tolerance=1.00;
330+
bool enforce_same_font=true;
331+
double space_width_factor_for_merge=1.00;
332+
double space_width_factor_for_merge_with_space=0.33;
321333

322-
pdf_resource<PAGE_CELLS> line_cells = sanitizer.create_line_cells(page_decoder.get_page_cells());
334+
pdf_resource<PAGE_CELLS> line_cells = sanitizer.create_line_cells(page_decoder.get_page_cells(),
335+
horizontal_cell_tolerance,
336+
enforce_same_font,
337+
space_width_factor_for_merge,
338+
space_width_factor_for_merge_with_space);
323339
page["original"]["line_cells"] = line_cells.get();
324340
}
325341

src/v2/pdf_sanitators/cells.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@ namespace pdflib
1717
nlohmann::json to_records(pdf_resource<PAGE_CELLS>& cells);
1818

1919
pdf_resource<PAGE_CELLS> create_word_cells(pdf_resource<PAGE_CELLS>& cells,
20-
double horizontal_cell_tolerance=1.00,
21-
bool enforce_same_font=true,
22-
double space_width_factor_for_merge=0.05);
20+
double horizontal_cell_tolerance, //=1.00,
21+
bool enforce_same_font, //=true,
22+
double space_width_factor_for_merge); //=0.05);
2323

2424
pdf_resource<PAGE_CELLS> create_line_cells(pdf_resource<PAGE_CELLS>& cells,
25-
double horizontal_cell_tolerance=1.00,
26-
bool enforce_same_font=true,
27-
double space_width_factor_for_merge=1.00,
28-
double space_width_factor_for_merge_with_space=0.33);
25+
double horizontal_cell_tolerance, //=1.00,
26+
bool enforce_same_font, //=true,
27+
double space_width_factor_for_merge, //=1.00,
28+
double space_width_factor_for_merge_with_space); //=0.33);
2929

3030

3131
void remove_duplicate_chars(pdf_resource<PAGE_CELLS>& cells, double eps=1.0e-1);
@@ -137,7 +137,8 @@ namespace pdflib
137137
double space_width_factor_for_merge)
138138
{
139139
LOG_S(INFO) << __FUNCTION__;
140-
140+
LOG_S(INFO) << "space_width_factor_for_merge (create_word_cells): " << space_width_factor_for_merge;
141+
141142
// do a deep copy
142143
pdf_resource<PAGE_CELLS> word_cells;
143144
word_cells = char_cells;
@@ -182,7 +183,9 @@ namespace pdflib
182183
double space_width_factor_for_merge_with_space)
183184
{
184185
LOG_S(INFO) << __FUNCTION__ << " -> char_cells: " << char_cells.size();
185-
186+
LOG_S(INFO) << "space_width_factor_for_merge (create_line_cells): " << space_width_factor_for_merge;
187+
LOG_S(INFO) << "space_width_factor_for_merge_with_space (create_line_cells): " << space_width_factor_for_merge_with_space;
188+
186189
// do a deep copy
187190
pdf_resource<PAGE_CELLS> line_cells;
188191
line_cells = char_cells;

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)