Skip to content

Commit de18986

Browse files
fix: added more updates to better font-parsing (#87)
Signed-off-by: Peter Staar <[email protected]> Signed-off-by: Peter W. J. Staar <[email protected]> Co-authored-by: Michele Dolfi <[email protected]>
1 parent 0921cb6 commit de18986

File tree

101 files changed

+3564421
-35
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

101 files changed

+3564421
-35
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ externals
33
build
44
dist
55
extlib_*/
6+
scratch_*
67

78
# Created by https://www.toptal.com/developers/gitignore/api/python,macos,emacs,cmake,virtualenv
89
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,emacs,cmake,virtualenv

docling_parse/document.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,9 @@ class PdfCell(PdfColoredElement):
154154

155155
rect: BoundingRectangle
156156

157+
rect_fontbbox: Optional[BoundingRectangle] = None
158+
rect_capheight: Optional[BoundingRectangle] = None
159+
157160
text: str
158161
orig: str
159162

@@ -358,6 +361,12 @@ def export_to_textlines(
358361
for cell in self.cells:
359362

360363
line = ""
364+
if add_location:
365+
line += f"({cell.rect.r_x0:03.02f}, {cell.rect.r_y0:03.02f}) "
366+
line += f"({cell.rect.r_x1:03.02f}, {cell.rect.r_y1:03.02f}) "
367+
line += f"({cell.rect.r_x2:03.02f}, {cell.rect.r_y2:03.02f}) "
368+
line += f"({cell.rect.r_x3:03.02f}, {cell.rect.r_y3:03.02f}) "
369+
361370
if add_fontkey:
362371
line += f"{cell.font_key} "
363372

@@ -453,8 +462,8 @@ def _draw_text_in_bounding_bbox(
453462
width, height = round(x1 - x0), round(y0 - y1)
454463

455464
if width <= 2 or height <= 2:
456-
logging.warning(f"skipping to draw text: {text}")
457-
return img # draw
465+
# logging.warning(f"skipping to draw text (width: {x1-x0}, height: {y1-y0}): {text}")
466+
return img
458467

459468
# Use the default font if no font is provided
460469
if font is None:

docling_parse/visualize.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ def visualise_py(
292292
).show()
293293

294294
lines = pdf_page.sanitized.export_to_textlines(add_fontkey=True)
295-
print("\n".join(lines))
295+
print("text-lines: \n", "\n".join(lines))
296296

297297
"""
298298
lines = pdf_page.original.export_to_textlines(add_fontkey=True)

src/v2/pdf_resources/page_font.h

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -463,18 +463,31 @@ namespace pdflib
463463

464464
std::string pdf_resource<PAGE_FONT>::get_correct_character(uint32_t c)
465465
{
466-
// sometimes, a font has differences-map and a cmap
466+
// Sometimes, a font has differences-map and a cmap
467467
// defined at the same time. So far, it seems that the
468468
// differences should take precedence over the cmap. This
469-
// is however not really clear (eg p 292)
469+
// is however not really clear (eg p 292). Notice also that
470+
// we init the cmap before we init the difference and that the
471+
// difference inherits the content of the cmap. It is a bit
472+
// messy and unclear here.
473+
474+
/*
475+
if(diff_numb_to_char.count(c)>0 and cmap_numb_to_char.count(c)>0)
476+
{
477+
LOG_S(WARNING) << "there might be some confusion here: "
478+
<< "diff["<<c<<"]: " << diff_numb_to_char.at(c) << " "
479+
<< "cmap["<<c<<"]: " << cmap_numb_to_char.at(c);
480+
}
481+
*/
482+
470483
if(diff_initialized and diff_numb_to_char.count(c)>0)
471484
{
472485
return diff_numb_to_char.at(c);
473486
}
474487
else if(cmap_initialized and cmap_numb_to_char.count(c)>0)
475488
{
476489
return cmap_numb_to_char.at(c);
477-
}
490+
}
478491
else if(bfonts.has_corresponding_font(font_name))
479492
{
480493
// check if the font-name is registered as a 'special' font, eg
@@ -878,6 +891,7 @@ namespace pdflib
878891
}
879892

880893

894+
881895
/*
882896
void pdf_resource<PAGE_FONT>::init_fontfile3()
883897
{
@@ -911,18 +925,35 @@ namespace pdflib
911925
auto buffer = qpdf_obj.getRawStreamData();
912926
913927
LOG_S(INFO) << "buffer-size: " << buffer->getSize();
914-
LOG_S(INFO) << "buffer: " << buffer->getBuffer();
928+
//LOG_S(INFO) << "buffer: " << buffer->getBuffer();
929+
930+
std::string filename = "fontfile.zip";
931+
std::ofstream outFile(filename, std::ios::binary);
932+
if (!outFile) {
933+
LOG_S(ERROR) << "opening file for writing: " << filename << std::endl;
934+
return;
935+
}
936+
937+
outFile.write(reinterpret_cast<const char*>(buffer->getBuffer()), buffer->getSize());
938+
outFile.close();
939+
940+
if (!outFile) {
941+
LOG_S(ERROR) << "Error occurred while writing to the file: " << filename << std::endl;
942+
} else {
943+
LOG_S(INFO) << "Buffer successfully written to " << filename << std::endl;
944+
}
915945
}
916946
917947
{
918-
auto buffer = qpdf_obj.getStreamData(qpdf_dl_generalized);
948+
auto buffer = qpdf_obj.getStreamData(qpdf_dl_generalized);
919949
920-
LOG_S(INFO) << "buffer-size: " << buffer->getSize();
921-
LOG_S(INFO) << "buffer: " << buffer->getBuffer();
950+
LOG_S(INFO) << "buffer-size: " << buffer->getSize();
951+
//LOG_S(INFO) << "buffer: " << buffer->getBuffer();
922952
}
923953
924-
assert(false);
954+
//assert(false);
925955
}
956+
926957
else if(utils::json::has(keys_0, desc_font))
927958
{
928959
auto qpdf_obj = qpdf_desc_font.getKey("/FontDescriptor").getKey("/FontFile3");
@@ -963,8 +994,9 @@ namespace pdflib
963994
else
964995
{
965996
LOG_S(WARNING) << "fontfile3 is not a stream ...";
966-
}
997+
}
967998
}
999+
9681000
else
9691001
{
9701002
LOG_S(WARNING) << "no fontfile3 detected ...";
@@ -1616,6 +1648,7 @@ namespace pdflib
16161648
// Create a regex object
16171649
std::regex re_01(R"(\/(.+)\.(.+))");
16181650
std::regex re_02(R"((\/)?(uni|UNI)([0-9A-Ea-e]{4}))");
1651+
std::regex re_03(R"((\/)(g|G)\d+)");
16191652

16201653
if(utils::json::has(keys, json_font))
16211654
{
@@ -1654,10 +1687,13 @@ namespace pdflib
16541687
}
16551688
else
16561689
{}
1690+
1691+
LOG_S(INFO) << name << ", in cmap: " << cmap_numb_to_char.count(numb) << ", #-names: " << name_to_descr.size() << ", type: " << subtype;
16571692

1658-
if(name_to_descr.count(name)==1 and // only for TYPE_3 fonts
1693+
if(subtype==TYPE_3 and //name_to_descr.count(name)==1 and // only for TYPE_3 fonts
16591694
cmap_numb_to_char.count(numb)==1)
16601695
{
1696+
LOG_S(WARNING) << "overloading difference from cmap";
16611697
diff_numb_to_char[numb] = cmap_numb_to_char[numb];
16621698
}
16631699

@@ -1739,6 +1775,13 @@ namespace pdflib
17391775
<< diff_numb_to_char[numb]
17401776
<< " (from " << name << ")";
17411777
}
1778+
else if(std::regex_match(name, match, re_03) and cmap_numb_to_char.count(numb)==1) // if the name is of type /g23 or /G23 and we have a match in the cmap
1779+
{
1780+
LOG_S(WARNING) << "overloading difference from cmap";
1781+
diff_numb_to_char[numb] = cmap_numb_to_char[numb];
1782+
//diff_numb_to_char[numb] = name;
1783+
//LOG_S(ERROR) << "weird differences["<<numb<<"] -> " << name;
1784+
}
17421785
else
17431786
{
17441787
diff_numb_to_char[numb] = name;

src/v2/pdf_states/text.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -663,6 +663,8 @@ namespace pdflib
663663
v += values[l];
664664

665665
std::pair<uint32_t, std::string> item(c,v);
666+
LOG_S(INFO) << item.first << ": " << item.second;
667+
666668
result.push_back(item);
667669
}
668670
}

0 commit comments

Comments
 (0)