Skip to content

Commit 25b1e64

Browse files
feat: add support for RtL (#94)
Signed-off-by: Peter Staar <[email protected]>
1 parent b634c11 commit 25b1e64

File tree

146 files changed

+486602
-241218
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

146 files changed

+486602
-241218
lines changed

docling_parse/document.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ class PdfCell(PdfColoredElement):
166166
font_name: str
167167

168168
widget: bool
169+
left_to_right: bool
169170

170171
def to_bottom_left_origin(self, page_height: float):
171172
self.rect = self.rect.to_bottom_left_origin(page_height=page_height)

docling_parse/pdf_parser.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ def _to_cells(self, cells: dict) -> List[PdfCell]:
191191
font_key=row[header.index(f"font-key")],
192192
font_name=row[header.index(f"font-name")],
193193
widget=row[header.index(f"widget")],
194+
left_to_right=row[header.index(f"left_to_right")],
194195
ordering=ind,
195196
rendering_mode="",
196197
)

src/v2.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
#include <v2/pdf_resources/page_images.h>
7777

7878
#include <v2/pdf_sanitator.h>
79+
#include <v2/pdf_sanitators/constants.h>
7980
#include <v2/pdf_sanitators/lines.h>
8081
#include <v2/pdf_sanitators/cells.h>
8182
#include <v2/pdf_sanitators/page_dimension.h>

src/v2/pdf_resources/page_cell.h

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,17 @@ namespace pdflib
2727

2828
bool is_adjacent_to(pdf_resource<PAGE_CELL>& other, double delta);
2929

30+
bool has_same_reading_orientation(pdf_resource<PAGE_CELL>& other);
31+
3032
bool merge_with(pdf_resource<PAGE_CELL>& other, double delta);
3133

3234
public:
3335

3436
static std::vector<std::string> header;
3537

3638
bool active;
37-
39+
bool left_to_right;
40+
3841
double x0;
3942
double y0;
4043
double x1;
@@ -81,7 +84,8 @@ namespace pdflib
8184
};
8285

8386
pdf_resource<PAGE_CELL>::pdf_resource():
84-
active(true)
87+
active(true),
88+
left_to_right(true)
8589
{}
8690

8791
pdf_resource<PAGE_CELL>::~pdf_resource()
@@ -128,7 +132,8 @@ namespace pdflib
128132
//"block-count",
129133
//"instr-count",
130134

131-
"widget"
135+
"widget",
136+
"left_to_right"
132137
};
133138

134139
void pdf_resource<PAGE_CELL>::rotate(int angle, std::pair<double, double> delta)
@@ -180,7 +185,8 @@ namespace pdflib
180185
cell.push_back(font_key); // 17
181186
cell.push_back(font_name); // 18
182187

183-
cell.push_back(widget); //19
188+
cell.push_back(widget); // 19
189+
cell.push_back(left_to_right); // 20
184190
}
185191
assert(cell.size()==header.size());
186192

@@ -219,6 +225,7 @@ namespace pdflib
219225
font_name = data.at(18).get<std::string>();
220226

221227
widget = data.at(19).get<bool>();
228+
left_to_right = data.at(20).get<bool>();
222229

223230
return true;
224231
}
@@ -255,26 +262,51 @@ namespace pdflib
255262

256263
// Returns true when `other` starts where this cell ends, within a tolerance.
//
// Two point-to-point distances are evaluated:
//   * from this cell's (r_x1, r_y1) to other's (r_x0, r_y0)
//   * from this cell's (r_x2, r_y2) to other's (r_x3, r_y3)
// Both must be strictly smaller than `eps` for the cells to count as adjacent.
//
// other : candidate neighbouring cell (only its r_* coordinates are read)
// eps   : maximum allowed distance for each of the two point pairs
bool pdf_resource<PAGE_CELL>::is_adjacent_to(pdf_resource<PAGE_CELL>& other, double eps)
{
  const auto dx_10 = r_x1-other.r_x0;
  const auto dy_10 = r_y1-other.r_y0;

  const auto dx_23 = r_x2-other.r_x3;
  const auto dy_23 = r_y2-other.r_y3;

  double gap_10 = std::sqrt(dx_10*dx_10 + dy_10*dy_10);
  double gap_23 = std::sqrt(dx_23*dx_23 + dy_23*dy_23);

  const bool first_pair_close  = (gap_10<eps);
  const bool second_pair_close = (gap_23<eps);

  return (first_pair_close and second_pair_close);
}
268270

271+
// Tells whether this cell and `other` are compatible in reading direction.
//
// The answer is true when
//   * either cell's text is reported as punctuation/space by
//     utils::string::is_punctuation_or_space (such cells are accepted
//     next to a cell of any direction), or
//   * both cells carry the same `left_to_right` flag.
//
// other : the cell to compare against (its `text` and `left_to_right` are read)
bool pdf_resource<PAGE_CELL>::has_same_reading_orientation(pdf_resource<PAGE_CELL>& other)
{
  // Both texts are classified up front, exactly once each.
  const bool self_is_neutral  = utils::string::is_punctuation_or_space(text);
  const bool other_is_neutral = utils::string::is_punctuation_or_space(other.text);

  if(self_is_neutral or other_is_neutral)
    {
      return true;
    }

  return (left_to_right==other.left_to_right);
}
280+
269281
bool pdf_resource<PAGE_CELL>::merge_with(pdf_resource<PAGE_CELL>& other, double delta)
270282
{
283+
if(not has_same_reading_orientation(other))
284+
{
285+
LOG_S(ERROR) << "inconsistent merging of cells!";
286+
}
287+
271288
double d0 = std::sqrt((r_x1-other.r_x0)*(r_x1-other.r_x0) + (r_y1-other.r_y0)*(r_y1-other.r_y0));
272289

273-
if(delta<d0)
290+
if((not left_to_right) or (not other.left_to_right))
291+
{
292+
if(delta<d0)
293+
{
294+
text = " " + text;
295+
}
296+
text = other.text + text;
297+
298+
left_to_right = false;
299+
}
300+
else
274301
{
275-
text += " ";
276-
}
277-
text += other.text;
302+
if(delta<d0)
303+
{
304+
text += " ";
305+
}
306+
text += other.text;
307+
308+
left_to_right = true;
309+
}
278310

279311
r_x1 = other.r_x1;
280312
r_y1 = other.r_y1;

src/v2/pdf_sanitators/cells.h

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ namespace pdflib
5050

5151
private:
5252

53+
/*
5354
// FIXME: we might at some point move this into a file into the resources ...
5455
const static inline std::vector<std::pair<std::string, std::string> > replacements = {
5556
{R"(\f_f_i)", "ffi"},
@@ -106,7 +107,7 @@ namespace pdflib
106107
107108
{"\u2212", "-"},
108109
};
109-
110+
*/
110111
};
111112

112113
pdf_sanitator<PAGE_CELLS>::pdf_sanitator()
@@ -142,7 +143,12 @@ namespace pdflib
142143
utils::values::distance(cells[i].r_x2, cells[i].r_y2, cells[j].r_x2, cells[j].r_y2)<eps and
143144
utils::values::distance(cells[i].r_x3, cells[i].r_y3, cells[j].r_x3, cells[j].r_y3)<eps)
144145
{
145-
LOG_S(INFO) << "removing: " << cells[j].text << "(" << cells[i].r_x0 << ", " << cells[i].r_y0 << ") ";
146+
LOG_S(WARNING) << "removing duplicate char with text: '" << cells[j].text << "' "
147+
<< "with r_0: (" << cells[i].r_x0 << ", " << cells[i].r_y0 << ") "
148+
<< "with r_2: (" << cells[i].r_x2 << ", " << cells[i].r_y2 << ") "
149+
<< "with r'_0: (" << cells[j].r_x0 << ", " << cells[j].r_y0 << ") "
150+
<< "with r'_2: (" << cells[j].r_x2 << ", " << cells[j].r_y2 << ") ";
151+
146152

147153
cells[j].active = false;
148154
erased_cell = true;
@@ -174,11 +180,33 @@ namespace pdflib
174180
{
175181
std::string& text = cells.at(i).text;
176182

177-
for(const std::pair<std::string, std::string>& pair:replacements)
183+
for(const std::pair<std::string, std::string>& pair:text_constants::replacements)
178184
{
179185
utils::string::replace(text, pair.first, pair.second);
180186
}
181187
}
188+
189+
{
190+
std::regex pattern(R"(^\/([A-Za-z])_([A-Za-z])(_([A-Za-z]))?$)");
191+
192+
for(int i=0; i<cells.size(); i++)
193+
{
194+
std::string text = cells.at(i).text;
195+
196+
std::smatch match;
197+
if(std::regex_match(text, match, pattern))
198+
{
199+
std::string replacement = match[1].str() + match[2].str();
200+
if(match[3].matched)
201+
{
202+
replacement += match[4].str();
203+
}
204+
205+
LOG_S(WARNING) << "replacing `" << text << "` with `" << replacement << "`";
206+
cells.at(i).text = replacement;
207+
}
208+
}
209+
}
182210
}
183211

184212
void pdf_sanitator<PAGE_CELLS>::sanitize_bbox(pdf_resource<PAGE_CELLS>& cells,
@@ -356,6 +384,11 @@ namespace pdflib
356384
continue;
357385
}
358386

387+
if(not cells[i].has_same_reading_orientation(cells[j]))
388+
{
389+
continue;
390+
}
391+
359392
double delta_0 = cells[i].average_char_width()*space_width_factor_for_merge;
360393
double delta_1 = cells[i].average_char_width()*space_width_factor_for_merge_with_space;
361394

src/v2/pdf_sanitators/constants.h

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
//-*-C++-*-

#ifndef PDF_CELLS_CONSTANTS_H
#define PDF_CELLS_CONSTANTS_H

// Make the header self-contained: it names std::string, std::pair and
// std::vector and must not depend on what an including file pulled in before.
#include <string>
#include <utility>
#include <vector>

namespace pdflib
{

  // Shared text-normalisation constants for the PDF cell sanitizers.
  class text_constants
  {

  public:

    // Ordered list of (search, replacement) pairs.
    //
    // The order of the entries is part of the data: within each ligature
    // group the three-letter keys (f_f_i, f_f_l) come before the two-letter
    // keys (f_i, f_l, f_f) that are substrings of them, and the prefixed
    // spellings (backslash, slash) come before the bare ones.
    // NOTE(review): presumably consumers apply the pairs front-to-back as
    // substring replacements, in which case this order must be preserved.
    //
    // NOTE(review): the double quotation marks U+201C..U+201F are mapped to a
    // single quote (') rather than to a double quote -- confirm this is
    // intended.
    //
    // FIXME: we might at some point move this into a file into the resources ...
    const static inline std::vector<std::pair<std::string, std::string> > replacements = {

      // ligature glyph names, backslash-prefixed
      {R"(\f_f_i)", "ffi"},
      {R"(\f_f_l)", "ffl"},
      {R"(\f_i)", "fi"},
      {R"(\f_l)", "fl"},
      {R"(\f_f)", "ff"},

      // ligature glyph names, slash-prefixed
      {R"(/f_f_i)", "ffi"},
      {R"(/f_f_l)", "ffl"},
      {R"(/f_i)", "fi"},
      {R"(/f_l)", "fl"},
      {R"(/f_f)", "ff"},

      // ligature glyph names, no prefix
      {R"(f_f_i)", "ffi"},
      {R"(f_f_l)", "ffl"},
      {R"(f_i)", "fi"},
      {R"(f_l)", "fl"},
      {R"(f_f)", "ff"},

      // Unicode ligature code points U+FB00..U+FB04
      {"\uFB00", "ff"},
      {"\uFB01", "fi"},
      {"\uFB02", "fl"},
      {"\uFB03", "ffi"},
      {"\uFB04", "ffl"},

      // U+2000..U+200A: replaced by a plain ASCII space
      {"\u2000", " "},
      {"\u2001", " "},
      {"\u2002", " "},
      {"\u2003", " "},
      {"\u2004", " "},
      {"\u2005", " "},
      {"\u2006", " "},
      {"\u2007", " "},
      {"\u2008", " "},
      {"\u2009", " "},
      {"\u200A", " "},

      // U+200B..U+200F: removed entirely (replacement is the empty string)
      {"\u200B", ""},
      {"\u200C", ""},
      {"\u200D", ""},
      {"\u200E", ""},
      {"\u200F", ""},

      // U+2010..U+2015: replaced by an ASCII hyphen-minus
      {"\u2010", "-"},
      {"\u2011", "-"},
      {"\u2012", "-"},
      {"\u2013", "-"},
      {"\u2014", "-"},
      {"\u2015", "-"},

      // U+2018..U+201F: replaced by ' (U+201A by a comma)
      {"\u2018", "'"},
      {"\u2019", "'"},
      {"\u201A", ","},
      {"\u201B", "'"},
      {"\u201C", "'"},
      {"\u201D", "'"},
      {"\u201E", "'"},
      {"\u201F", "'"},

      // U+2212: replaced by an ASCII hyphen-minus
      {"\u2212", "-"},
    };

  };

}

#endif

src/v2/pdf_states/text.h

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -473,14 +473,28 @@ namespace pdflib
473473
{
474474
LOG_S(INFO) << __FUNCTION__ << " with text='" << text << "', width=" << width;
475475

476+
bool left_to_right = (not utils::string::is_right_to_left(text));
477+
476478
double font_descent = font.get_descent();
477479
double font_ascent = font.get_ascent();
478480
double font_capheight = font.get_capheight();
479481

480-
LOG_S(INFO) << "font_descent: " << font_descent << ", "
481-
<< "font_ascent: " << font_ascent << ", "
482-
<< "font_capheight: " << font_capheight << ", "
483-
<< "capheight/ascent: " << font_capheight/font_ascent << "";
482+
if(left_to_right)
483+
{
484+
LOG_S(INFO) << "font_descent: " << font_descent << ", "
485+
<< "font_ascent: " << font_ascent << ", "
486+
<< "font_capheight: " << font_capheight << ", "
487+
<< "capheight/ascent: " << font_capheight/font_ascent << ", "
488+
<< "left_to_right: " << left_to_right << ", text: " << text;
489+
}
490+
else
491+
{
492+
LOG_S(WARNING) << "font_descent: " << font_descent << ", "
493+
<< "font_ascent: " << font_ascent << ", "
494+
<< "font_capheight: " << font_capheight << ", "
495+
<< "capheight/ascent: " << font_capheight/font_ascent << ", "
496+
<< "left_to_right: " << left_to_right << ", text: " << text;
497+
}
484498

485499
double space_width=0;
486500
{
@@ -498,6 +512,9 @@ namespace pdflib
498512
{
499513
pdf_resource<PAGE_CELL> cell;
500514

515+
cell.active = true;
516+
cell.left_to_right = left_to_right;
517+
501518
cell.widget = false;
502519

503520
double ratio = 1.0;
@@ -530,8 +547,6 @@ namespace pdflib
530547
cell.rendering_mode = rendering_mode;
531548

532549
cell.space_width = space_width;
533-
//cell.chars = {};//chars;
534-
//cell.widths = {};//widths;
535550

536551
cell.enc_name = font.get_encoding_name();
537552

0 commit comments

Comments
 (0)