Skip to content

Commit 5a24b9a

Browse files
committed
Ensure that words never contain RTL/LTR char mixtures
Previously, only white space was used to determine word breaks. This could lead to a mixture of RTL and LTR characters in the same word string, which in turn made it impossible to produce satisfying extractions of mixed RTL / LTR text. This change ensures that every word string contains either only RTL characters or none at all. Additional standard word delimiters: 0x202A: LEFT-TO-RIGHT EMBEDDING, 0x202B: RIGHT-TO-LEFT EMBEDDING, 0x202C: POP DIRECTIONAL FORMATTING, 0x202D: LEFT-TO-RIGHT OVERRIDE, 0x202E: RIGHT-TO-LEFT OVERRIDE. A word break is generated at every occurrence of any of these characters. In addition, a break is made between two consecutive characters whenever one of them is an RTL character and the other is not.
1 parent a2b4ba3 commit 5a24b9a

File tree

5 files changed

+52
-8
lines changed

5 files changed

+52
-8
lines changed

src/__init__.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12800,6 +12800,7 @@ def extractWORDS(self, delimiters=None):
1280012800
if g_use_extra:
1280112801
return extra.extractWORDS(self.this, delimiters)
1280212802
buflen = 0
12803+
last_char_rtl = 0
1280312804
block_n = -1
1280412805
wbbox = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY) # word bbox
1280512806
this_tpage = self.this
@@ -12825,16 +12826,19 @@ def extractWORDS(self, delimiters=None):
1282512826
):
1282612827
continue
1282712828
word_delimiter = JM_is_word_delimiter(ch.m_internal.c, delimiters)
12828-
if word_delimiter:
12829-
if buflen == 0:
12829+
this_char_rtl = JM_is_rtl_char(ch.m_internal.c)
12830+
if word_delimiter or this_char_rtl != last_char_rtl:
12831+
if buflen == 0 and word_delimiter:
1283012832
continue # skip delimiters at line start
1283112833
if not mupdf.fz_is_empty_rect(wbbox):
1283212834
word_n, wbbox = JM_append_word(lines, buff, wbbox, block_n, line_n, word_n)
1283312835
mupdf.fz_clear_buffer(buff)
1283412836
buflen = 0 # reset char counter
12835-
continue
12837+
if word_delimiter:
12838+
continue
1283612839
# append one unicode character to the word
1283712840
JM_append_rune(buff, ch.m_internal.c)
12841+
last_char_rtl = this_char_rtl
1283812842
buflen += 1
1283912843
# enlarge word bbox
1284012844
wbbox = mupdf.fz_union_rect(wbbox, JM_char_bbox(line, ch))
@@ -15371,7 +15375,13 @@ def JM_font_descender(font):
1537115375
def JM_is_word_delimiter(ch, delimiters):
1537215376
"""Check if ch is an extra word delimiting character.
1537315377
"""
15374-
if ch <= 32 or ch == 160: # any whitespace?
15378+
if (0
15379+
or ch <= 32
15380+
or ch == 160
15381+
or 0x202a <= ch <= 0x202e
15382+
):
15383+
# covers any whitespace plus unicodes that switch between
15384+
# right-to-left and left-to-right languages
1537515385
return True
1537615386
if not delimiters: # no extra delimiters provided
1537715387
return False
@@ -15380,6 +15390,12 @@ def JM_is_word_delimiter(ch, delimiters):
1538015390
if d == char:
1538115391
return True
1538215392
return False
15393+
15394+
15395+
def JM_is_rtl_char(ch):
    """Check whether a code point lies in the right-to-left script range.

    Args:
        ch: Unicode code point as an integer.

    Returns:
        True if ``0x590 <= ch <= 0x8FF``, otherwise False.
    """
    # U+0590 .. U+08FF spans the Hebrew block through Arabic Extended-A.
    # U+0900 is the first code point of the Devanagari block, which is
    # written left-to-right, so the upper bound must stop at 0x8FF.
    return 0x590 <= ch <= 0x8FF
1538315399

1538415400

1538515401
def JM_font_name(font):

src/extra.i

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1676,6 +1676,10 @@ static int
16761676
JM_is_word_delimiter(int c, PyObject *delimiters)
16771677
{
16781678
if (c <= 32 || c == 160) return 1; // a standard delimiter
1679+
if (0x202a <= c && c <= 0x202e)
1680+
{
1681+
return 1; // change between writing directions
1682+
}
16791683

16801684
// extra delimiters must be a non-empty sequence
16811685
if (!delimiters || PyObject_Not(delimiters) || !PySequence_Check(delimiters)) {
@@ -1707,6 +1711,12 @@ JM_is_word_delimiter(int c, PyObject *delimiters)
17071711
return 0;
17081712
}
17091713

1714+
// Check whether code point c lies in the right-to-left script range.
// Returns 1 for U+0590 .. U+08FF (Hebrew block through Arabic Extended-A),
// otherwise 0. U+0900 is the first code point of the Devanagari block,
// which is written left-to-right, so the upper bound stops at 0x8FF.
static int
JM_is_rtl_char(int c)
{
    return (c >= 0x590 && c <= 0x8FF) ? 1 : 0;
}
17101720

17111721
static const char* JM_font_name(fz_font* font)
17121722
{
@@ -3223,6 +3233,7 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)
32233233
int word_n = 0; // word counter per line
32243234
mupdf::fz_clear_buffer(buff); // reset word buffer
32253235
size_t buflen = 0; // reset char counter
3236+
int last_char_rtl = 0; // was last character RTL?
32263237
for (mupdf::FzStextChar ch: line)
32273238
{
32283239
mupdf::FzRect cbbox = JM_char_bbox(line, ch);
@@ -3232,9 +3243,10 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)
32323243
}
32333244

32343245
int word_delimiter = JM_is_word_delimiter(ch.m_internal->c, delimiters);
3235-
if (word_delimiter)
3246+
int this_char_rtl = JM_is_rtl_char(ch.m_internal->c);
3247+
if (word_delimiter || this_char_rtl != last_char_rtl)
32363248
{
3237-
if (buflen == 0)
3249+
if (buflen == 0 && word_delimiter)
32383250
{
32393251
continue; // skip delimiters at line start
32403252
}
@@ -3251,10 +3263,11 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)
32513263
}
32523264
mupdf::fz_clear_buffer(buff);
32533265
buflen = 0; // reset char counter
3254-
continue;
3266+
if (word_delimiter) continue;
32553267
}
32563268
// append one unicode character to the word
32573269
JM_append_rune(buff.m_internal, ch.m_internal->c);
3270+
last_char_rtl = this_char_rtl;
32583271
buflen++;
32593272
// enlarge word bbox
32603273
wbbox = fz_union_rect(wbbox, JM_char_bbox(line, ch));

src/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,7 @@ def line_text(clip, line):
632632
# convert distance to previous word to multiple spaces
633633
dist = max(
634634
int(round((r.x0 - x1) / r.width * len(t))),
635-
0 if x1 == clip.x0 else 1,
635+
0 if (x1 == clip.x0 or r.x0 <= x1) else 1,
636636
) # number of space characters
637637

638638
ltext += " " * dist + t # append word string

tests/resources/test-E+A.pdf

153 KB
Binary file not shown.

tests/test-rtl.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import pymupdf
2+
3+
4+
def test_rtl():
    """Verify that no extracted word mixes RTL and non-RTL characters.

    Opens the first page of ``resources/test-E+A.pdf`` and checks that
    every word string consists either only of characters from the RTL
    range or of none at all.
    """
    import os

    # Resolve the resource relative to this file so the test does not
    # depend on the current working directory.
    filename = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "resources",
        "test-E+A.pdf",
    )

    # set of all characters regarded as RTL: code points 0x590 .. 0x900
    rtl_chars = {chr(i) for i in range(0x590, 0x901)}

    # The context manager closes the document even if extraction fails.
    with pymupdf.open(filename) as doc:
        words = doc[0].get_text("words")

    for w in words:
        text = w[4]  # item 4 of a "words" entry is checked as the word string
        # every word string must either ONLY contain RTL chars ...
        only_rtl = rtl_chars.issuperset(text)
        # ... or NONE.
        no_rtl = rtl_chars.isdisjoint(text)
        assert only_rtl or no_rtl, f"word mixes RTL and LTR characters: {text!r}"

0 commit comments

Comments
 (0)