Ensure that words never contain RTL/LTR char mixtures

JorjMcKie · JorjMcKie · commit 5a24b9ac33c5 · 2024-11-01T11:22:02.000-04:00
Previously, we only used whitespace to perform word breaks. This could lead to mixtures of RTL and LTR characters in the same word string.
This in turn made it impossible to produce satisfactory extractions of mixed RTL / LTR text.
This change ensures that every word string contains either only RTL characters or no RTL characters at all.

Additional standard word delimiters:

0x202A: LEFT-TO-RIGHT EMBEDDING
0x202B: RIGHT-TO-LEFT EMBEDDING
0x202C: POP DIRECTIONAL FORMATTING
0x202D: LEFT-TO-RIGHT OVERRIDE
0x202E: RIGHT-TO-LEFT OVERRIDE

Word breaks will be generated at the occurrence of any of these characters.
In addition, a word break is made between two consecutive characters whenever one of them is an RTL character and the other is not.
diff --git a/src/__init__.py b/src/__init__.py
@@ -12800,6 +12800,7 @@ def extractWORDS(self, delimiters=None):
         if g_use_extra:
             return extra.extractWORDS(self.this, delimiters)
         buflen = 0
+        last_char_rtl = 0
         block_n = -1
         wbbox = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY)  # word bbox
         this_tpage = self.this
@@ -12825,16 +12826,19 @@ def extractWORDS(self, delimiters=None):
                             ):
                         continue
                     word_delimiter = JM_is_word_delimiter(ch.m_internal.c, delimiters)
-                    if word_delimiter:
-                        if buflen == 0:
+                    this_char_rtl = JM_is_rtl_char(ch.m_internal.c)
+                    if word_delimiter or this_char_rtl != last_char_rtl:
+                        if buflen == 0 and word_delimiter:
                             continue    # skip delimiters at line start
                         if not mupdf.fz_is_empty_rect(wbbox):
                             word_n, wbbox = JM_append_word(lines, buff, wbbox, block_n, line_n, word_n)
                         mupdf.fz_clear_buffer(buff)
                         buflen = 0  # reset char counter
-                        continue
+                        if word_delimiter:
+                            continue
                     # append one unicode character to the word
                     JM_append_rune(buff, ch.m_internal.c)
+                    last_char_rtl = this_char_rtl
                     buflen += 1
                     # enlarge word bbox
                     wbbox = mupdf.fz_union_rect(wbbox, JM_char_bbox(line, ch))
@@ -15371,7 +15375,13 @@ def JM_font_descender(font):
 def JM_is_word_delimiter(ch, delimiters):
     """Check if ch is an extra word delimiting character.
     """
-    if ch <= 32 or ch == 160:  # any whitespace?
+    if (0
+        or ch <= 32
+        or ch == 160
+        or 0x202a <= ch <= 0x202e
+    ):
+        # covers any whitespace plus unicodes that switch between
+        # right-to-left and left-to-right languages
         return True
     if not delimiters:  # no extra delimiters provided
         return False
@@ -15380,6 +15390,12 @@ def JM_is_word_delimiter(ch, delimiters):
         if d == char:
             return True
     return False
+    
+
+def JM_is_rtl_char(ch):
+    if ch < 0x590 or ch > 0x900:
+        return False;
+    return True
 
 
 def JM_font_name(font):
diff --git a/src/extra.i b/src/extra.i
@@ -1676,6 +1676,10 @@ static int
 JM_is_word_delimiter(int c, PyObject *delimiters)
 {
     if (c <= 32 || c == 160) return 1;  // a standard delimiter
+    if (0x202a <= c && c <= 0x202e)
+    {
+        return 1; // change between writing directions
+    }
 
     // extra delimiters must be a non-empty sequence
     if (!delimiters || PyObject_Not(delimiters) || !PySequence_Check(delimiters)) {  
@@ -1707,6 +1711,12 @@ JM_is_word_delimiter(int c, PyObject *delimiters)
     return 0;
 }
 
+static int 
+JM_is_rtl_char(int c)
+{
+    if (c < 0x590 || c > 0x900) return 0;
+    return 1;
+}
 
 static const char* JM_font_name(fz_font* font)
 {
@@ -3223,6 +3233,7 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)
             int word_n = 0;                 // word counter per line
             mupdf::fz_clear_buffer(buff);   // reset word buffer
             size_t buflen = 0;              // reset char counter
+            int last_char_rtl = 0;          // was last character RTL?
             for (mupdf::FzStextChar ch: line)
             {
                 mupdf::FzRect cbbox = JM_char_bbox(line, ch);
@@ -3232,9 +3243,10 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)
                 }
 
                 int word_delimiter = JM_is_word_delimiter(ch.m_internal->c, delimiters);
-                if (word_delimiter)
+                int this_char_rtl = JM_is_rtl_char(ch.m_internal->c);
+                if (word_delimiter || this_char_rtl != last_char_rtl)
                 {
-                    if (buflen == 0)
+                    if (buflen == 0 && word_delimiter)
                     {
                         continue;  // skip delimiters at line start
                     }
@@ -3251,10 +3263,11 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)
                     }
                     mupdf::fz_clear_buffer(buff);
                     buflen = 0;  // reset char counter
-                    continue;
+                    if (word_delimiter) continue;
                 }
                 // append one unicode character to the word
                 JM_append_rune(buff.m_internal, ch.m_internal->c);
+                last_char_rtl = this_char_rtl;
                 buflen++;
                 // enlarge word bbox
                 wbbox = fz_union_rect(wbbox, JM_char_bbox(line, ch));
diff --git a/src/utils.py b/src/utils.py
@@ -632,7 +632,7 @@ def line_text(clip, line):
             # convert distance to previous word to multiple spaces
             dist = max(
                 int(round((r.x0 - x1) / r.width * len(t))),
-                0 if x1 == clip.x0 else 1,
+                0 if (x1 == clip.x0 or r.x0 <= x1) else 1,
             )  # number of space characters
 
             ltext += " " * dist + t  # append word string
diff --git a/tests/resources/test-E+A.pdf b/tests/resources/test-E+A.pdf
diff --git a/tests/test-rtl.py b/tests/test-rtl.py
@@ -0,0 +1,15 @@
+import pymupdf
+
+
+def test_rtl():
+    doc = pymupdf.open("resources/test-E+A.pdf")
+    page = doc[0]
+    # set of all RTL characters
+    rtl_chars = set([chr(i) for i in range(0x590, 0x901)])
+
+    for w in page.get_text("words"):
+        # every word string must either ONLY contain RTL chars
+        cond1 = rtl_chars.issuperset(w[4])
+        # ... or NONE.
+        cond2 = rtl_chars.intersection(w[4]) == set()
+        assert cond1 or cond2

Original file line number	Diff line number	Diff line change
`@@ -1676,6 +1676,10 @@ static int`
`1676`	`1676`	`JM_is_word_delimiter(int c, PyObject *delimiters)`
`1677`	`1677`	`{`
`1678`	`1678`	`if (c <= 32 \|\| c == 160) return 1; // a standard delimiter`
	`1679`	`+ if (0x202a <= c && c <= 0x202e)`
	`1680`	`+ {`
	`1681`	`+ return 1; // change between writing directions`
	`1682`	`+ }`
`1679`	`1683`
`1680`	`1684`	`// extra delimiters must be a non-empty sequence`
`1681`	`1685`	`if (!delimiters \|\| PyObject_Not(delimiters) \|\| !PySequence_Check(delimiters)) {`
`@@ -1707,6 +1711,12 @@ JM_is_word_delimiter(int c, PyObject *delimiters)`
`1707`	`1711`	`return 0;`
`1708`	`1712`	`}`
`1709`	`1713`
	`1714`	`+static int`
	`1715`	`+JM_is_rtl_char(int c)`
	`1716`	`+{`
	`1717`	`+ if (c < 0x590 \|\| c > 0x900) return 0;`
	`1718`	`+ return 1;`
	`1719`	`+}`
`1710`	`1720`
`1711`	`1721`	`static const char* JM_font_name(fz_font* font)`
`1712`	`1722`	`{`
`@@ -3223,6 +3233,7 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)`
`3223`	`3233`	`int word_n = 0; // word counter per line`
`3224`	`3234`	`mupdf::fz_clear_buffer(buff); // reset word buffer`
`3225`	`3235`	`size_t buflen = 0; // reset char counter`
	`3236`	`+ int last_char_rtl = 0; // was last character RTL?`
`3226`	`3237`	`for (mupdf::FzStextChar ch: line)`
`3227`	`3238`	`{`
`3228`	`3239`	`mupdf::FzRect cbbox = JM_char_bbox(line, ch);`
`@@ -3232,9 +3243,10 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)`
`3232`	`3243`	`}`
`3233`	`3244`
`3234`	`3245`	`int word_delimiter = JM_is_word_delimiter(ch.m_internal->c, delimiters);`
`3235`		`- if (word_delimiter)`
	`3246`	`+ int this_char_rtl = JM_is_rtl_char(ch.m_internal->c);`
	`3247`	`+ if (word_delimiter \|\| this_char_rtl != last_char_rtl)`
`3236`	`3248`	`{`
`3237`		`- if (buflen == 0)`
	`3249`	`+ if (buflen == 0 && word_delimiter)`
`3238`	`3250`	`{`
`3239`	`3251`	`continue; // skip delimiters at line start`
`3240`	`3252`	`}`
`@@ -3251,10 +3263,11 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)`
`3251`	`3263`	`}`
`3252`	`3264`	`mupdf::fz_clear_buffer(buff);`
`3253`	`3265`	`buflen = 0; // reset char counter`
`3254`		`- continue;`
	`3266`	`+ if (word_delimiter) continue;`
`3255`	`3267`	`}`
`3256`	`3268`	`// append one unicode character to the word`
`3257`	`3269`	`JM_append_rune(buff.m_internal, ch.m_internal->c);`
	`3270`	`+ last_char_rtl = this_char_rtl;`
`3258`	`3271`	`buflen++;`
`3259`	`3272`	`// enlarge word bbox`
`3260`	`3273`	`wbbox = fz_union_rect(wbbox, JM_char_bbox(line, ch));`