Skip to content

Commit b16f047

Browse files
committed
Add support for ZWJ
1 parent 52bf877 commit b16f047

File tree

8 files changed

+143
-6
lines changed

8 files changed

+143
-6
lines changed

MANIFEST.in

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1-
include arabic_reshaper/default-config.ini
1+
include arabic_reshaper/__init__.py
2+
include arabic_reshaper/__version__.py
23
include arabic_reshaper/arabic_reshaper.py
3-
include README
4+
include arabic_reshaper/default-config.ini
5+
include arabic_reshaper/letters.py
6+
include arabic_reshaper/ligatures.py
7+
include README.md

arabic_reshaper/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
11
from .arabic_reshaper import reshape, default_reshaper, ArabicReshaper
2+
3+
import os
4+
5+
exec(open(os.path.join(os.path.dirname(__file__), '__version__.py')).read())

arabic_reshaper/__version__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__version__ = '2.0.9'

arabic_reshaper/arabic_reshaper.py

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,9 +166,13 @@ def reshape(self, text):
166166

167167
delete_harakat = self.configuration.getboolean('delete_harakat')
168168
delete_tatweel = self.configuration.getboolean('delete_tatweel')
169+
support_zwj = self.configuration.getboolean('support_zwj')
169170
positions_harakat = {}
170171

171-
for letter in text:
172+
arabic_word_start = -1
173+
zwjs = []
174+
175+
for i, letter in enumerate(text):
172176
if HARAKAT_RE.match(letter):
173177
if not delete_harakat:
174178
position = len(output) - 1
@@ -177,13 +181,51 @@ def reshape(self, text):
177181
positions_harakat[position].append(letter)
178182
elif letter == TATWEEL and delete_tatweel:
179183
pass
184+
elif letter == ZWJ and support_zwj:
185+
zwjs.append(i)
186+
187+
if arabic_word_start != -1:
188+
# Handle three consecutive ZWJs or more
189+
if (
190+
len(zwjs) > 2 and
191+
zwjs[-2] == i - 1 and
192+
zwjs[-3] == i - 2
193+
):
194+
arabic_word_start = -1
195+
# Handle when previous letter is not ZWJ
196+
elif (
197+
output and
198+
len(zwjs) == 1 or (len(zwjs) > 1 and zwjs[-2] != i - 1)
199+
):
200+
previous_letter = output[-1]
201+
if connects_with_letter_after(previous_letter[LETTER]):
202+
if previous_letter[FORM] == ISOLATED:
203+
output[-1] = (
204+
previous_letter[LETTER],
205+
INITIAL
206+
)
207+
else:
208+
output[-1] = (
209+
previous_letter[LETTER],
210+
MEDIAL
211+
)
180212
elif letter not in LETTERS:
213+
arabic_word_start = -1
181214
output.append((letter, NOT_SUPPORTED))
182-
elif not output:
215+
elif not output: # first letter
216+
arabic_word_start = i
183217
output.append((letter, ISOLATED))
184218
else:
219+
if arabic_word_start == -1:
220+
arabic_word_start = i
185221
previous_letter = output[-1]
186-
if previous_letter[FORM] == NOT_SUPPORTED:
222+
if (
223+
arabic_word_start != i and
224+
zwjs and
225+
connects_with_letter_before(letter)
226+
):
227+
output.append((letter, FINAL))
228+
elif previous_letter[FORM] == NOT_SUPPORTED:
187229
output.append((letter, ISOLATED))
188230
elif not connects_with_letter_before(letter):
189231
output.append((letter, ISOLATED))
@@ -211,6 +253,10 @@ def reshape(self, text):
211253
)
212254
output.append((letter, FINAL))
213255

256+
# clear ZWJs
257+
if zwjs and letter != ZWJ:
258+
zwjs = []
259+
214260
if self.configuration.getboolean('support_ligatures'):
215261
# Clean text from Harakat to be able to find ligatures
216262
text = HARAKAT_RE.sub('', text)

arabic_reshaper/default-config.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ delete_harakat = yes
99
# Whether to delete the Tatweel (U+0640) before reshaping or not.
1010
delete_tatweel = no
1111

12+
# Whether to support ZWJ (U+200D) or not.
13+
support_zwj = yes
14+
1215
# Whether to use ligatures or not.
1316
# Serves as a shortcut to disable all ligatures.
1417
support_ligatures = yes

arabic_reshaper/letters.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
FINAL = 3
2222

2323
TATWEEL = '\u0640'
24+
ZWJ = '\u200D'
2425

2526
LETTERS = {
2627
# ARABIC LETTER HAMZA

arabic_reshaper/tests/test_002_reshaping.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
# -*- coding: utf-8 -*-
22

33
from __future__ import unicode_literals
4+
from __future__ import print_function
45

56
import unittest
7+
import sys
68
import arabic_reshaper
9+
import arabic_reshaper.letters as letters
710

811

912
def _reshaping_test(test):
@@ -13,6 +16,7 @@ def t(): test.assertEqual(case[1], test.reshaper.reshape(case[0]))
1316
with test.subTest(i=i, case=case[0]):
1417
t()
1518
else:
19+
print('running test case %d' % i, file=sys.stderr)
1620
t()
1721

1822

@@ -42,6 +46,70 @@ def test_reshaping(self):
4246
_reshaping_test(self)
4347

4448

49+
class TestZWJReshaping(unittest.TestCase):
50+
def setUp(self):
51+
self.reshaper = arabic_reshaper.default_reshaper
52+
53+
BEH = 'ب'
54+
BEH_ISOLATED = letters.LETTERS[BEH][letters.ISOLATED]
55+
BEH_INITIAL = letters.LETTERS[BEH][letters.INITIAL]
56+
BEH_MEDIAL = letters.LETTERS[BEH][letters.MEDIAL]
57+
BEH_FINAL = letters.LETTERS[BEH][letters.FINAL]
58+
59+
ALEF = 'ا'
60+
ALEF_ISOLATED = letters.LETTERS[ALEF][letters.ISOLATED]
61+
ALEF_FINAL = letters.LETTERS[ALEF][letters.FINAL]
62+
63+
HAMZA = 'ء'
64+
HAMZA_ISOLATED = letters.LETTERS[HAMZA][letters.ISOLATED]
65+
66+
self.cases = (
67+
(
68+
BEH + HAMZA,
69+
BEH_ISOLATED + HAMZA_ISOLATED
70+
),
71+
(
72+
letters.ZWJ + BEH + HAMZA,
73+
BEH_ISOLATED + HAMZA_ISOLATED
74+
),
75+
(
76+
BEH + letters.ZWJ + HAMZA,
77+
BEH_INITIAL + HAMZA_ISOLATED
78+
),
79+
(
80+
BEH + ALEF,
81+
BEH_INITIAL + ALEF_FINAL
82+
),
83+
(
84+
BEH + letters.ZWJ + ALEF,
85+
BEH_INITIAL + ALEF_FINAL
86+
),
87+
(
88+
BEH + letters.ZWJ + ALEF + letters.ZWJ,
89+
BEH_INITIAL + ALEF_FINAL
90+
),
91+
(
92+
BEH + ALEF + BEH,
93+
BEH_INITIAL + ALEF_FINAL + BEH_ISOLATED
94+
),
95+
(
96+
BEH + letters.ZWJ + ALEF + letters.ZWJ + BEH,
97+
BEH_INITIAL + ALEF_FINAL + BEH_FINAL
98+
),
99+
(
100+
BEH + letters.ZWJ + HAMZA + BEH,
101+
BEH_INITIAL + HAMZA_ISOLATED + BEH_ISOLATED
102+
),
103+
(
104+
BEH + letters.ZWJ + HAMZA + letters.ZWJ + BEH,
105+
BEH_INITIAL + HAMZA_ISOLATED + BEH_FINAL
106+
),
107+
)
108+
109+
def test_reshaping(self):
110+
_reshaping_test(self)
111+
112+
45113
class TestReshapingWithHarakat(unittest.TestCase):
46114
def setUp(self):
47115
self.reshaper = arabic_reshaper.ArabicReshaper({

setup.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,21 @@
33

44
from setuptools import setup
55

6+
import os
7+
8+
exec(
9+
open(os.path.join(
10+
os.path.dirname(__file__),
11+
'arabic_reshaper',
12+
'__version__.py'
13+
)).read()
14+
)
15+
616
setup(
717
name="arabic_reshaper",
818
description=("Reconstruct Arabic sentences to be used in"
919
" applications that don't support Arabic"),
10-
version='2.0.8',
20+
version=__version__,
1121
platforms="ALL",
1222
license="GPL",
1323
packages=['arabic_reshaper'],

0 commit comments

Comments
 (0)