Skip to content

Commit d0d74fc

Browse files
committed
More tests + delete_tatweel
* Add `delete_tatweel` * Add more test cases
1 parent 281a49e commit d0d74fc

File tree

7 files changed

+198
-41
lines changed

7 files changed

+198
-41
lines changed

README.md

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ in the reshaped text, you should enable this option if you are going to pass
9090
the reshaped text to `bidi.algorithm.get_display` because it will reverse the
9191
text and you'd end up with harakat applied to the next letter instead of the
9292
previous letter.
93+
* `delete_tatweel` (Default `False`): When this is set to `True`, the reshaper
94+
will delete the Tatweel character (U+0640) from the text before reshaping. This
95+
can be useful when you want to support ligatures and don't care about Tatweel
96+
getting deleted.
9397

9498
Besides the settings above, you can enable/disable supported ligatures. For a
9599
full list of supported ligatures and their default status check the file
@@ -109,10 +113,10 @@ from arabic_reshaper import ArabicReshaper
109113
configuration = {
110114
'delete_harakat': False,
111115
'support_ligatures': True,
112-
'RIAL SIGN': True, # Replace ريال with ﷼
116+
'RIAL SIGN': True, # Replace ر ي ا ل with ﷼
113117
}
114118
reshaper = ArabicReshaper(configuration=configuration)
115-
text_to_be_reshaped = 'سعر المنتج ١٥٠ ريال'
119+
text_to_be_reshaped = 'سعر المنتج ١٥٠ ر' + 'يال' # had to split the string for display
116120
reshaped_text = reshaper.reshape(text_to_be_reshaped)
117121
```
118122

@@ -126,13 +130,8 @@ constructor's `configuration_file` parameter like this:
126130

127131
```
128132
from arabic_reshaper import ArabicReshaper
129-
configuration = {
130-
'delete_harakat': False,
131-
'support_ligatures': True,
132-
'RIAL SIGN': True, # Replace ريال with ﷼
133-
}
134133
reshaper = ArabicReshaper(configuration_file='/path/to/your/config.ini')
135-
text_to_be_reshaped = 'سعر المنتج ١٥٠ ريال'
134+
text_to_be_reshaped = 'سعر المنتج ١٥٠ ر' + 'يال' # had to split the string for display
136135
reshaped_text = reshaper.reshape(text_to_be_reshaped)
137136
```
138137

@@ -179,6 +178,42 @@ https://github.com/mpcabd/python-arabic-reshaper/tarball/master
179178

180179
## Version History
181180

181+
### 2.0.8
182+
183+
* Added `delete_tatweel`
184+
* Added more test cases
185+
186+
### 2.0.7
187+
188+
* Fix tests for Python 2.7
189+
190+
### 2.0.6
191+
192+
* Fixed a bug with Harakat breaking the reshaping
193+
* Wrote two small unit tests, more to come
194+
* Moved letters and ligatures to separate files for readability/maintainability
195+
* Moved package to its own folder for readability/maintainability
196+
197+
### 2.0.5
198+
199+
Fix error message formatting
200+
201+
### 2.0.4
202+
203+
Fix error message formatting
204+
205+
### 2.0.3
206+
207+
Use `Exception` instead of `Error`.
208+
209+
### 2.0.2
210+
211+
Use `pkg_resources.resource_filename` instead of depending on `__file__` to access `default-config.ini`.
212+
213+
### 2.0.1
214+
215+
Include default-config.ini in setup.py
216+
182217
### 2.0.0
183218

184219
* Totally rewrote the code;

arabic_reshaper/arabic_reshaper.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ def reshape(self, text):
165165
NOT_SUPPORTED = -1
166166

167167
delete_harakat = self.configuration.getboolean('delete_harakat')
168+
delete_tatweel = self.configuration.getboolean('delete_tatweel')
168169
positions_harakat = {}
169170

170171
for letter in text:
@@ -174,6 +175,8 @@ def reshape(self, text):
174175
if position not in positions_harakat:
175176
positions_harakat[position] = []
176177
positions_harakat[position].append(letter)
178+
elif letter == TATWEEL and delete_tatweel:
179+
pass
177180
elif letter not in LETTERS:
178181
output.append((letter, NOT_SUPPORTED))
179182
elif not output:
@@ -211,6 +214,11 @@ def reshape(self, text):
211214
if self.configuration.getboolean('support_ligatures'):
212215
# Clean text from Harakat to be able to find ligatures
213216
text = HARAKAT_RE.sub('', text)
217+
218+
# Clean text from Tatweel to find ligatures if delete_tatweel
219+
if delete_tatweel:
220+
text = text.replace(TATWEEL, '')
221+
214222
for match in re.finditer(self._ligatures_re, text):
215223
group_index = next((
216224
i for i, group in enumerate(match.groups()) if group

arabic_reshaper/default-config.ini

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,16 @@
33
# More languages might be supported soon.
44
language = Arabic
55

6+
# Whether to delete the Harakat (Tashkeel) before reshaping or not.
7+
delete_harakat = yes
8+
9+
# Whether to delete the Tatweel (U+0640) before reshaping or not.
10+
delete_tatweel = no
11+
612
# Whether to use ligatures or not.
713
# Serves as a shortcut to disable all ligatures.
814
support_ligatures = yes
915

10-
# Whether to delete the Harakat (Tashkeel) before reshaping or not.
11-
delete_harakat = yes
12-
1316
# When `support_ligatures` is enabled.
1417
# Separate ligature configurations take precedence over it.
1518
# When `support_ligatures` is disabled,

arabic_reshaper/letters.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
MEDIAL = 2
2121
FINAL = 3
2222

23+
TATWEEL = '\u0640'
24+
2325
LETTERS = {
2426
# ARABIC LETTER HAMZA
2527
'\u0621': ('\uFE80', '', '', ''),
@@ -74,7 +76,7 @@
7476
# ARABIC LETTER GHAIN
7577
'\u063A': ('\uFECD', '\uFECF', '\uFED0', '\uFECE'),
7678
# ARABIC TATWEEL
77-
'\u0640': ('\u0640', '\u0640', '\u0640', '\u0640'),
79+
TATWEEL: (TATWEEL, TATWEEL, TATWEEL, TATWEEL),
7880
# ARABIC LETTER FEH
7981
'\u0641': ('\uFED1', '\uFED3', '\uFED4', '\uFED2'),
8082
# ARABIC LETTER QAF

arabic_reshaper/tests/test_001_initialization.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@ class TestDefaultConfiguration(unittest.TestCase):
88
def setUp(self):
99
self.reshaper = arabic_reshaper.ArabicReshaper()
1010

11+
def boolean_check(self, boolean):
12+
self.assertIn(boolean, self.reshaper.configuration)
13+
self.assertIsNotNone(
14+
self.reshaper.configuration.getboolean(boolean)
15+
)
16+
1117
def test_configuration_exists(self):
1218
self.assertIsNotNone(self.reshaper.configuration)
1319

@@ -17,31 +23,22 @@ def test_language(self):
1723
self.assertTrue(self.reshaper.configuration['language'])
1824

1925
def test_support_ligatures(self):
20-
self.assertIn('support_ligatures', self.reshaper.configuration)
21-
self.assertIsNotNone(
22-
self.reshaper.configuration.getboolean('support_ligatures')
23-
)
26+
self.boolean_check('support_ligatures')
2427

2528
def test_delete_harakat(self):
26-
self.assertIn('delete_harakat', self.reshaper.configuration)
27-
self.assertIsNotNone(
28-
self.reshaper.configuration.getboolean('delete_harakat')
29-
)
29+
self.boolean_check('delete_harakat')
30+
31+
def test_delete_tatweel(self):
32+
self.boolean_check('delete_tatweel')
3033

3134
def test_ligatures(self):
3235
import arabic_reshaper.ligatures
3336
for ligature in arabic_reshaper.ligatures.LIGATURES:
3437
if hasattr(self, 'subTest'):
3538
with self.subTest(ligature=ligature[0]):
36-
self.assertIn(ligature[0], self.reshaper.configuration)
37-
self.assertIsNotNone(
38-
self.reshaper.configuration.getboolean(ligature[0])
39-
)
39+
self.boolean_check(ligature[0])
4040
else:
41-
self.assertIn(ligature[0], self.reshaper.configuration)
42-
self.assertIsNotNone(
43-
self.reshaper.configuration.getboolean(ligature[0])
44-
)
41+
self.boolean_check(ligature[0])
4542

4643
if __name__ == '__main__':
4744
unittest.main()

arabic_reshaper/tests/test_002_reshaping.py

Lines changed: 124 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,40 @@
66
import arabic_reshaper
77

88

9+
def _reshaping_test(test):
10+
for i, case in enumerate(test.cases):
11+
def t(): test.assertEqual(case[1], test.reshaper.reshape(case[0]))
12+
if hasattr(test, 'subTest'):
13+
with test.subTest(i=i, case=case[0]):
14+
t()
15+
else:
16+
t()
17+
18+
919
class TestDefaultReshaping(unittest.TestCase):
1020
def setUp(self):
1121
self.reshaper = arabic_reshaper.default_reshaper
1222
self.cases = (
1323
('السلام عليكم', 'ﺍﻟﺴﻼﻡ ﻋﻠﻴﻜﻢ'),
1424
('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴﻼﻡ ﻋﻠﻴﻜﻢ'),
25+
('اللغة العربية هي أكثر اللغات', 'ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ'),
26+
('تحدثاً ونطقاً ضمن مجموعة', 'ﺗﺤﺪﺛﺎ ﻭﻧﻄﻘﺎ ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ'),
27+
('اللغات السامية', 'ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ'),
28+
('العربية لغة رسمية في', 'ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ'),
29+
('كل دول الوطن العربي', 'ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ'),
30+
('إضافة إلى كونها لغة', 'ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ'),
31+
('رسمية في تشاد وإريتريا', 'ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ'),
32+
('وإسرائيل. وهي إحدى اللغات', 'ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ'),
33+
('الرسمية الست في منظمة', 'ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ'),
34+
('الأمم المتحدة، ويُحتفل', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳﺤﺘﻔﻞ'),
35+
('باليوم العالمي للغة العربية', 'ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ'),
36+
('في 18 ديسمبر كذكرى اعتماد', 'ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ'),
37+
('العربية بين لغات العمل في', 'ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ'),
38+
('الأمم المتحدة.', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.'),
1539
)
1640

1741
def test_reshaping(self):
18-
for i, case in enumerate(self.cases):
19-
if hasattr(self, 'subTest'):
20-
with self.subTest(i=i, case=case[0]):
21-
self.assertEqual(case[1], self.reshaper.reshape(case[0]))
22-
else:
23-
self.assertEqual(case[1], self.reshaper.reshape(case[0]))
42+
_reshaping_test(self)
2443

2544

2645
class TestReshapingWithHarakat(unittest.TestCase):
@@ -30,15 +49,108 @@ def setUp(self):
3049
})
3150
self.cases = (
3251
('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴَﻼَْﻡٌ ﻋَﻠَﻴْﻜُﻢْ'),
52+
('اللغة العربية هي أكثر اللغات', 'ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ'),
53+
('تحدثاً ونطقاً ضمن مجموعة', 'ﺗﺤﺪﺛﺎً ﻭﻧﻄﻘﺎً ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ'),
54+
('اللغات السامية', 'ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ'),
55+
('العربية لغة رسمية في', 'ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ'),
56+
('كل دول الوطن العربي', 'ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ'),
57+
('إضافة إلى كونها لغة', 'ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ'),
58+
('رسمية في تشاد وإريتريا', 'ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ'),
59+
('وإسرائيل. وهي إحدى اللغات', 'ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ'),
60+
('الرسمية الست في منظمة', 'ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ'),
61+
('الأمم المتحدة، ويُحتفل', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳُﺤﺘﻔﻞ'),
62+
('باليوم العالمي للغة العربية', 'ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ'),
63+
('في 18 ديسمبر كذكرى اعتماد', 'ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ'),
64+
('العربية بين لغات العمل في', 'ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ'),
65+
('الأمم المتحدة.', 'ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.'),
66+
)
67+
68+
def test_reshaping(self):
69+
_reshaping_test(self)
70+
71+
72+
class TestReshapingWithHarakatWithoutLigatures(unittest.TestCase):
73+
def setUp(self):
74+
self.reshaper = arabic_reshaper.ArabicReshaper({
75+
'delete_harakat': False,
76+
'support_ligatures': False
77+
})
78+
self.cases = (
79+
('السَلَاْمٌ عَلَيْكُمْ', 'ﺍﻟﺴَﻠَﺎْﻡٌ ﻋَﻠَﻴْﻜُﻢْ'),
80+
('اللغة العربية هي أكثر اللغات', 'ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ'),
81+
('تحدثاً ونطقاً ضمن مجموعة', 'ﺗﺤﺪﺛﺎً ﻭﻧﻄﻘﺎً ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ'),
82+
('اللغات السامية', 'ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ'),
83+
('العربية لغة رسمية في', 'ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ'),
84+
('كل دول الوطن العربي', 'ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ'),
85+
('إضافة إلى كونها لغة', 'ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ'),
86+
('رسمية في تشاد وإريتريا', 'ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ'),
87+
('وإسرائيل. وهي إحدى اللغات', 'ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ'),
88+
('الرسمية الست في منظمة', 'ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ'),
89+
('الأمم المتحدة، ويُحتفل', 'ﺍﻟﺄﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳُﺤﺘﻔﻞ'),
90+
('باليوم العالمي للغة العربية', 'ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ'),
91+
('في 18 ديسمبر كذكرى اعتماد', 'ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ'),
92+
('العربية بين لغات العمل في', 'ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ'),
93+
('الأمم المتحدة.', 'ﺍﻟﺄﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.'),
94+
)
95+
96+
def test_reshaping(self):
97+
_reshaping_test(self)
98+
99+
100+
class TestReshapingSomeLigatures(unittest.TestCase):
101+
def setUp(self):
102+
self.reshaper = arabic_reshaper.ArabicReshaper({
103+
'delete_tatweel': True,
104+
'ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM': True,
105+
'ARABIC LIGATURE JALLAJALALOUHOU': True,
106+
'ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM': True,
107+
'ARABIC LIGATURE ALLAH ': True,
108+
'ARABIC LIGATURE AKBAR': True,
109+
'ARABIC LIGATURE ALAYHE': True,
110+
'ARABIC LIGATURE MOHAMMAD': True,
111+
'ARABIC LIGATURE RASOUL': True,
112+
'ARABIC LIGATURE SALAM': True,
113+
'ARABIC LIGATURE SALLA': True,
114+
'ARABIC LIGATURE WASALLAM': True,
115+
})
116+
self.cases = (
117+
('إِنَّهُ مِن سُلَيْمَانَ '
118+
'وَإِنَّهُ بِسْمِ اللّـَهِ '
119+
'الرَّحْمَـٰنِ الرَّحِيمِ ﴿٣٠﴾ '
120+
'أَلَّا تَعْلُوا عَلَيَّ '
121+
'وَأْتُونِي مُسْلِمِينَ ﴿٣١﴾',
122+
123+
'ﺇﻧﻪ ﻣﻦ ﺳﻠﻴﻤﺎﻥ ﻭﺇﻧﻪ ﷽ ﴿٣٠﴾ '
124+
'ﺃﻻ ﺗﻌﻠﻮﺍ ﻋﻠﻲ ﻭﺃﺗﻮﻧﻲ ﻣﺴﻠﻤﻴﻦ ﴿٣١﴾'),
125+
126+
('فَذَكِّرْ إِنَّمَا أَنتَ'
127+
' مُذَكِّرٌ ﴿٢١﴾ لَّسْتَ'
128+
' عَلَيْهِم بِمُصَيْطِرٍ ﴿٢٢﴾'
129+
' إِلَّا مَن تَوَلَّىٰ'
130+
' وَكَفَرَ ﴿٢٣﴾ فَيُعَذِّبُهُ'
131+
' اللَّـهُ الْعَذَابَ'
132+
' الْأَكْبَرَ ﴿٢٤﴾',
133+
134+
'ﻓﺬﻛﺮ ﺇﻧﻤﺎ ﺃﻧﺖ'
135+
' ﻣﺬﻛﺮ ﴿٢١﴾ ﻟﺴﺖ'
136+
' ﻋﻠﻴﻬﻢ ﺑﻤﺼﻴﻄﺮ ﴿٢٢﴾'
137+
' ﺇﻻ ﻣﻦ ﺗﻮﻟﻰ'
138+
' ﻭﻛﻔﺮ ﴿٢٣﴾ ﻓﻴﻌﺬﺑﻪ'
139+
' ﷲ ﺍﻟﻌﺬﺍﺏ'
140+
' ﺍﻷﻛﺒﺮ ﴿٢٤﴾'),
141+
142+
('محمد رسول الله صلى الله عليه وسلم',
143+
'ﷴ ﷶ ﷲ ﷺ'),
144+
145+
('الله جل جلاله',
146+
'ﷲ ﷻ'),
147+
148+
('محمد رسول الله عليه صلى الله وسلم',
149+
'ﷴ ﷶ ﷲ ﷷ ﷹ ﷲ ﷸ'),
33150
)
34151

35152
def test_reshaping(self):
36-
for i, case in enumerate(self.cases):
37-
if hasattr(self, 'subTest'):
38-
with self.subTest(i=i, case=case[0]):
39-
self.assertEqual(case[1], self.reshaper.reshape(case[0]))
40-
else:
41-
self.assertEqual(case[1], self.reshaper.reshape(case[0]))
153+
_reshaping_test(self)
42154

43155
if __name__ == '__main__':
44156
unittest.main()

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
name="arabic_reshaper",
88
description=("Reconstruct Arabic sentences to be used in"
99
" applications that don't support Arabic"),
10-
version='2.0.7',
10+
version='2.0.8',
1111
platforms="ALL",
1212
license="GPL",
1313
packages=['arabic_reshaper'],

0 commit comments

Comments
 (0)