Skip to content

Commit c441c66

Browse files
authored
Merge pull request #1222 from PyThaiNLP/copilot/fix-text-normalization-issue
Fix normalize() to remove spaces before Thai tone marks and non-base characters (conservative approach)
2 parents 3bd5fc5 + 2c317ed commit c441c66

File tree

3 files changed

+62
-5
lines changed

3 files changed

+62
-5
lines changed

pythainlp/util/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"remove_dangling",
3838
"remove_dup_spaces",
3939
"remove_repeat_vowels",
40+
"remove_spaces_before_marks",
4041
"remove_tone_ipa",
4142
"remove_tonemark",
4243
"remove_trailing_repeat_consonants",
@@ -105,6 +106,7 @@
105106
remove_dangling,
106107
remove_dup_spaces,
107108
remove_repeat_vowels,
109+
remove_spaces_before_marks,
108110
remove_tonemark,
109111
remove_zw,
110112
reorder_vowels,

pythainlp/util/normalize.py

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from pythainlp import thai_above_vowels as above_v
1212
from pythainlp import thai_below_vowels as below_v
13+
from pythainlp import thai_consonants, thai_vowels
1314
from pythainlp import thai_follow_vowels as follow_v
1415
from pythainlp import thai_lead_vowels as lead_v
1516
from pythainlp import thai_tonemarks as tonemarks
@@ -18,6 +19,7 @@
1819

1920
_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
2021
_RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+")
22+
_RE_REMOVE_DANGLINGS_AFTER_SPACE = re.compile(f" +[{_DANGLING_CHARS}]+")
2123

2224
_ZERO_WIDTH_CHARS = "\u200b\u200c" # ZWSP, ZWNJ
2325

@@ -50,13 +52,20 @@
5052

5153
_RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")
5254

55+
# Remove single space before non-base characters, but only after a consonant
56+
# that's not preceded by a vowel (to avoid breaking up complete words)
57+
# This conservative approach fixes "พ ุ่ม" but preserves "ภาพ ุ่"
58+
_RE_REMOVE_SPACES_BEFORE_NONBASE = re.compile(
59+
f"([{thai_consonants}])(?<![{thai_vowels}][{thai_consonants}]) ([{_DANGLING_CHARS}])"
60+
)
61+
5362

5463
def _last_char(matchobj): # to be used with _RE_NOREPEAT_TONEMARKS
5564
return matchobj.group(0)[-1]
5665

5766

5867
def remove_dangling(text: str) -> str:
59-
"""Remove Thai non-base characters at the beginning of text.
68+
"""Remove Thai non-base characters at the beginning of text and after spaces.
6069
6170
This is a common "typo", especially for input field in a form,
6271
as these non-base characters can be visually hidden from user
@@ -65,10 +74,10 @@ def remove_dangling(text: str) -> str:
6574
A character to be removed should be both:
6675
6776
* tone mark, above vowel, below vowel, or non-base sign AND
68-
* located at the beginning of the text
77+
* located at the beginning of the text or after spaces
6978
7079
:param str text: input text
71-
:return: text without dangling Thai characters at the beginning
80+
:return: text without dangling Thai characters at the beginning and after spaces
7281
:rtype: str
7382
7483
:Example:
@@ -78,8 +87,13 @@ def remove_dangling(text: str) -> str:
7887
7988
remove_dangling("๊ก")
8089
# output: 'ก'
90+
91+
remove_dangling("คำ ่ที่สอง")
92+
# output: 'คำ ที่สอง'
8193
"""
82-
return _RE_REMOVE_DANGLINGS.sub("", text)
94+
text = _RE_REMOVE_DANGLINGS.sub("", text)
95+
text = _RE_REMOVE_DANGLINGS_AFTER_SPACE.sub(" ", text)
96+
return text
8397

8498

8599
def remove_dup_spaces(text: str) -> str:
@@ -172,6 +186,28 @@ def remove_zw(text: str) -> str:
172186
return text
173187

174188

189+
def remove_spaces_before_marks(text: str) -> str:
190+
"""Remove spaces before Thai tone marks and non-base characters.
191+
192+
Spaces before tone marks, above vowels, below vowels, and other
193+
non-base characters are often unintentional typos. This function
194+
removes such a space (a single space following a consonant that is not preceded by a vowel) to normalize the text.
195+
196+
:param str text: input text
197+
:return: text without spaces before Thai tone marks and non-base characters
198+
:rtype: str
199+
200+
:Example:
201+
::
202+
203+
from pythainlp.util import remove_spaces_before_marks
204+
205+
remove_spaces_before_marks("พ ุ่มดอกไม้")
206+
# output: 'พุ่มดอกไม้'
207+
"""
208+
return _RE_REMOVE_SPACES_BEFORE_NONBASE.sub(r"\1\2", text)
209+
210+
175211
def reorder_vowels(text: str) -> str:
176212
"""Reorder vowels and tone marks to the standard logical order/spelling.
177213
@@ -242,13 +278,15 @@ def normalize(text: str) -> str:
242278
243279
* Remove zero-width spaces
244280
* Remove duplicate spaces
281+
* Remove spaces before tone marks and non-base characters
245282
* Reorder tone marks and vowels to standard order/spelling
246283
* Remove duplicate vowels and signs
247284
* Remove duplicate tone marks
248285
* Remove dangling non-base characters at the beginning of text and after spaces
249286
250287
normalize() simply calls remove_zw(), remove_dup_spaces(),
251-
remove_repeat_vowels(), and remove_dangling(), in that order.
288+
remove_spaces_before_marks(), remove_repeat_vowels(), and
289+
remove_dangling(), in that order.
252290
253291
If a user wants to customize the selection or the order of rules
254292
to be applied, they can choose to call those functions by themselves.
@@ -272,6 +310,7 @@ def normalize(text: str) -> str:
272310
"""
273311
text = remove_zw(text)
274312
text = remove_dup_spaces(text)
313+
text = remove_spaces_before_marks(text)
275314
text = remove_repeat_vowels(text)
276315
text = remove_dangling(text)
277316

tests/core/test_util.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,22 @@ def test_normalize(self):
581581
self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01")
582582
self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")
583583

584+
# remove spaces before tone marks and non-base characters
585+
self.assertEqual(normalize("พ ุ่มดอกไม้"), "พุ่มดอกไม้")
586+
self.assertEqual(
587+
normalize("เค้้้าเดินไปสนามหญา้หนา้บา้น"),
588+
"เค้าเดินไปสนามหญ้าหน้าบ้าน",
589+
)
590+
self.assertEqual(
591+
normalize("พ ุ่มดอกไม้ในสนามหญา้หนา้บา้น"),
592+
"พุ่มดอกไม้ในสนามหญ้าหน้าบ้าน",
593+
)
594+
self.assertEqual(normalize("ก ิ"), "กิ") # space before above vowel
595+
self.assertEqual(normalize("ก ุ"), "กุ") # space before below vowel
596+
self.assertEqual(
597+
normalize("ก ้า"), "ก้า"
598+
) # space before tone mark (also reordered)
599+
584600
# remove duplicate spaces
585601
self.assertEqual(remove_dup_spaces(" ab c d "), "ab c d")
586602
self.assertEqual(remove_dup_spaces("\nab c \n d \n"), "ab c\nd")

0 commit comments

Comments
 (0)