1010
1111from pythainlp import thai_above_vowels as above_v
1212from pythainlp import thai_below_vowels as below_v
13+ from pythainlp import thai_consonants , thai_vowels
1314from pythainlp import thai_follow_vowels as follow_v
1415from pythainlp import thai_lead_vowels as lead_v
1516from pythainlp import thai_tonemarks as tonemarks
1819
# Characters treated as "dangling" when they have no base character:
# the above vowels, below vowels and tone marks imported above, plus
# U+0E3A (PHINTHU), U+0E4C (THANTHAKHAT), U+0E4D (NIKHAHIT) and
# U+0E4E (YAMAKKAN).
1920_DANGLING_CHARS = f"{ above_v } { below_v } { tonemarks } \u0e3a \u0e4c \u0e4d \u0e4e "
# Matches a run of dangling characters anchored at the start of the text.
2021_RE_REMOVE_DANGLINGS = re .compile (f"^[{ _DANGLING_CHARS } ]+" )
# Matches one or more spaces followed by a run of dangling characters.
# NOTE(review): the spaces are part of the match, so whatever replaces the
# match decides how many spaces survive.
22+ _RE_REMOVE_DANGLINGS_AFTER_SPACE = re .compile (f" +[{ _DANGLING_CHARS } ]+" )
2123
# Zero-width characters: U+200B (ZERO WIDTH SPACE) and
# U+200C (ZERO WIDTH NON-JOINER).
2224_ZERO_WIDTH_CHARS = "\u200b \u200c " # ZWSP, ZWNJ
2325
5052
# Matches a newline together with any spaces/newlines directly around it.
5153_RE_REMOVE_NEWLINES = re .compile ("[ \n ]*\n [ \n ]*" )
5254
55+ # Remove single space before non-base characters, but only after a consonant
56+ # that's not preceded by a vowel (to avoid breaking up complete words)
57+ # This conservative approach fixes "พ ุ่ม" but preserves "ภาพ ุ่"
# Group 1 captures the consonant and group 2 the non-base character, so a
# substitution can rejoin the two without the space. The negative lookbehind
# is evaluated right after the consonant and rejects a vowel + consonant pair.
58+ _RE_REMOVE_SPACES_BEFORE_NONBASE = re .compile (
59+ f"([{ thai_consonants } ])(?<![{ thai_vowels } ][{ thai_consonants } ]) ([{ _DANGLING_CHARS } ])"
60+ )
61+
5362
5463def _last_char (matchobj ): # to be used with _RE_NOREPEAT_TONEMARKS
5564 return matchobj .group (0 )[- 1 ]
5665
5766
def remove_dangling(text: str) -> str:
    """Remove Thai non-base characters at the beginning of text and after spaces.

    This is a common "typo", especially for input fields in a form,
    as these non-base characters can be visually hidden from the user.

    A character to be removed should be both:

        * tone mark, above vowel, below vowel, or non-base sign AND
        * located at the beginning of the text or after spaces

    Only the dangling characters are removed: the spaces in front of them
    are kept exactly as they were, so this function never changes the
    amount of whitespace in the text.

    :param str text: input text
    :return: text without dangling Thai characters at the beginning
        and after spaces
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_dangling

        remove_dangling("๊ก")
        # output: 'ก'

        remove_dangling("คำ ่ที่สอง")
        # output: 'คำ ที่สอง'
    """

    def _keep_spaces(matchobj):
        # The pattern matches "spaces, then dangling characters". Return only
        # the leading run of spaces so that a run of several spaces is not
        # silently collapsed into one.
        matched = matchobj.group(0)
        n_spaces = len(matched) - len(matched.lstrip(" "))
        return matched[:n_spaces]

    # Pass 1: dangling characters at the very start of the text.
    text = _RE_REMOVE_DANGLINGS.sub("", text)
    # Pass 2: dangling characters that directly follow one or more spaces.
    return _RE_REMOVE_DANGLINGS_AFTER_SPACE.sub(_keep_spaces, text)
8397
8498
8599def remove_dup_spaces (text : str ) -> str :
@@ -172,6 +186,28 @@ def remove_zw(text: str) -> str:
172186 return text
173187
174188
def remove_spaces_before_marks(text: str) -> str:
    """Remove a stray space between a Thai consonant and a following mark.

    A single space is dropped when it sits between a Thai consonant and
    a tone mark, above vowel, below vowel, or other non-base character.
    Such a space is usually an unintentional typo.

    The space is left in place when that consonant directly follows a
    Thai vowel, to avoid gluing a mark onto the end of a complete word.

    :param str text: input text
    :return: text with those spaces removed
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_spaces_before_marks

        remove_spaces_before_marks("พ ุ่มดอกไม้")
        # output: 'พุ่มดอกไม้'
    """

    def _rejoin(matchobj):
        # Group 1 is the consonant, group 2 the non-base character;
        # the space between them is simply not emitted.
        consonant = matchobj.group(1)
        mark = matchobj.group(2)
        return consonant + mark

    return _RE_REMOVE_SPACES_BEFORE_NONBASE.sub(_rejoin, text)
209+
210+
175211def reorder_vowels (text : str ) -> str :
176212 """Reorder vowels and tone marks to the standard logical order/spelling.
177213
@@ -242,13 +278,15 @@ def normalize(text: str) -> str:
242278
243279 * Remove zero-width spaces
244280 * Remove duplicate spaces
281+ * Remove spaces before tone marks and non-base characters
245282 * Reorder tone marks and vowels to standard order/spelling
246283 * Remove duplicate vowels and signs
247284 * Remove duplicate tone marks
248285 * Remove dangling non-base characters at the beginning of text
249286
250287 normalize() simply call remove_zw(), remove_dup_spaces(),
251- remove_repeat_vowels(), and remove_dangling(), in that order.
288+ remove_spaces_before_marks(), remove_repeat_vowels(), and
289+ remove_dangling(), in that order.
252290
253291 If a user wants to customize the selection or the order of rules
254292 to be applied, they can choose to call those functions by themselves.
@@ -272,6 +310,7 @@ def normalize(text: str) -> str:
272310 """
273311 text = remove_zw (text )
274312 text = remove_dup_spaces (text )
313+ text = remove_spaces_before_marks (text )
275314 text = remove_repeat_vowels (text )
276315 text = remove_dangling (text )
277316
0 commit comments