Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"remove_dangling",
"remove_dup_spaces",
"remove_repeat_vowels",
"remove_spaces_before_marks",
"remove_tone_ipa",
"remove_tonemark",
"remove_trailing_repeat_consonants",
Expand Down Expand Up @@ -105,6 +106,7 @@
remove_dangling,
remove_dup_spaces,
remove_repeat_vowels,
remove_spaces_before_marks,
remove_tonemark,
remove_zw,
reorder_vowels,
Expand Down
32 changes: 31 additions & 1 deletion pythainlp/util/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@

# Matches a newline together with any spaces/newlines surrounding it.
_RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")

# Matches one or more ASCII spaces immediately followed by a single
# non-base character (tone marks, above/below vowels, etc.) taken from
# _DANGLING_CHARS. The non-base character is captured as group 1 so that a
# substitution can keep it while dropping the spaces in front of it.
_RE_REMOVE_SPACES_BEFORE_NONBASE = re.compile(
f" +([{_DANGLING_CHARS}])"
)


def _last_char(matchobj): # to be used with _RE_NOREPEAT_TONEMARKS
return matchobj.group(0)[-1]
Expand Down Expand Up @@ -172,6 +177,28 @@ def remove_zw(text: str) -> str:
return text


def remove_spaces_before_marks(text: str) -> str:
    """Strip spaces that sit directly in front of a non-base character.

    A space typed before a Thai tone mark, an above vowel, a below vowel,
    or another non-base character is usually an accidental typo. Every
    run of such spaces is dropped, while the non-base character that
    follows it is kept.

    :param str text: input text
    :return: text with the spaces before Thai tone marks and other
        non-base characters removed
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_spaces_before_marks

        remove_spaces_before_marks("พ ุ่มดอกไม้")
        # output: 'พุ่มดอกไม้'
    """
    # Group 1 of the pattern is the non-base character itself, so replacing
    # each match with "\1" keeps the mark and discards the leading spaces.
    cleaned = _RE_REMOVE_SPACES_BEFORE_NONBASE.sub(r"\1", text)
    return cleaned


def reorder_vowels(text: str) -> str:
"""Reorder vowels and tone marks to the standard logical order/spelling.

Expand Down Expand Up @@ -242,13 +269,15 @@ def normalize(text: str) -> str:

* Remove zero-width spaces
* Remove duplicate spaces
* Remove spaces before tone marks and non-base characters
* Reorder tone marks and vowels to standard order/spelling
* Remove duplicate vowels and signs
* Remove duplicate tone marks
* Remove dangling non-base characters at the beginning of text

normalize() simply calls remove_zw(), remove_dup_spaces(),
remove_spaces_before_marks(), remove_repeat_vowels(), and
remove_dangling(), in that order.

If a user wants to customize the selection or the order of rules
to be applied, they can choose to call those functions by themselves.
Expand All @@ -272,6 +301,7 @@ def normalize(text: str) -> str:
"""
text = remove_zw(text)
text = remove_dup_spaces(text)
text = remove_spaces_before_marks(text)
text = remove_repeat_vowels(text)
text = remove_dangling(text)

Expand Down
17 changes: 17 additions & 0 deletions tests/core/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
reign_year_to_ad,
remove_dangling,
remove_dup_spaces,
remove_spaces_before_marks,

Check failure on line 43 in tests/core/test_util.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F401)

tests/core/test_util.py:43:5: F401 `pythainlp.util.remove_spaces_before_marks` imported but unused
remove_tone_ipa,
remove_tonemark,
remove_trailing_repeat_consonants,
Expand Down Expand Up @@ -581,6 +582,22 @@
self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01")
self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")

# remove spaces before tone marks and non-base characters
self.assertEqual(normalize("พ ุ่มดอกไม้"), "พุ่มดอกไม้")
self.assertEqual(
normalize("เค้้้าเดินไปสนามหญา้หนา้บา้น"),
"เค้าเดินไปสนามหญ้าหน้าบ้าน",
)
self.assertEqual(
normalize("พ ุ่มดอกไม้ในสนามหญา้หนา้บา้น"),
"พุ่มดอกไม้ในสนามหญ้าหน้าบ้าน",
)
self.assertEqual(normalize("ก ิ"), "กิ") # space before above vowel
self.assertEqual(normalize("ก ุ"), "กุ") # space before below vowel
self.assertEqual(
normalize("ก ้า"), "ก้า"
) # spaces before tone mark (also reordered)

# remove duplicate spaces
self.assertEqual(remove_dup_spaces(" ab c d "), "ab c d")
self.assertEqual(remove_dup_spaces("\nab c \n d \n"), "ab c\nd")
Expand Down
Loading