PyThaiNLP · bact · Jan 22, 2026 · Jan 21, 2026 · Jan 21, 2026 · Jan 22, 2026
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
@@ -37,6 +37,7 @@
     "remove_dangling",
     "remove_dup_spaces",
     "remove_repeat_vowels",
+    "remove_spaces_before_marks",
     "remove_tone_ipa",
     "remove_tonemark",
     "remove_trailing_repeat_consonants",
@@ -105,6 +106,7 @@
     remove_dangling,
     remove_dup_spaces,
     remove_repeat_vowels,
+    remove_spaces_before_marks,
     remove_tonemark,
     remove_zw,
     reorder_vowels,

diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
@@ -50,6 +50,11 @@
 
 _RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")
 
+# One or more spaces before a non-base character; group 1 is that character.
+_RE_REMOVE_SPACES_BEFORE_NONBASE = re.compile(
+    f" +([{_DANGLING_CHARS}])"
+)
+
 
 def _last_char(matchobj):  # to be used with _RE_NOREPEAT_TONEMARKS
     return matchobj.group(0)[-1]
@@ -172,6 +177,28 @@ def remove_zw(text: str) -> str:
     return text
 
 
+def remove_spaces_before_marks(text: str) -> str:
+    """Remove spaces before Thai tone marks and non-base characters.
+
+    Spaces before tone marks, above vowels, below vowels, and other
+    non-base characters are often unintentional typos. Every run of one
+    or more spaces directly before such a character is deleted.
+
+    :param str text: input text
+    :return: text with those spaces removed; the marks themselves are kept
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import remove_spaces_before_marks
+
+        remove_spaces_before_marks("พ ุ่มดอกไม้")
+        # output: 'พุ่มดอกไม้'
+    """
+    return _RE_REMOVE_SPACES_BEFORE_NONBASE.sub(r"\1", text)
+
+
 def reorder_vowels(text: str) -> str:
     """Reorder vowels and tone marks to the standard logical order/spelling.
 
@@ -242,13 +269,15 @@ def normalize(text: str) -> str:
 
         * Remove zero-width spaces
         * Remove duplicate spaces
+        * Remove spaces before tone marks and non-base characters
         * Reorder tone marks and vowels to standard order/spelling
         * Remove duplicate vowels and signs
         * Remove duplicate tone marks
         * Remove dangling non-base characters at the beginning of text
 
     normalize() simply call remove_zw(), remove_dup_spaces(),
-    remove_repeat_vowels(), and remove_dangling(), in that order.
+    remove_spaces_before_marks(), remove_repeat_vowels(), and
+    remove_dangling(), in that order.
 
     If a user wants to customize the selection or the order of rules
     to be applied, they can choose to call those functions by themselves.
@@ -272,6 +301,7 @@ def normalize(text: str) -> str:
     """
     text = remove_zw(text)
     text = remove_dup_spaces(text)
+    text = remove_spaces_before_marks(text)
     text = remove_repeat_vowels(text)
     text = remove_dangling(text)
 

diff --git a/tests/core/test_util.py b/tests/core/test_util.py
@@ -40,6 +40,7 @@
     reign_year_to_ad,
     remove_dangling,
     remove_dup_spaces,
+    remove_spaces_before_marks,
     remove_tone_ipa,
     remove_tonemark,
     remove_trailing_repeat_consonants,
@@ -581,6 +582,22 @@
         self.assertEqual(remove_dangling("\u0e48\u0e49\u0e01"), "\u0e01")
         self.assertEqual(remove_dangling("\u0e48\u0e01\u0e48"), "\u0e01\u0e48")
 
+        # remove spaces before tone marks and non-base characters
+        self.assertEqual(normalize("พ ุ่มดอกไม้"), "พุ่มดอกไม้")
+        self.assertEqual(
+            normalize("เค้้้าเดินไปสนามหญา้หนา้บา้น"),
+            "เค้าเดินไปสนามหญ้าหน้าบ้าน",
+        )
+        self.assertEqual(
+            normalize("พ ุ่มดอกไม้ในสนามหญา้หนา้บา้น"),
+            "พุ่มดอกไม้ในสนามหญ้าหน้าบ้าน",
+        )
+        self.assertEqual(normalize("ก ิ"), "กิ")  # space before above vowel
+        self.assertEqual(normalize("ก ุ"), "กุ")  # space before below vowel
+        self.assertEqual(
+            normalize("ก  ้า"), "ก้า"
+        )  # multiple spaces before tone mark
+
         # remove duplicate spaces
         self.assertEqual(remove_dup_spaces("  ab  c d  "), "ab c d")
         self.assertEqual(remove_dup_spaces("\nab  c   \n d \n"), "ab c\nd")