Add display cell tokenizer

wannaphong · wannaphong · commit 156697b803e2 · 2025-01-08T20:29:17.000+07:00
Fixes #663.

Add a new function `display_cell_tokenize` to split Thai text into display cells without splitting tone marks.

* **New Functionality**
  - Add `display_cell_tokenize` function in `pythainlp/tokenize/core.py` to handle the splitting of Thai text into display cells.
  - Ensure the function does not split tone marks.
* **Initialization**
  - Update `pythainlp/tokenize/__init__.py` to include the new `display_cell_tokenize` function in the `__all__` list.
* **Testing**
  - Add tests for the `display_cell_tokenize` function in `tests/core/test_tokenize.py`.

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/PyThaiNLP/pythainlp/issues/663?shareId=XXXX-XXXX-XXXX-XXXX).
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
@@ -16,6 +16,7 @@
     "syllable_tokenize",
     "word_detokenize",
     "word_tokenize",
+    "display_cell_tokenize",
 ]
 
 from pythainlp.corpus import thai_syllables, thai_words
@@ -38,6 +39,7 @@
     syllable_tokenize,
     word_detokenize,
     word_tokenize,
+    display_cell_tokenize,
 )
 
 from pythainlp.corpus import get_corpus as _get_corpus
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
@@ -733,6 +733,45 @@ def syllable_tokenize(
     )
 
 
+def display_cell_tokenize(text: str) -> List[str]:
+    """
+    Display cell tokenizer.
+
+    Tokenizes Thai text into display cells without splitting tone marks.
+
+    A display cell is one character followed by every combining mark
+    attached to it (mai han-akat, above/below vowels, tone marks and
+    other signs in U+0E31, U+0E34-U+0E3A, U+0E47-U+0E4E).
+    Sara am (U+0E33) is decomposed into nikhahit (U+0E4D) + sara aa
+    (U+0E32) first, so the nikhahit stays in the cell of the preceding
+    character and sara aa becomes a cell of its own.
+
+    :param str text: text to be tokenized
+    :return: list of display cells; an empty list if `text` is empty
+        or is not a string
+    :rtype: List[str]
+    :Example:
+
+    Tokenize Thai text into display cells::
+
+        from pythainlp.tokenize import display_cell_tokenize
+
+        text = "แม่น้ำอยู่ที่ไหน"
+        display_cell_tokenize(text)
+        # output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    # Sara am is a single code point but is drawn across two cells;
+    # split it so the loop below can attach the nikhahit to its base.
+    text = text.replace("\u0E33", "\u0E4D\u0E32")
+
+    # Compiled once instead of re-resolving the pattern for every character.
+    combining_mark = re.compile(r"[\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]")
+
+    display_cells = []
+    current_cell = ""
+
+    for char in text:
+        if combining_mark.match(char):
+            # Combining mark: belongs to the cell currently being built.
+            current_cell += char
+        else:
+            # Any other character closes the previous cell and opens a new one.
+            if current_cell:
+                display_cells.append(current_cell)
+            current_cell = char
+
+    if current_cell:
+        display_cells.append(current_cell)
+
+    return display_cells
+
+
 class Tokenizer:
     """
     Tokenizer class for a custom tokenizer.
diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py
@@ -19,6 +19,7 @@
     tcc_p,
     word_detokenize,
     word_tokenize,
+    display_cell_tokenize,
 )
 from pythainlp.util import dict_trie
 
@@ -604,3 +605,10 @@ def test_tcc_p(self):
         # )
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())
+
+    def test_display_cell_tokenize(self):
+        """Check display-cell grouping of Thai text."""
+        self.assertEqual(display_cell_tokenize(""), [])
+        # Sara am is expected as nikhahit (kept with the preceding
+        # consonant and tone mark) followed by sara aa in its own cell.
+        self.assertEqual(
+            display_cell_tokenize("แม่น้ำอยู่ที่ไหน"),
+            ["แ", "ม่", "น้ํ", "า", "อ", "ยู่", "ที่", "ไ", "ห", "น"],
+        )
+        # Mai han-akat (U+0E31) and sara ii (U+0E35) are combining marks,
+        # so they stay in the same cell as the consonant before them.
+        self.assertEqual(
+            display_cell_tokenize("สวัสดี"),
+            ["ส", "วั", "ส", "ดี"],
+        )
+        self.assertEqual(
+            display_cell_tokenize("ทดสอบ"),
+            ["ท", "ด", "ส", "อ", "บ"],
+        )
+        self.assertEqual(
+            display_cell_tokenize("ภาษาไทย"),
+            ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"],
+        )

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@`
`16`	`16`	`"syllable_tokenize",`
`17`	`17`	`"word_detokenize",`
`18`	`18`	`"word_tokenize",`
	`19`	`+ "display_cell_tokenize",`
`19`	`20`	`]`
`20`	`21`
`21`	`22`	`from pythainlp.corpus import thai_syllables, thai_words`
`@@ -38,6 +39,7 @@`
`38`	`39`	`syllable_tokenize,`
`39`	`40`	`word_detokenize,`
`40`	`41`	`word_tokenize,`
	`42`	`+ display_cell_tokenize,`
`41`	`43`	`)`
`42`	`44`
`43`	`45`	`from pythainlp.corpus import get_corpus as _get_corpus`