Skip to content

Commit 156697b

Browse files
committed
Add display cell tokenizer
Fixes #663

Add a new function `display_cell_tokenize` that splits Thai text into display cells without separating tone marks from their base characters.

* **New Functionality**
  - Add the `display_cell_tokenize` function in `pythainlp/tokenize/core.py` to handle the splitting of Thai text into display cells.
  - Ensure the function does not split tone marks.
* **Initialization**
  - Update `pythainlp/tokenize/__init__.py` to include the new `display_cell_tokenize` function in the `__all__` list.
* **Testing**
  - Add tests for the `display_cell_tokenize` function in `tests/core/test_tokenize.py`.

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/PyThaiNLP/pythainlp/issues/663?shareId=XXXX-XXXX-XXXX-XXXX).
1 parent 7332984 commit 156697b

File tree

3 files changed

+49
-0
lines changed

3 files changed

+49
-0
lines changed

pythainlp/tokenize/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"syllable_tokenize",
1717
"word_detokenize",
1818
"word_tokenize",
19+
"display_cell_tokenize",
1920
]
2021

2122
from pythainlp.corpus import thai_syllables, thai_words
@@ -38,6 +39,7 @@
3839
syllable_tokenize,
3940
word_detokenize,
4041
word_tokenize,
42+
display_cell_tokenize,
4143
)
4244

4345
from pythainlp.corpus import get_corpus as _get_corpus

pythainlp/tokenize/core.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,45 @@ def syllable_tokenize(
733733
)
734734

735735

736+
def display_cell_tokenize(text: str) -> List[str]:
    """
    Display cell tokenizer.

    Tokenizes Thai text into display cells. A display cell is one base
    character followed by every combining mark (above/below vowels,
    tone marks, and other signs in U+0E31, U+0E34-U+0E3A, U+0E47-U+0E4E)
    that is attached to it, so tone marks are never split from their
    base character.

    SARA AM (U+0E33) is decomposed into NIKHAHIT (U+0E4D) followed by
    SARA AA (U+0E32) before splitting: the NIKHAHIT joins the cell of
    the preceding character and SARA AA becomes a cell of its own.
    Because of this, joining the returned cells does not reproduce the
    input verbatim when the input contains SARA AM.

    :param str text: text to be tokenized
    :return: list of display cells; an empty list if *text* is empty
        or is not a string
    :rtype: List[str]

    :Example:

    Tokenize Thai text into display cells::

        from pythainlp.tokenize import display_cell_tokenize

        text = "แม่น้ำอยู่ที่ไหน"
        display_cell_tokenize(text)
        # output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
    """
    if not text or not isinstance(text, str):
        return []

    # SARA AM occupies two display cells (a mark above the previous
    # consonant plus a following SARA AA), so split it into those two
    # code points before looking for cell boundaries.
    text = text.replace("\u0E33", "\u0E4D\u0E32")

    # Character-class body matching the Thai combining marks.
    marks = r"\u0E31\u0E34-\u0E3A\u0E47-\u0E4E"

    # A cell is a non-mark character plus the marks that follow it.
    # The second alternative only ever matches at the very start of the
    # text, where marks have no base character to attach to.
    return re.findall(f"[^{marks}][{marks}]*|[{marks}]+", text)
773+
774+
736775
class Tokenizer:
737776
"""
738777
Tokenizer class for a custom tokenizer.

tests/core/test_tokenize.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
tcc_p,
2020
word_detokenize,
2121
word_tokenize,
22+
display_cell_tokenize,
2223
)
2324
from pythainlp.util import dict_trie
2425

@@ -604,3 +605,10 @@ def test_tcc_p(self):
604605
# )
605606
self.assertEqual(list(tcc_p.tcc("")), [])
606607
self.assertEqual(tcc_p.tcc_pos(""), set())
608+
609+
def test_display_cell_tokenize(self):
    """Check display-cell splitting for empty, marked, and unmarked text."""
    # Empty input yields no cells.
    self.assertEqual(display_cell_tokenize(""), [])
    # Tone marks stay with their base character; the expected cells
    # follow the example given in the function's docstring.
    self.assertEqual(
        display_cell_tokenize("แม่น้ำอยู่ที่ไหน"),
        ["แ", "ม่", "น้ํ", "า", "อ", "ยู่", "ที่", "ไ", "ห", "น"],
    )
    # MAI HAN-AKAT (U+0E31) and SARA II (U+0E35) are in the tokenizer's
    # combining-mark class, so each stays in the cell of the consonant
    # before it instead of forming a cell of its own.
    self.assertEqual(
        display_cell_tokenize("สวัสดี"),
        ["ส", "วั", "ส", "ดี"],
    )
    # Text without combining marks splits into single characters.
    self.assertEqual(
        display_cell_tokenize("ทดสอบ"),
        ["ท", "ด", "ส", "อ", "บ"],
    )
    self.assertEqual(
        display_cell_tokenize("ภาษาไทย"),
        ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"],
    )

0 commit comments

Comments
 (0)