
Commit c9d8c91

Merge branch 'dev' into add-spelling
2 parents: f49c480 + 3adc978

File tree

13 files changed: +207 −12 lines


docs/api/khavee.rst

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ Example
 Here's a basic example of how to use the `KhaveeVerifier` class to verify Thai poetry:

 ::
+
     from pythainlp.khavee import KhaveeVerifier

     # Initialize a KhaveeVerifier instance

docs/api/tokenize.rst

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,8 @@ Modules
 The `Tokenizer` class is a versatile tool for customizing tokenization processes and managing tokenization models. It provides various methods and attributes to fine-tune tokenization according to your specific needs.

+.. autoclass:: display_cell_tokenize
+
 Tokenization Engines
 --------------------

docs/api/util.rst

Lines changed: 5 additions & 0 deletions
@@ -287,6 +287,11 @@ Modules
 The `Trie` class is a data structure for efficient dictionary operations. It's a valuable resource for managing and searching word lists and dictionaries in a structured and efficient manner.

+.. autofunction:: longest_common_subsequence
+   :noindex:
+
+The `longest_common_subsequence` function finds the longest common subsequence between two strings.
+
 .. autofunction:: pythainlp.util.morse.morse_encode
    :noindex:
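
For orientation, a longest common subsequence is typically computed with a dynamic-programming table over the two strings; the sketch below is illustrative only and is not the implementation added in pythainlp.util.lcs:

    def lcs(a: str, b: str) -> str:
        # dp[i][j] holds a longest common subsequence of a[:i] and b[:j]
        dp = [[""] * (len(b) + 1) for _ in range(len(a) + 1)]
        for i, ca in enumerate(a, 1):
            for j, cb in enumerate(b, 1):
                if ca == cb:
                    dp[i][j] = dp[i - 1][j - 1] + ca
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1], key=len)
        return dp[len(a)][len(b)]

    print(lcs("ABCBDAB", "BDCABA"))  # prints a longest common subsequence of length 4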

pythainlp/tokenize/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,7 @@
     "syllable_tokenize",
     "word_detokenize",
     "word_tokenize",
+    "display_cell_tokenize",
 ]

 from pythainlp.corpus import thai_syllables, thai_words

@@ -38,6 +39,7 @@
     syllable_tokenize,
     word_detokenize,
     word_tokenize,
+    display_cell_tokenize,
 )

 from pythainlp.corpus import get_corpus as _get_corpus

pythainlp/tokenize/attacut.py

Lines changed: 8 additions & 3 deletions
@@ -8,7 +8,7 @@
 :See Also:
     * `GitHub repository <https://github.com/PyThaiNLP/attacut>`_
 """
-from typing import List
+from typing import Dict, List

 from attacut import Tokenizer

@@ -26,6 +26,9 @@ def tokenize(self, text: str) -> List[str]:
         return self._tokenizer.tokenize(text)


+_tokenizers: Dict[str, AttacutTokenizer] = {}
+
+
 def segment(text: str, model: str = "attacut-sc") -> List[str]:
     """
     Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai

@@ -40,6 +43,8 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]:
     if not text or not isinstance(text, str):
         return []

-    _tokenizer = AttacutTokenizer(model)
+    global _tokenizers
+    if model not in _tokenizers:
+        _tokenizers[model] = AttacutTokenizer(model)

-    return _tokenizer.tokenize(text)
+    return _tokenizers[model].tokenize(text)
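
The attacut change memoizes loaded models in a module-level dict so that repeated segment() calls reuse an already constructed tokenizer instead of rebuilding it each time. A minimal, self-contained sketch of the same pattern (the _FakeModel class and get_model helper are illustrative stand-ins, not PyThaiNLP code):

    from typing import Dict

    class _FakeModel:
        """Stand-in for an expensive-to-load tokenizer model."""
        def __init__(self, name: str) -> None:
            self.name = name  # imagine heavy model weights being loaded here

    _cache: Dict[str, _FakeModel] = {}

    def get_model(name: str) -> _FakeModel:
        # Construct each model at most once; later calls return the cached instance.
        if name not in _cache:
            _cache[name] = _FakeModel(name)
        return _cache[name]

    assert get_model("attacut-sc") is get_model("attacut-sc")  # same object on every call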

pythainlp/tokenize/core.py

Lines changed: 40 additions & 0 deletions
@@ -733,6 +733,46 @@ def syllable_tokenize(
     )


+def display_cell_tokenize(text: str) -> List[str]:
+    """
+    Display cell tokenizer.
+
+    Tokenizes Thai text into display cells without splitting tone marks.
+
+    :param str text: text to be tokenized
+    :return: list of display cells
+    :rtype: List[str]
+    :Example:
+
+    Tokenize Thai text into display cells::
+
+        from pythainlp.tokenize import display_cell_tokenize
+
+        text = "แม่น้ำอยู่ที่ไหน"
+        display_cell_tokenize(text)
+        # output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    display_cells = []
+    current_cell = ""
+    text = text.replace("ำ", "ํา")
+
+    for char in text:
+        if re.match(r"[\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]", char):
+            current_cell += char
+        else:
+            if current_cell:
+                display_cells.append(current_cell)
+            current_cell = char
+
+    if current_cell:
+        display_cells.append(current_cell)
+
+    return display_cells
+
+
 class Tokenizer:
     """
     Tokenizer class for a custom tokenizer.
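
As a quick illustration of what a display cell is, the sketch below (assuming a PyThaiNLP build that includes this commit) contrasts plain character iteration, which separates combining vowels and tone marks from their base characters, with display_cell_tokenize, which keeps each base character together with the marks stacked on it:

    from pythainlp.tokenize import display_cell_tokenize

    text = "กำลังใจ"
    print(list(text))                   # plain codepoints: combining marks appear as separate items
    print(display_cell_tokenize(text))  # each cell keeps a base character plus its attached marks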

pythainlp/tokenize/longest.py

Lines changed: 20 additions & 6 deletions
@@ -12,7 +12,7 @@

 """
 import re
-from typing import List, Union
+from typing import Dict, List, Union

 from pythainlp import thai_tonemarks
 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE

@@ -134,16 +134,25 @@ def __segment(self, text: str):
                 token_statuses.append(_KNOWN)
                 begin_pos += len(match)

-        return tokens
+        # Group consecutive spaces into one token
+        grouped_tokens = []
+        for token in tokens:
+            if token.isspace() and grouped_tokens and grouped_tokens[-1].isspace():
+                grouped_tokens[-1] += token
+            else:
+                grouped_tokens.append(token)
+
+        return grouped_tokens

     def tokenize(self, text: str) -> List[str]:
         tokens = self.__segment(text)
         return tokens


-def segment(
-    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
-) -> List[str]:
+_tokenizers: Dict[int, LongestMatchTokenizer] = {}
+
+
+def segment(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE) -> List[str]:
     """
     Dictionary-based longest matching word segmentation.

@@ -157,4 +166,9 @@ def segment(
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE

-    return LongestMatchTokenizer(custom_dict).tokenize(text)
+    global _tokenizers
+    custom_dict_ref_id = id(custom_dict)
+    if custom_dict_ref_id not in _tokenizers:
+        _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)
+
+    return _tokenizers[custom_dict_ref_id].tokenize(text)
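
The space-grouping step added to __segment is simple enough to show in isolation; the sketch below reproduces the same logic under a hypothetical group_spaces name (not a PyThaiNLP function). Note also that the tokenizer cache added here is keyed by id(custom_dict), so a cached LongestMatchTokenizer is reused only while the caller keeps passing the same Trie object.

    from typing import List

    def group_spaces(tokens: List[str]) -> List[str]:
        # Merge runs of whitespace-only tokens into a single token.
        grouped: List[str] = []
        for token in tokens:
            if token.isspace() and grouped and grouped[-1].isspace():
                grouped[-1] += token
            else:
                grouped.append(token)
        return grouped

    print(group_spaces(["คน", " ", " ", "ดี"]))  # ['คน', '  ', 'ดี']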

pythainlp/tokenize/newmm.py

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ def segment(
         # try to break by space first
         space_idx = sample.rfind(" ")
         if space_idx >= 0:
-            cut_pos = space_idx + 1
+            cut_pos = space_idx + 1 + _TEXT_SCAN_BEGIN
         else:
             tokens = list(_onecut(sample, custom_dict))
             token_max_idx = 0
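
The newmm fix reads as an index-translation correction: space_idx is found inside sample, which appears to be a slice of the longer input starting at _TEXT_SCAN_BEGIN, so a cut position applied to the full string has to add that offset back. A rough sketch of the arithmetic, with illustrative values only:

    _TEXT_SCAN_BEGIN = 300                       # assumed window start (illustrative value)
    txt = "x" * 1000                             # stands in for a long input string
    sample = txt[_TEXT_SCAN_BEGIN:_TEXT_SCAN_BEGIN + 200]
    space_idx = 50                               # index of a space found within `sample`
    cut_pos = space_idx + 1 + _TEXT_SCAN_BEGIN   # same position expressed as an index into `txt`
    assert txt[:cut_pos].endswith(sample[:space_idx + 1])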

pythainlp/tokenize/pyicu.py

Lines changed: 2 additions & 1 deletion
@@ -15,9 +15,10 @@

 from icu import BreakIterator, Locale

+bd = BreakIterator.createWordInstance(Locale("th"))

 def _gen_words(text: str) -> str:
-    bd = BreakIterator.createWordInstance(Locale("th"))
+    global bd
     bd.setText(text)
     p = bd.first()
     for q in bd:
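
The pyicu change hoists the BreakIterator to module scope so it is constructed once and then rebound to each new text. A minimal sketch of that reuse pattern, using only the calls visible in the diff (requires the PyICU package; the words generator name is ours):

    from icu import BreakIterator, Locale

    bd = BreakIterator.createWordInstance(Locale("th"))  # created once, at import time

    def words(text: str):
        # Rebind the shared iterator to the new text instead of creating a fresh one per call.
        bd.setText(text)
        start = bd.first()
        for end in bd:
            yield text[start:end]
            start = end

    print(list(words("ภาษาไทยง่ายนิดเดียว")))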

pythainlp/util/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 # SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
 # SPDX-FileType: SOURCE
 # SPDX-License-Identifier: Apache-2.0

@@ -26,6 +26,7 @@
     "is_native_thai",
     "isthai",
     "isthaichar",
+    "longest_common_subsequence",
     "nectec_to_ipa",
     "normalize",
     "now_reign_year",

@@ -95,6 +96,7 @@
     thai_to_eng,
 )
 from pythainlp.util.keywords import find_keyword, rank
+from pythainlp.util.lcs import longest_common_subsequence
 from pythainlp.util.normalize import (
     maiyamok,
     normalize,
