
Commit e6ea7e2

Add nlpo3 to compact

- Add relevant nlpo3 test to testc
- Add notes on tests in main README
- Bump nlpo3 version to 1.3.0 (fixes the karan bug)
1 parent 93ecd91 commit e6ea7e2

7 files changed: +96 additions, −76 deletions

README.md

Lines changed: 16 additions & 1 deletion

@@ -76,6 +76,7 @@ pip install pythainlp[extra1,extra2,...]
 Possible `extras`:
 
 - `full` (install everything)
+- `compact` (install a stable and small subset of dependencies)
 - `attacut` (to support attacut, a fast and accurate tokenizer)
 - `benchmarks` (for [word tokenization benchmarking](tokenization-benchmark.md))
 - `icu` (for ICU, International Components for Unicode, support in transliteration and tokenization)
@@ -85,7 +86,8 @@ Possible `extras`:
 - `thai2rom` (for machine-learnt romanization)
 - `wordnet` (for Thai WordNet API)
 
-For dependency details, look at the `extras` variable in [`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py).
+For dependency details, look at the `extras` variable in
+[`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py).
 
 ## Data Directory
 
@@ -110,6 +112,19 @@ To show how to use:
 thainlp help
 ```
 
+## Testing and test suites
+
+We test core functionalities on all officially supported Python versions.
+
+Some functionality requiring extra dependencies may be tested less frequently
+due to potential version conflicts or incompatibilities between packages.
+
+Test cases are categorized into three groups: core, compact, and extra.
+You can find these tests in the [tests/](/tests/) directory.
+
+For more detailed information on testing, please refer to the tests README:
+[tests/README.md](./tests/README.md)
+
 ## Licenses
 
 | | License |
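
The new `compact` extra and the nlpo3 bump can be exercised together. Below is a minimal usage sketch, assuming PyThaiNLP is installed with the compact dependency set; the engine name and sample sentence are taken from the test diffs further down:

```python
# Minimal sketch, assuming an install that includes the compact
# dependencies (which now pull in nlpo3>=1.3.0).
from pythainlp.tokenize import word_tokenize

# engine="nlpo3" selects the nlpo3 tokenizer, as tested in tests/compact
print(word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="nlpo3"))
```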

requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 PyYAML>=5.4.1
+nlpo3>=1.3.0
 numpy>=1.22
 pyicu>=2.3
 python-crfsuite>=0.9.7

setup.py

Lines changed: 3 additions & 2 deletions

@@ -67,7 +67,7 @@
     "ipa": ["epitran>=1.1"],
     "ml": ["numpy>=1.22", "torch>=1.0.0"],
     "mt5": ["sentencepiece>=0.1.91", "transformers>=4.6.0"],
-    "nlpo3": ["nlpo3>=1.2.2"],
+    "nlpo3": ["nlpo3>=1.3.0"],
     "onnx": ["numpy>=1.22", "onnxruntime>=1.10.0", "sentencepiece>=0.1.91"],
     "oskut": ["oskut>=1.3"],
     "sefr_cut": ["sefr_cut>=1.1"],
@@ -103,6 +103,7 @@
     # Compact dependencies, this one matches requirements.txt
     "compact": [
         "PyYAML>=5.4.1",
+        "nlpo3>=1.3.0",
         "numpy>=1.22",
         "pyicu>=2.3",
         "python-crfsuite>=0.9.7",
@@ -119,7 +120,7 @@
         "fastcoref>=2.1.5",
         "gensim>=4.0.0",
         "khamyo>=0.2.0",
-        "nlpo3>=1.2.2",
+        "nlpo3>=1.3.0",
         "nltk>=3.3",
         "numpy>=1.22",
         "onnxruntime>=1.10.0",

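The `nlpo3` floor moves from 1.2.2 to 1.3.0 in both the `nlpo3` extra and the `compact` list. A hedged sketch for checking the installed version against the new floor, using only the standard library (a naive tuple comparison, not a full PEP 440 parser):

```python
# Minimal sketch: confirm the installed nlpo3 satisfies the new >=1.3.0 floor.
# importlib.metadata is stdlib in Python 3.8+; the comparison is deliberately naive.
from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("nlpo3")
except PackageNotFoundError:
    raise SystemExit("nlpo3 is not installed")

major, minor = (int(part) for part in installed.split(".")[:2])
assert (major, minor) >= (1, 3), f"nlpo3 {installed} is older than 1.3.0"
```
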
tests/README.md

Lines changed: 4 additions & 4 deletions

@@ -14,10 +14,10 @@ Tests are categorized into three groups: core, compact, and extra.
 ## Compact Tests (testc_*.py)
 
 - Run `unittest tests.compact`
-- Test a limited set of additional functionalities that rely on optional
-  dependencies specified in `requirements.txt`.
-- These dependencies are `PyYAML`, `numpy`, `pyicu`, `python-crfsuite`, and
-  `requests`.
+- Test a limited set of functionalities that rely on a stable and small subset
+  of optional dependencies specified in `requirements.txt`.
+- These dependencies are `PyYAML`, `nlpo3`, `numpy`, `pyicu`,
+  `python-crfsuite`, and `requests`.
 - Test with the latest two stable Python versions.
 
 ## Extra Tests (testx_*.py)
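
The first bullet above says to run `unittest tests.compact`; a minimal programmatic equivalent, assuming it is run from a checkout of the repository root (the discovery pattern follows the `testc_*.py` naming):

```python
# Minimal sketch: run the compact test group from the repo root,
# equivalent to `python -m unittest tests.compact`.
import unittest

suite = unittest.defaultTestLoader.discover("tests/compact", pattern="testc_*.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```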

tests/compact/testc_tokenize.py

Lines changed: 18 additions & 13 deletions

@@ -25,19 +25,6 @@
 )
 
 
-class WordTokenizeICUTestCase(unittest.TestCase):
-    def test_icu(self):
-        self.assertEqual(pyicu.segment(None), [])
-        self.assertEqual(pyicu.segment(""), [])
-        self.assertEqual(
-            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
-            ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
-        )
-
-    def test_word_tokenize_icu(self):
-        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
-
-
 class SentTokenizeCRFCutTestCase(unittest.TestCase):
     def test_sent_tokenize(self):
         # Use default engine (crfcut)
@@ -88,3 +75,21 @@ def test_subword_tokenize(self):
         self.assertNotIn(
             "า", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
         )
+
+
+class WordTokenizeICUTestCase(unittest.TestCase):
+    def test_icu(self):
+        self.assertEqual(pyicu.segment(None), [])
+        self.assertEqual(pyicu.segment(""), [])
+        self.assertEqual(
+            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
+            ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
+        )
+
+    def test_word_tokenize_icu(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
+
+
+class WordTokenizeNlpO3TestCase(unittest.TestCase):
+    def test_word_tokenize_nlpo3(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3"))
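
The relocated ICU tests and the new nlpo3 test both go through `word_tokenize`. For reference, a hedged sketch of calling nlpo3 directly, outside PyThaiNLP (the `load_dict`/`segment` calls follow the nlpo3 package's documented API; the dictionary file path is a placeholder you must supply):

```python
# Hedged sketch of nlpo3's own API, independent of pythainlp.
# "words_th.txt" is a placeholder: a newline-delimited word-list file.
from nlpo3 import load_dict, segment

load_dict("words_th.txt", "thai_dict")  # register a dictionary under a name
print(segment("ฉันรักภาษาไทย", "thai_dict"))
```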

tests/core/test_tokenize.py

Lines changed: 53 additions & 53 deletions

@@ -204,6 +204,59 @@
 SENT_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]
 
 
+class DetokenizeTestCase(unittest.TestCase):
+    """Detokenize and regrouping test cases"""
+
+    def test_word_detokenize(self):
+        self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว"
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
+            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]],
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]),
+            "ผมเลี้ยง 5 10 ตัว ๆ คนดี",
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]),
+            "ผมเลี้ยง 5 ตัว ๆ คนดี",
+        )
+        self.assertEqual(
+            word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]),
+            "ม่ายย ผมเลี้ยง 5 ตัว",
+        )
+
+    def test_numeric_data_format(self):
+        engines = ["newmm"]
+
+        for engine in engines:
+            self.assertIn(
+                "127.0.0.1",
+                word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine),
+            )
+
+            tokens = word_tokenize(
+                "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine
+            )
+            self.assertTrue(
+                any(value in tokens for value in ["12:12pm", "12:12"]),
+                msg=f"{engine}: {tokens}",
+            )
+            self.assertIn("11.11", tokens)
+
+            self.assertIn(
+                "1,234,567.89",
+                word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine),
+            )
+
+            tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine)
+            self.assertIn("2.5:1", tokens)
+            self.assertIn("5:2", tokens)
+
+
 class TokenizeTestCase(unittest.TestCase):
     def test_Tokenizer(self):
         _tokenizer = Tokenizer(DEFAULT_WORD_DICT_TRIE)
@@ -550,56 +603,3 @@ def test_tcc_p(self):
         # )
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())
-
-
-class DetokenizeTestCase(unittest.TestCase):
-    """Detokenize and regrouping test cases"""
-
-    def test_word_detokenize(self):
-        self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว"
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
-            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]],
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]),
-            "ผมเลี้ยง 5 10 ตัว ๆ คนดี",
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]),
-            "ผมเลี้ยง 5 ตัว ๆ คนดี",
-        )
-        self.assertEqual(
-            word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]),
-            "ม่ายย ผมเลี้ยง 5 ตัว",
-        )
-
-    def test_numeric_data_format(self):
-        engines = ["newmm"]
-
-        for engine in engines:
-            self.assertIn(
-                "127.0.0.1",
-                word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine),
-            )
-
-            tokens = word_tokenize(
-                "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine
-            )
-            self.assertTrue(
-                any(value in tokens for value in ["12:12pm", "12:12"]),
-                msg=f"{engine}: {tokens}",
-            )
-            self.assertIn("11.11", tokens)
-
-            self.assertIn(
-                "1,234,567.89",
-                word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine),
-            )
-
-            tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine)
-            self.assertIn("2.5:1", tokens)
-            self.assertIn("5:2", tokens)
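
The relocated `DetokenizeTestCase` pins down `word_detokenize`'s regrouping behavior. A minimal sketch of the spacing rule asserted above (import path assumed to match the test module's imports):

```python
# Minimal sketch of word_detokenize, mirroring an assertion above:
# a space is kept around the numeral token when words are regrouped.
from pythainlp.tokenize import word_detokenize

print(word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]))  # "ผมเลี้ยง 5 ตัว"
```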

tests/extra/testx_tokenize.py

Lines changed: 1 addition & 3 deletions

@@ -306,9 +306,7 @@ def test_nercut(self):
         self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut"))
 
 
-class WordTokenizeNlpO3TestCase(unittest.TestCase):
-    def test_word_tokenize_nlpo3(self):
-        self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3"))
+
 
 
 class WordTokenizeOSKutTestCase(unittest.TestCase):
