
Commit e6ea7e2

Add nlpo3 to compact

- Add relevant nlpo3 test to testc
- Add notes on tests in main README
- Bump nlpo3 version to 1.3.0 (fixes the karan bug)
1 parent 93ecd91 commit e6ea7e2

7 files changed: +96 additions, −76 deletions

README.md

Lines changed: 16 additions & 1 deletion

@@ -76,6 +76,7 @@ pip install pythainlp[extra1,extra2,...]
 Possible `extras`:
 
 - `full` (install everything)
+- `compact` (install a stable and small subset of dependencies)
 - `attacut` (to support attacut, a fast and accurate tokenizer)
 - `benchmarks` (for [word tokenization benchmarking](tokenization-benchmark.md))
 - `icu` (for ICU, International Components for Unicode, support in transliteration and tokenization)
@@ -85,7 +86,8 @@ Possible `extras`:
 - `thai2rom` (for machine-learnt romanization)
 - `wordnet` (for Thai WordNet API)
 
-For dependency details, look at the `extras` variable in [`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py).
+For dependency details, look at the `extras` variable in
+[`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py).
 
 ## Data Directory
 
@@ -110,6 +112,19 @@ To show how to use:
 thainlp help
 ```
 
+## Testing and test suites
+
+We test core functionalities on all officially supported Python versions.
+
+Some functionality requiring extra dependencies may be tested less frequently
+due to potential version conflicts or incompatibilities between packages.
+
+Test cases are categorized into three groups: core, compact, and extra.
+You can find these tests in the [tests/](/tests/) directory.
+
+For more detailed information on testing, please refer to the tests README:
+[tests/README.md](./tests/README.md)
+
 ## Licenses
 
 | | License |
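
The new `compact` extra and the nlpo3 bump can be exercised together. Below is a minimal usage sketch, assuming PyThaiNLP is installed with the compact dependency set; the engine name and sample sentence are taken from the test diffs further down:

```python
# Minimal sketch, assuming an install that includes the compact
# dependencies (which now pull in nlpo3>=1.3.0).
from pythainlp.tokenize import word_tokenize

# engine="nlpo3" selects the nlpo3 tokenizer, as tested in tests/compact
print(word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="nlpo3"))
```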

requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 PyYAML>=5.4.1
+nlpo3>=1.3.0
 numpy>=1.22
 pyicu>=2.3
 python-crfsuite>=0.9.7

setup.py

Lines changed: 3 additions & 2 deletions

@@ -67,7 +67,7 @@
     "ipa": ["epitran>=1.1"],
     "ml": ["numpy>=1.22", "torch>=1.0.0"],
     "mt5": ["sentencepiece>=0.1.91", "transformers>=4.6.0"],
-    "nlpo3": ["nlpo3>=1.2.2"],
+    "nlpo3": ["nlpo3>=1.3.0"],
     "onnx": ["numpy>=1.22", "onnxruntime>=1.10.0", "sentencepiece>=0.1.91"],
     "oskut": ["oskut>=1.3"],
     "sefr_cut": ["sefr_cut>=1.1"],
@@ -103,6 +103,7 @@
     # Compact dependencies, this one matches requirements.txt
     "compact": [
         "PyYAML>=5.4.1",
+        "nlpo3>=1.3.0",
         "numpy>=1.22",
         "pyicu>=2.3",
         "python-crfsuite>=0.9.7",
@@ -119,7 +120,7 @@
         "fastcoref>=2.1.5",
         "gensim>=4.0.0",
         "khamyo>=0.2.0",
-        "nlpo3>=1.2.2",
+        "nlpo3>=1.3.0",
         "nltk>=3.3",
         "numpy>=1.22",
         "onnxruntime>=1.10.0",

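The `nlpo3` floor moves from 1.2.2 to 1.3.0 in both the `nlpo3` extra and the `compact` list. A hedged sketch for checking the installed version against the new floor, using only the standard library (a naive tuple comparison, not a full PEP 440 parser):

```python
# Minimal sketch: confirm the installed nlpo3 satisfies the new >=1.3.0 floor.
# importlib.metadata is stdlib in Python 3.8+; the comparison is deliberately naive.
from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("nlpo3")
except PackageNotFoundError:
    raise SystemExit("nlpo3 is not installed")

major, minor = (int(part) for part in installed.split(".")[:2])
assert (major, minor) >= (1, 3), f"nlpo3 {installed} is older than 1.3.0"
```
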
tests/README.md

Lines changed: 4 additions & 4 deletions

@@ -14,10 +14,10 @@ Tests are categorized into three groups: core, compact, and extra.
 ## Compact Tests (testc_*.py)
 
 - Run `unittest tests.compact`
-- Test a limited set of additional functionalities that rely on optional
-  dependencies specified in `requirements.txt`.
-- These dependencies are `PyYAML`, `numpy`, `pyicu`, `python-crfsuite`, and
-  `requests`.
+- Test a limited set of functionalities that rely on a stable and small subset
+  of optional dependencies specified in `requirements.txt`.
+- These dependencies are `PyYAML`, `nlpo3`, `numpy`, `pyicu`,
+  `python-crfsuite`, and `requests`.
 - Test with the latest two stable Python versions.
 
 ## Extra Tests (testx_*.py)
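
The first bullet above says to run `unittest tests.compact`; a minimal programmatic equivalent, assuming it is run from a checkout of the repository root (the discovery pattern follows the `testc_*.py` naming):

```python
# Minimal sketch: run the compact test group from the repo root,
# equivalent to `python -m unittest tests.compact`.
import unittest

suite = unittest.defaultTestLoader.discover("tests/compact", pattern="testc_*.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```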

tests/compact/testc_tokenize.py

Lines changed: 18 additions & 13 deletions

@@ -25,19 +25,6 @@
 )
 
 
-class WordTokenizeICUTestCase(unittest.TestCase):
-    def test_icu(self):
-        self.assertEqual(pyicu.segment(None), [])
-        self.assertEqual(pyicu.segment(""), [])
-        self.assertEqual(
-            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
-            ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
-        )
-
-    def test_word_tokenize_icu(self):
-        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
-
-
 class SentTokenizeCRFCutTestCase(unittest.TestCase):
     def test_sent_tokenize(self):
         # Use default engine (crfcut)
@@ -88,3 +75,21 @@ def test_subword_tokenize(self):
         self.assertNotIn(
             "า", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
         )
+
+
+class WordTokenizeICUTestCase(unittest.TestCase):
+    def test_icu(self):
+        self.assertEqual(pyicu.segment(None), [])
+        self.assertEqual(pyicu.segment(""), [])
+        self.assertEqual(
+            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
+            ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
+        )
+
+    def test_word_tokenize_icu(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
+
+
+class WordTokenizeNlpO3TestCase(unittest.TestCase):
+    def test_word_tokenize_nlpo3(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3"))
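
The relocated ICU tests and the new nlpo3 test both go through `word_tokenize`. For reference, a hedged sketch of calling nlpo3 directly, outside PyThaiNLP (the `load_dict`/`segment` calls follow the nlpo3 package's documented API; the dictionary file path is a placeholder you must supply):

```python
# Hedged sketch of nlpo3's own API, independent of pythainlp.
# "words_th.txt" is a placeholder: a newline-delimited word-list file.
from nlpo3 import load_dict, segment

load_dict("words_th.txt", "thai_dict")  # register a dictionary under a name
print(segment("ฉันรักภาษาไทย", "thai_dict"))
```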

tests/core/test_tokenize.py

Lines changed: 53 additions & 53 deletions

@@ -204,6 +204,59 @@
 SENT_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]
 
 
+class DetokenizeTestCase(unittest.TestCase):
+    """Detokenize and regrouping test cases"""
+
+    def test_word_detokenize(self):
+        self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว"
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
+            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]],
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]),
+            "ผมเลี้ยง 5 10 ตัว ๆ คนดี",
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]),
+            "ผมเลี้ยง 5 ตัว ๆ คนดี",
+        )
+        self.assertEqual(
+            word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]),
+            "ม่ายย ผมเลี้ยง 5 ตัว",
+        )
+
+    def test_numeric_data_format(self):
+        engines = ["newmm"]
+
+        for engine in engines:
+            self.assertIn(
+                "127.0.0.1",
+                word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine),
+            )
+
+            tokens = word_tokenize(
+                "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine
+            )
+            self.assertTrue(
+                any(value in tokens for value in ["12:12pm", "12:12"]),
+                msg=f"{engine}: {tokens}",
+            )
+            self.assertIn("11.11", tokens)
+
+            self.assertIn(
+                "1,234,567.89",
+                word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine),
+            )
+
+            tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine)
+            self.assertIn("2.5:1", tokens)
+            self.assertIn("5:2", tokens)
+
+
 class TokenizeTestCase(unittest.TestCase):
     def test_Tokenizer(self):
         _tokenizer = Tokenizer(DEFAULT_WORD_DICT_TRIE)
@@ -550,56 +603,3 @@ def test_tcc_p(self):
         # )
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())
-
-
-class DetokenizeTestCase(unittest.TestCase):
-    """Detokenize and regrouping test cases"""
-
-    def test_word_detokenize(self):
-        self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว"
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
-            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]],
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]),
-            "ผมเลี้ยง 5 10 ตัว ๆ คนดี",
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]),
-            "ผมเลี้ยง 5 ตัว ๆ คนดี",
-        )
-        self.assertEqual(
-            word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]),
-            "ม่ายย ผมเลี้ยง 5 ตัว",
-        )
-
-    def test_numeric_data_format(self):
-        engines = ["newmm"]
-
-        for engine in engines:
-            self.assertIn(
-                "127.0.0.1",
-                word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine),
-            )
-
-            tokens = word_tokenize(
-                "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine
-            )
-            self.assertTrue(
-                any(value in tokens for value in ["12:12pm", "12:12"]),
-                msg=f"{engine}: {tokens}",
-            )
-            self.assertIn("11.11", tokens)
-
-            self.assertIn(
-                "1,234,567.89",
-                word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine),
-            )
-
-            tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine)
-            self.assertIn("2.5:1", tokens)
-            self.assertIn("5:2", tokens)
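
The relocated `DetokenizeTestCase` pins down `word_detokenize`'s regrouping behavior. A minimal sketch of the spacing rule asserted above (import path assumed to match the test module's imports):

```python
# Minimal sketch of word_detokenize, mirroring an assertion above:
# a space is kept around the numeral token when words are regrouped.
from pythainlp.tokenize import word_detokenize

print(word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]))  # "ผมเลี้ยง 5 ตัว"
```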

tests/extra/testx_tokenize.py

Lines changed: 1 addition & 3 deletions

@@ -306,9 +306,7 @@ def test_nercut(self):
         self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut"))
 
 
-class WordTokenizeNlpO3TestCase(unittest.TestCase):
-    def test_word_tokenize_nlpo3(self):
-        self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3"))
+
 
 
 class WordTokenizeOSKutTestCase(unittest.TestCase):
