Skip to content

Commit 01e81c1

Browse files
authored
Merge pull request #976 from bact/unittest-testx
Add Compact Tests (testc)
2 parents 2958632 + 976bea8 commit 01e81c1

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

46 files changed

+426
-218
lines changed

.github/workflows/unittest.yml

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Unit test and coverage
1+
name: Unit test
22

33
on:
44
push:
@@ -23,9 +23,10 @@ jobs:
2323
runs-on: ${{ matrix.os }}
2424
env:
2525
PYICU_WIN_VER: 2.14
26-
INSTALL_PYICU_WIN: false
2726
INSTALL_TORCH: false
2827
INSTALL_FULL_DEPS: false
28+
PYTHON_VERSION_LATEST: "3.13"
29+
PYTHON_VERSION_LATEST_2: "3.12"
2930

3031
steps:
3132
- name: Checkout
@@ -52,7 +53,7 @@ jobs:
5253
echo "ICU_VER=${ICU_VER}"
5354
echo "ICU_VER=${ICU_VER}" >> "${GITHUB_ENV}"
5455
- name: Install PyICU (Windows)
55-
if: startsWith(matrix.os, 'windows-') && env.INSTALL_PYICU_WIN == 'true'
56+
if: startsWith(matrix.os, 'windows-') && (matrix.python-version == '3.12' || matrix.python-version == '3.13')
5657
shell: powershell
5758
run: |
5859
$PYTHON_WIN_VER = "${{ matrix.python-version }}"
@@ -66,29 +67,42 @@ jobs:
6667
# If torch for the platform is not available in PyPI, use this command:
6768
# pip install "<torch_wheel_url>"
6869
# Get wheel URL from http://download.pytorch.org/whl/torch/
69-
- name: Install dependencies
70+
- name: Install dependencies from docker_requirements.txt
7071
if: env.INSTALL_FULL_DEPS == 'true'
7172
env:
7273
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
7374
run: pip install -r docker_requirements.txt
74-
- name: Install PyThaiNLP
75-
env:
76-
PYTHONIOENCODING: utf-8
75+
- name: Install PyThaiNLP + dependencies (minimum)
76+
if: matrix.python-version != env.PYTHON_VERSION_LATEST && matrix.python-version != env.PYTHON_VERSION_LATEST_2
7777
run: pip install .
78+
- name: Install PyThaiNLP + dependencies (compact)
79+
if: matrix.python-version == env.PYTHON_VERSION_LATEST || matrix.python-version == env.PYTHON_VERSION_LATEST_2
80+
run: pip install ".[compact]"
7881
# If you want to install a safe small set of optional dependencies, use:
79-
# pip install .[compact]
80-
# "compact" includes numpy, pyicu, and python-crfsuite.
82+
# pip install ".[compact]"
8183
# We can gradually run more test cases by installing more optional
8284
# dependencies. But we should also consider to reduce the number
8385
# of dependencies to avoid the conflict between dependencies.
8486
# See: https://github.com/PyThaiNLP/pythainlp/issues/935
85-
- name: Unit test and code coverage
86-
run: coverage run -m unittest tests
87-
# Use 'unittest tests' instead of 'unittest discover' to avoid loading
88-
# tests with external imports.
87+
- name: Unit test (core)
88+
if: matrix.python-version != env.PYTHON_VERSION_LATEST && matrix.python-version != env.PYTHON_VERSION_LATEST_2
89+
env:
90+
PYTHONIOENCODING: utf-8
91+
run: coverage run -m unittest tests.core
92+
- name: Unit test (core + compact)
93+
if: matrix.python-version == env.PYTHON_VERSION_LATEST || matrix.python-version == env.PYTHON_VERSION_LATEST_2
94+
env:
95+
PYTHONIOENCODING: utf-8
96+
run: coverage run -m unittest tests.core tests.compact
97+
# Only test "compact" set with the latest two stable Python versions.
98+
# Use 'unittest <test_module>' instead of 'unittest discover' to avoid
99+
# loading tests with dependencies more than expected.
89100
# Test cases loaded is defined in __init__.py in the tests directory.
101+
# See also tests/README.md
90102
- name: Coverage report
103+
if: matrix.python-version == env.PYTHON_VERSION_LATEST
91104
env:
92105
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
93106
COVERALLS_SERVICE_NAME: github
94107
run: coveralls
108+
# Only submit a report from the latest Python version

pythainlp/util/strftime.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55
Thai date/time formatting.
66
"""
7+
78
import warnings
89
from datetime import datetime
910
from string import digits
@@ -44,7 +45,7 @@ def _std_strftime(dt_obj: datetime, fmt_char: str) -> str:
4445
# in that case just use the fmt_char
4546
warnings.warn(
4647
(
47-
f"String format directive unknown/not support: %{fmt_char}"
48+
f"String format directive unknown/not support: %{fmt_char}\n"
4849
f"The system raises this ValueError: {err}"
4950
),
5051
UserWarning,
@@ -145,7 +146,8 @@ def _thai_strftime(dt_obj: datetime, fmt_char: str) -> str:
145146
)
146147
else:
147148
# No known localization available, use Python's default
148-
str_ = _std_strftime(dt_obj, fmt_char)
149+
# With a good _NEED_L10N and _EXTENSIONS, this should not happen
150+
str_ = _std_strftime(dt_obj, fmt_char) # pragma: no cover
149151

150152
return str_
151153

setup.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,12 @@
101101
"wtp": ["transformers>=4.6.0", "wtpsplit>=1.0.1"],
102102
"wunsen": ["wunsen>=0.0.1"],
103103
# Compact dependencies, this one matches requirements.txt
104-
"compact": ["PyYAML>=5.4.1", "numpy>=1.22", "pyicu>=2.3", "python-crfsuite>=0.9.7"],
104+
"compact": [
105+
"PyYAML>=5.4.1",
106+
"numpy>=1.22",
107+
"pyicu>=2.3",
108+
"python-crfsuite>=0.9.7",
109+
],
105110
# Full dependencies
106111
"full": [
107112
"PyYAML>=5.4.1",

tests/README.md

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,30 @@
11
# Test cases
22

3-
Tests are categorized into two groups: fundamental and extra.
3+
Tests are categorized into three groups: core, compact, and extra.
44

5-
## Fundamental Tests (test_*.py)
5+
## Core Tests (test_*.py)
66

77
- Focus on core functionalities.
8-
- Do not rely on additional dependencies beyond those listed in the
9-
`requirements` section of `setup.py`.
8+
- Do not rely on external dependencies beyond the standard library,
9+
except for `requests` which is used for corpus downloading.
10+
- Test with all officially supported Python versions
11+
(currently 3.9, 3.10, 3.11, 3.12, and 3.13).
12+
13+
## Compact Tests (testc_*.py)
14+
15+
- Test a limited set of additional functionalities that rely on optional
16+
dependencies specified in `requirements.txt`.
17+
- These dependencies are `PyYAML`, `numpy`, `pyicu`, `python-crfsuite`, and
18+
`requests`.
19+
- Test with the latest two stable Python versions.
1020

1121
## Extra Tests (testx_*.py)
1222

1323
- Explore functionalities that rely on optional dependencies specified in the
1424
`extras` section of `setup.py`.
15-
- These dependencies might include libraries like `nltk`, `pycrfsuite`, or
16-
`torch`.
25+
- These dependencies might include libraries like `gensim`, `nltk`, or `torch`.
26+
- Due to dependency complexities, these functionalities are not part of the
27+
automated test suite and will not be tested in the CI/CD pipeline.
1728

1829
## Default Test Suite
1930

tests/__init__.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,8 @@
1111

1212
# Names of module to be tested
1313
test_packages: list[str] = [
14-
"tests.test_ancient",
15-
"tests.test_cli",
16-
"tests.test_corpus",
17-
"tests.test_khavee",
18-
"tests.test_morpheme",
19-
"tests.test_soundex",
20-
"tests.test_spell",
21-
"tests.test_tag",
22-
"tests.test_tokenize",
23-
"tests.test_tools",
24-
"tests.test_transliterate",
25-
"tests.test_util",
14+
"tests.core",
15+
"tests.compact",
2616
]
2717

2818

tests/compact/__init__.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""
5+
Unit test. Compact version.
6+
7+
Test functions that require "compact" dependencies (see setup.py).
8+
"""
9+
10+
from unittest import TestLoader, TestSuite
11+
12+
# Names of module to be tested
13+
test_packages: list[str] = [
14+
"tests.compact.testc_tag",
15+
"tests.compact.testc_tokenize",
16+
"tests.compact.testc_util",
17+
]
18+
19+
20+
def load_tests(
21+
loader: TestLoader, standard_tests: TestSuite, pattern: str
22+
) -> TestSuite:
23+
"""Load test protocol
24+
See: https://docs.python.org/3/library/unittest.html#id1
25+
"""
26+
suite = TestSuite()
27+
for test_package in test_packages:
28+
tests = loader.loadTestsFromName(test_package)
29+
suite.addTests(tests)
30+
return suite
31+
32+
33+
if __name__ == "__main__":
34+
import unittest
35+
36+
unittest.main()

tests/compact/testc_tag.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import unittest
6+
7+
from pythainlp.tag import chunk_parse, pos_tag
8+
9+
10+
class TagTestCase(unittest.TestCase):
11+
def test_chunk_parse(self):
12+
tokens = ["ผม", "รัก", "คุณ"]
13+
14+
w_p = pos_tag(tokens, engine="perceptron", corpus="orchid")
15+
self.assertIsNotNone(chunk_parse(w_p))

tests/compact/testc_tokenize.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import unittest
6+
7+
from pythainlp.tokenize import (
8+
pyicu,
9+
sent_tokenize,
10+
subword_tokenize,
11+
word_tokenize,
12+
)
13+
14+
from ..core.test_tokenize import (
15+
SENT_1,
16+
SENT_1_TOKS,
17+
SENT_2,
18+
SENT_2_TOKS,
19+
SENT_3,
20+
SENT_3_TOKS,
21+
SENT_4,
22+
TEXT_1,
23+
)
24+
25+
26+
# Tests for functions that need "compact" dependencies
27+
class TokenizeTestCaseCompact(unittest.TestCase):
28+
def test_icu(self):
29+
self.assertEqual(pyicu.segment(None), [])
30+
self.assertEqual(pyicu.segment(""), [])
31+
self.assertEqual(
32+
word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
33+
["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
34+
)
35+
36+
def test_sent_tokenize(self):
37+
# Use default engine (crfcut)
38+
self.assertEqual(sent_tokenize(None), [])
39+
self.assertEqual(sent_tokenize(""), [])
40+
self.assertEqual(
41+
sent_tokenize(SENT_1),
42+
SENT_1_TOKS,
43+
)
44+
self.assertEqual(
45+
sent_tokenize(SENT_2),
46+
SENT_2_TOKS,
47+
)
48+
self.assertEqual(
49+
sent_tokenize(SENT_3),
50+
SENT_3_TOKS,
51+
)
52+
53+
self.assertEqual(
54+
sent_tokenize(SENT_1, engine="crfcut"),
55+
SENT_1_TOKS,
56+
)
57+
self.assertEqual(
58+
sent_tokenize(SENT_2, engine="crfcut"),
59+
SENT_2_TOKS,
60+
)
61+
self.assertEqual(
62+
sent_tokenize(SENT_3, engine="crfcut"),
63+
SENT_3_TOKS,
64+
)
65+
self.assertEqual(
66+
sent_tokenize(SENT_4, engine="crfcut"),
67+
[["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
68+
)
69+
70+
def test_subword_tokenize(self):
71+
self.assertEqual(subword_tokenize(None, engine="han_solo"), [])
72+
self.assertEqual(
73+
subword_tokenize("แมวกินปลา", engine="han_solo"),
74+
["แมว", "กิน", "ปลา"],
75+
)
76+
self.assertIn(
77+
"ดาว", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
78+
)
79+
80+
self.assertNotIn(
81+
"า", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
82+
)
83+
84+
def test_word_tokenize_icu(self):
85+
self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,25 +8,14 @@
88

99
import unittest
1010

11-
from pythainlp.util import rhyme, thai_word_tone_detector
1211
from pythainlp.util.spell_words import spell_word
1312

1413

1514
class UtilTestCaseX(unittest.TestCase):
16-
def test_rhyme(self):
17-
self.assertIsInstance(rhyme("แมว"), list)
18-
self.assertTrue(len(rhyme("แมว")) > 2)
19-
2015
def test_spell_word(self):
2116
self.assertEqual(spell_word("เสือ"), ["สอ", "เอือ", "เสือ"])
2217
self.assertEqual(spell_word("เสื้อ"), ["สอ", "เอือ", "ไม้โท", "เสื้อ"])
2318
self.assertEqual(spell_word("คน"), ["คอ", "นอ", "คน"])
2419
self.assertEqual(
2520
spell_word("คนดี"), ["คอ", "นอ", "คน", "ดอ", "อี", "ดี", "คนดี"]
2621
)
27-
28-
def test_thai_word_tone_detector(self):
29-
self.assertIsNotNone(thai_word_tone_detector("คนดี"))
30-
self.assertEqual(
31-
thai_word_tone_detector("ราคา"), [("รา", "m"), ("คา", "m")]
32-
)

tests/core/__init__.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""
5+
Unit test.
6+
7+
Each file in tests/ is for each main package.
8+
"""
9+
10+
from unittest import TestLoader, TestSuite
11+
12+
# Names of module to be tested
13+
test_packages: list[str] = [
14+
"tests.core.test_ancient",
15+
"tests.core.test_cli",
16+
"tests.core.test_corpus",
17+
"tests.core.test_khavee",
18+
"tests.core.test_morpheme",
19+
"tests.core.test_soundex",
20+
"tests.core.test_spell",
21+
"tests.core.test_tag",
22+
"tests.core.test_tokenize",
23+
"tests.core.test_tools",
24+
"tests.core.test_transliterate",
25+
"tests.core.test_util",
26+
]
27+
28+
29+
def load_tests(
30+
loader: TestLoader, standard_tests: TestSuite, pattern: str
31+
) -> TestSuite:
32+
"""Load test protocol
33+
See: https://docs.python.org/3/library/unittest.html#id1
34+
"""
35+
suite = TestSuite()
36+
for test_package in test_packages:
37+
tests = loader.loadTestsFromName(test_package)
38+
suite.addTests(tests)
39+
return suite
40+
41+
42+
if __name__ == "__main__":
43+
import unittest
44+
45+
unittest.main()

0 commit comments

Comments
 (0)