Skip to content

Commit 9bab566

Browse files
Copilot and wannaphong committed
Add test for issue #680 with ORST filtering documentation
- Added test_issue_680_orst_filtering test case
- Documents that compound words not in ORST won't be suggested
- Verifies spell checker only uses ORST words
- Tests specific case from issue #680: ปลาอินทรีย์ -> ปลาอินทรี
- Explains expected behavior when words aren't in ORST dictionary

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>
1 parent 0538ccb commit 9bab566

File tree

1 file changed

+38
-0
lines changed

1 file changed

+38
-0
lines changed

tests/core/test_spell.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,44 @@ def test_norvig_spell_checker(self):
9696
with self.assertRaises(TypeError):
9797
_ = NorvigSpellChecker(custom_dict=user_dict)
9898

99+
def test_issue_680_orst_filtering(self):
    """Check that the spell checker's dictionary is limited to ORST words.

    Issue #680 reported that the TNC dictionary contained misspelled
    words. The solution is to filter the Phupha dataset by
    thai_orst_words.

    Note: compound words such as 'ปลาอินทรีย์' and 'ปลาอินทรี' are not
    single entries in the ORST dictionary, so the spell checker will not
    suggest them. This is expected behavior when filtering to ORST words
    only.
    """
    from itertools import islice

    from pythainlp.corpus import thai_orst_words

    checker = NorvigSpellChecker()
    orst = thai_orst_words()

    # Only a sample of the dictionary is checked. islice takes the first
    # 1000 (word, value) entries without copying the whole dictionary
    # into a temporary list first.
    sampled_words = [
        word for word, _ in islice(checker.dictionary(), 1000)
    ]
    non_orst_words = [w for w in sampled_words if w not in orst]
    # Compare against an empty list (not its length) so that a failure
    # report shows exactly which words are missing from ORST.
    self.assertEqual(
        non_orst_words,
        [],
        "All words in spell checker should be from ORST",
    )

    # The specific case from issue #680: both 'ปลาอินทรีย์' (misspelled)
    # and 'ปลาอินทรี' (correct) are absent from ORST as compound words,
    # so neither can be offered as a suggestion.
    word_misspelled = "ปลาอินทรีย์"
    word_correct = "ปลาอินทรี"

    # Verify neither is in ORST.
    self.assertNotIn(word_misspelled, orst)
    self.assertNotIn(word_correct, orst)

    # With no candidate in the dictionary, the checker is expected to
    # hand the input word back unchanged.
    result = checker.correct(word_misspelled)
    self.assertEqual(result, word_misspelled)
136+
99137
def test_spell_sent(self):
100138
self.assertIsNotNone(spell_sent(SENT_TOKS))
101139
self.assertIsNotNone(spell_sent(SENT_TOKS, engine="pn"))

0 commit comments

Comments
 (0)