Skip to content

Commit e238e84

Browse files
authored
Merge pull request #1057 from PyThaiNLP/wannaphong/add-misspell-command
Add misspell command to CLI
2 parents ef73fdc + 3367c03 commit e238e84

File tree

5 files changed

+103
-1
lines changed

5 files changed

+103
-1
lines changed

docs/notes/command_line.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,15 @@ You can use some thainlp functions directly from command line.
9494
word_level:precision 0.8173
9595
word_level:recall 0.8314
9696

97+
**Misspell**::
98+
99+
thainlp misspell --file <input_file> [--seed <seed>] [--misspell-ratio <ratio>] [--output <output_file>]
100+
101+
*Example*::
102+
103+
$ thainlp misspell --file ./some/data.txt --seed=1 --misspell-ratio 0.05
104+
# output file: ./some/data-misspelled-r0.05-seed1.txt
105+
97106
**Help**::
98107

99108
thainlp --help

pythainlp/cli/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8")
1313

1414
# a command should start with a verb when possible
15-
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"])
15+
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark", "misspell"])
1616

1717
CLI_NAME = "thainlp"
1818

pythainlp/cli/misspell.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
3+
# SPDX-FileType: SOURCE
4+
# SPDX-License-Identifier: Apache-2.0
5+
6+
import argparse
7+
import os
8+
import random
9+
10+
from pythainlp.tools.misspell import misspell
11+
12+
13+
class App:
    """Command-line handler for ``thainlp misspell``.

    Reads a UTF-8 text file, passes every line through ``misspell`` and
    writes the resulting lines to an output file.
    """

    def __init__(self, argv):
        """Parse the command line and run the misspell job.

        :param argv: full command line, for example
            ``["thainlp", "misspell", "--file", "in.txt"]``; the first two
            items (program and sub-command name) are skipped.
        :raises SystemExit: raised by ``argparse`` when the arguments are
            invalid, e.g. when the required ``--file`` option is missing.
        """
        parser = argparse.ArgumentParser(
            prog="misspell",
            description="Generate misspelled texts from a given file.",
            usage=(
                "thainlp misspell --file <input_file> [--seed <seed>] "
                "[--misspell-ratio <ratio>] [--output <output_file>]\n\n"
                "Example:\n\n"
                "thainlp misspell --file ./some/data.txt --seed=1 "
                "--misspell-ratio 0.05\n\n"
                "--"
            ),
        )
        parser.add_argument(
            "--file",
            type=str,
            required=True,
            help="Path to the input file",
        )
        parser.add_argument(
            "--seed",
            type=int,
            default=None,
            help="Random seed for reproducibility",
        )
        parser.add_argument(
            "--misspell-ratio",
            type=float,
            default=0.05,
            help="Ratio of misspells per 100 characters",
        )
        parser.add_argument(
            "--output",
            type=str,
            default=None,
            help="Path to the output file",
        )

        args = parser.parse_args(argv[2:])

        if args.seed is not None:
            # NOTE(review): this seeds only the stdlib ``random`` module;
            # confirm that ``misspell`` draws its randomness from it,
            # otherwise --seed will not make the output reproducible.
            random.seed(args.seed)

        # The input is read completely before the output is opened, so the
        # command still works when --output points at the input file.
        with open(args.file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        misspelled_lines = [
            misspell(line, ratio=args.misspell_ratio) for line in lines
        ]

        output_path = args.output
        if output_path is None:
            output_path = self._default_output_path(
                args.file, args.misspell_ratio, args.seed
            )

        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(misspelled_lines)

    @staticmethod
    def _default_output_path(input_path, ratio, seed=None):
        """Build the output path used when ``--output`` is not given.

        The name is derived from the input path, e.g. ``data.txt`` with
        ratio 0.05 and seed 1 becomes ``data-misspelled-r0.05-seed1.txt``.
        The ``-seed<N>`` part is left out when no seed was supplied, so the
        file name never contains the literal text ``seedNone``.

        :param input_path: path of the input file
        :param ratio: misspell ratio, embedded in the name as given
        :param seed: random seed, or ``None`` when the run is unseeded
        :return: path of the default output file
        """
        base, ext = os.path.splitext(input_path)
        seed_suffix = "" if seed is None else f"-seed{seed}"
        return f"{base}-misspelled-r{ratio}{seed_suffix}{ext}"

tests/core/test_cli.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pythainlp.cli.soundex import App as SoundexApp
1212
from pythainlp.cli.tag import App as TagApp
1313
from pythainlp.cli.tokenize import App as TokenizeApp
14+
from pythainlp.cli.misspell import App as MisspellApp
1415

1516

1617
class CliTestCase(unittest.TestCase):
@@ -139,3 +140,25 @@ def test_cli_tokenize(self):
139140
]
140141
)
141142
)
143+
144+
def test_cli_misspell(self):
    """Check the ``misspell`` sub-command: argument errors and a real run."""
    # Local imports keep this test self-contained; they are only needed
    # for the temporary working directory used below.
    import os
    import shutil
    import tempfile

    self.assertTrue(hasattr(cli, "misspell"))

    # --file is required, so argparse exits with its usage-error code 2.
    with self.assertRaises(SystemExit) as ex:
        MisspellApp(["thainlp", "misspell"])
    self.assertEqual(ex.exception.code, 2)

    # Run on a copy of the fixture inside a temporary directory so the
    # generated output file does not end up in the repository's tests/data.
    with tempfile.TemporaryDirectory() as tmp_dir:
        input_path = os.path.join(tmp_dir, "text.txt")
        shutil.copy("./tests/data/text.txt", input_path)

        self.assertIsNotNone(
            MisspellApp(
                [
                    "thainlp",
                    "misspell",
                    "--file",
                    input_path,
                    "--seed",
                    "1",
                    "--misspell-ratio",
                    "0.05",
                ]
            )
        )

        # Without --output, the result is written next to the input file.
        expected_output = os.path.join(
            tmp_dir, "text-misspelled-r0.05-seed1.txt"
        )
        self.assertTrue(os.path.isfile(expected_output))

tests/data/text.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ผมไม่ชอบกินผัก ดังนั้นผมจึงมักจะเลือกทานอาหารที่มีเนื้อสัตว์เป็นส่วนใหญ่ อย่างไรก็ตาม ผมก็รู้ว่าการทานผักมีประโยชน์ต่อสุขภาพ ดังนั้นผมจึงพยายามทานผักบ้างในบางมื้อ แต่ผมก็ยังคงเลือกทานผักที่ผมชอบเท่านั้น อย่างเช่น ถั่วฝักยาว หรือ ถั่วฝักยาว ซึ่งผมคิดว่ามันก็เป็นผักที่อร่อยและมีประโยชน์ด้วย

0 commit comments

Comments
 (0)