Skip to content

Commit c530bdf

Browse files
committed
Add wordlist
1 parent f8de681 commit c530bdf

File tree

2 files changed: 17 additions and 1 deletion.

tests/test_coverage.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
11
import os
22
import sys
33
from .context import wordsegment
4-
from wordsegment import clean, load, main, isegment, segment, UNIGRAMS, BIGRAMS
4+
from wordsegment import (
5+
clean, load, main, isegment, segment, UNIGRAMS, BIGRAMS, WORDS,
6+
)
57

68
load()
79

@@ -95,3 +97,8 @@ def test_main():
9597
main(['tests/test.txt'])
9698
result = os.linesep.join(('choose spain', 'this is a test')) + os.linesep
9799
assert sys.stdout.getvalue() == result
100+
101+
def test_words():
102+
assert len(WORDS) > 0
103+
assert WORDS[0] == 'aa'
104+
assert WORDS[-1] == 'zzz'

wordsegment/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -49,13 +49,18 @@ class Segmenter(object):
4949
)
5050
TOTAL = 1024908267229.0
5151
LIMIT = 24
52+
WORDS_FILENAME = op.join(
53+
op.dirname(op.realpath(__file__)),
54+
'words.txt',
55+
)
5256

5357

5458
def __init__(self):
5559
self.unigrams = {}
5660
self.bigrams = {}
5761
self.total = 0.0
5862
self.limit = 0
63+
self.words = []
5964

6065

6166
def load(self):
@@ -64,6 +69,9 @@ def load(self):
6469
self.bigrams.update(self.parse(self.BIGRAMS_FILENAME))
6570
self.total = self.TOTAL
6671
self.limit = self.LIMIT
72+
with io.open(self.WORDS_FILENAME, encoding='utf-8') as reader:
73+
text = reader.read()
74+
self.words.extend(text.splitlines())
6775

6876

6977
@staticmethod
@@ -181,6 +189,7 @@ def clean(cls, text):
181189
segment = _segmenter.segment # pylint: disable=invalid-name
182190
UNIGRAMS = _segmenter.unigrams
183191
BIGRAMS = _segmenter.bigrams
192+
WORDS = _segmenter.words
184193

185194

186195
def main(arguments=()):

0 commit comments

Comments
 (0)