Commit 34d1c7b

Refactor wordsegment module with Segmenter class

1 parent 1999b26

6 files changed: +132 additions, -94 deletions

MANIFEST.in

Lines changed: 4 additions & 1 deletion

@@ -1 +1,4 @@
-include README.rst LICENSE requirements.txt
+include README.rst
+include LICENSE
+include unigrams.txt
+include bigrams.txt
File renamed without changes.

setup.py

Lines changed: 3 additions & 4 deletions

@@ -26,13 +26,11 @@ def run_tests(self):
     author='Grant Jenks',
     author_email='[email protected]',
     url='http://www.grantjenks.com/docs/wordsegment/',
-    py_modules=['wordsegment'],
-    packages=['wordsegment_data'],
-    package_data={'wordsegment_data': ['*.txt']},
+    packages=['wordsegment'],
+    include_package_data=True,
     tests_require=['tox'],
     cmdclass={'test': Tox},
     license='Apache 2.0',
-    install_requires=[],
     classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
@@ -47,5 +45,6 @@ def run_tests(self):
         'Programming Language :: Python :: 3.3',
         'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
     ],
 )
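
The two packaging changes above work together: MANIFEST.in now lists the n-gram data files individually, and include_package_data=True tells setuptools to ship whatever the manifest includes, replacing the old wordsegment_data package and its package_data entry. A minimal post-install sanity check, sketched under the assumption that the data files land next to the module source exactly as the new load() function (in the wordsegment.py diff below) expects:

    import os.path as op

    import wordsegment

    # Mirror the lookup in load(): the data files live beside the module.
    directory = op.dirname(op.realpath(wordsegment.__file__))

    for name in ('unigrams.txt', 'bigrams.txt'):
        assert op.exists(op.join(directory, name)), 'missing ' + name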
File renamed without changes.

wordsegment.py

Lines changed: 125 additions & 89 deletions

@@ -30,122 +30,156 @@
 import os.path as op
 import sys
 
-ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789')
-BIGRAMS = None
-DATADIR = op.join(op.dirname(op.realpath(__file__)), 'wordsegment_data')
-TOTAL = 1024908267229.0
-UNIGRAMS = None
 
-def clean(text):
-    "Return `text` lower-cased with non-alphanumeric characters removed."
-    return ''.join(letter for letter in text.lower() if letter in ALPHABET)
+class Segmenter(object):
+    alphabet = set('abcdefghijklmnopqrstuvwxyz0123456789')
 
-def divide(text, limit=24):
-    """Yield `(prefix, suffix)` pairs from `text` with `len(prefix)` not
-    exceeding `limit`.
 
-    """
-    for pos in range(1, min(len(text), limit) + 1):
-        yield (text[:pos], text[pos:])
+    def __init__(self, unigrams, bigrams, total):
+        self.unigrams = dict(unigrams)
+        self.bigrams = dict(bigrams)
+        self.total = float(total)
+        self.limit = max(map(len, self.unigrams))
 
-def load():
-    "Load unigram and bigram counts from disk."
-    global UNIGRAMS, BIGRAMS # pylint: disable=global-statement
-    UNIGRAMS = parse_file(op.join(DATADIR, 'unigrams.txt'))
-    BIGRAMS = parse_file(op.join(DATADIR, 'bigrams.txt'))
 
-def parse_file(filename):
-    "Read `filename` and parse tab-separated file of (word, count) pairs."
-    with io.open(filename, encoding='utf-8') as reader:
-        lines = (line.split('\t') for line in reader)
-        return dict((word, float(number)) for word, number in lines)
+    def score(self, word, previous=None):
+        "Score `word` in the context of `previous` word."
+        unigrams = self.unigrams
+        bigrams = self.bigrams
+        total = self.total
 
-def score(word, prev=None):
-    "Score a `word` in the context of the previous word, `prev`."
-    if UNIGRAMS is None and BIGRAMS is None:
-        load()
+        if previous is None:
+            if word in unigrams:
 
-    if prev is None:
-        if word in UNIGRAMS:
+                # Probability of the given word.
 
-            # Probability of the given word.
+                return unigrams[word] / total
+            else:
+                # Penalize words not found in the unigrams according
+                # to their length, a crucial heuristic.
 
-            return UNIGRAMS[word] / TOTAL
+                return 10.0 / (total * 10 ** len(word))
         else:
-            # Penalize words not found in the unigrams according
-            # to their length, a crucial heuristic.
+            bigram = '{0} {1}'.format(previous, word)
 
-            return 10.0 / (TOTAL * 10 ** len(word))
-    else:
-        bigram = '{0} {1}'.format(prev, word)
+            if bigram in bigrams and previous in unigrams:
 
-        if bigram in BIGRAMS and prev in UNIGRAMS:
+                # Conditional probability of the word given the previous
+                # word. The technical name is *stupid backoff* and it's
+                # not a probability distribution but it works well in
+                # practice.
 
-            # Conditional probability of the word given the previous
-            # word. The technical name is *stupid backoff* and it's
-            # not a probability distribution but it works well in
-            # practice.
+                return bigrams[bigram] / total / self.score(previous)
+            else:
+                # Fall back to using the unigram probability.
 
-            return BIGRAMS[bigram] / TOTAL / score(prev)
-        else:
-            # Fall back to using the unigram probability.
+                return self.score(word)
 
-            return score(word)
 
-def isegment(text):
-    "Return iterator of words that is the best segmentation of `text`."
+    def isegment(self, text):
+        "Return iterator of words that is the best segmentation of `text`."
+        memo = dict()
 
-    memo = dict()
+        def search(text, previous='<s>'):
+            "Return max of candidates matching `text` given `previous` word."
+            if text == '':
+                return 0.0, []
 
-    def search(text, prev='<s>'):
-        "Return max of candidates matching `text` given previous word, `prev`."
-        if text == '':
-            return 0.0, []
+            def candidates():
+                "Generator of (score, words) pairs for all divisions of text."
+                for prefix, suffix in self.divide(text):
+                    prefix_score = math.log10(self.score(prefix, previous))
 
-        def candidates():
-            "Generator of (score, words) pairs for all divisions of text."
-            for prefix, suffix in divide(text):
-                prefix_score = math.log10(score(prefix, prev))
+                    pair = (suffix, prefix)
+                    if pair not in memo:
+                        memo[pair] = search(suffix, prefix)
+                    suffix_score, suffix_words = memo[pair]
 
-                pair = (suffix, prefix)
-                if pair not in memo:
-                    memo[pair] = search(suffix, prefix)
-                suffix_score, suffix_words = memo[pair]
+                    yield (prefix_score + suffix_score, [prefix] + suffix_words)
 
-                yield (prefix_score + suffix_score, [prefix] + suffix_words)
+            return max(candidates())
 
-        return max(candidates())
+        # Avoid recursion limit issues by dividing text into chunks, segmenting
+        # those chunks and combining the results together. Chunks may divide
+        # words in the middle so prefix chunks with the last five words of the
+        # previous result.
 
-    # Avoid recursion limit issues by dividing text into chunks, segmenting
-    # those chunks and combining the results together. Chunks may divide words
-    # in the middle so prefix chunks with the last five words of the previous
-    # result.
+        clean_text = self.clean(text)
+        size = 250
+        prefix = ''
 
-    clean_text = clean(text)
-    size = 250
-    prefix = ''
+        for offset in range(0, len(clean_text), size):
+            chunk = clean_text[offset:(offset + size)]
+            _, chunk_words = search(prefix + chunk)
+            prefix = ''.join(chunk_words[-5:])
+            del chunk_words[-5:]
+            for word in chunk_words:
+                yield word
 
-    for offset in range(0, len(clean_text), size):
-        chunk = clean_text[offset:(offset + size)]
-        _, chunk_words = search(prefix + chunk)
-        prefix = ''.join(chunk_words[-5:])
-        del chunk_words[-5:]
-        for word in chunk_words:
+        _, prefix_words = search(prefix)
+
+        for word in prefix_words:
             yield word
 
-    _, prefix_words = search(prefix)
 
-    for word in prefix_words:
-        yield word
+    def segment(self, text):
+        "Return list of words that is the best segmentation of `text`."
+        return list(self.isegment(text))
+
+
+    def divide(self, text):
+        "Yield `(prefix, suffix)` pairs from `text`."
+        for pos in range(1, min(len(text), self.limit) + 1):
+            yield (text[:pos], text[pos:])
+
+
+    @classmethod
+    def clean(cls, text):
+        "Return `text` lower-cased with non-alphanumeric characters removed."
+        alphabet = cls.alphabet
+        text_lower = text.lower()
+        letters = (letter for letter in text_lower if letter in alphabet)
+        return ''.join(letters)
+
 
 def segment(text):
-    "Return a list of words that is the best segmentation of `text`."
-    return list(isegment(text))
+    "Return list of words that is the best segmentation of `text`."
+    segmenter = load()
+    return segmenter.segment(text)
+
+
+def isegment(text):
+    "Return iterator of words that is the best segmentation of `text`."
+    segmenter = load()
+    return segmenter.isegment(text)
+
+
+_cache = {}
+
+
+def load():
+    "Load unigram and bigram counts from disk and cache Segmenter instance."
+    if 'segmenter' not in _cache:
+        directory = op.dirname(op.realpath(__file__))
+        unigrams = _parse(op.join(directory, 'unigrams.txt'))
+        bigrams = _parse(op.join(directory, 'bigrams.txt'))
+        _cache['segmenter'] = Segmenter(unigrams, bigrams, 1024908267229.0)
+
+    return _cache['segmenter']
 
-def main(args=()):
-    """Command-line entry-point. Parses `args` into in-file and out-file then
-    reads lines from in-file, segments the lines, and writes the result to
-    out-file. Input and output default to stdin and stdout respectively.
+
+def _parse(filename):
+    "Read `filename` and parse tab-separated file of word and count pairs."
+    with io.open(filename, encoding='utf-8') as reader:
+        lines = (line.split('\t') for line in reader)
+        return dict((word, float(number)) for word, number in lines)
+
+
+def main(arguments=()):
+    """Command-line interface (CLI) entry-point. Parse `arguments` into in-file
+    and out-file then read lines from in-file, segment the lines, and write the
+    result to out-file. Input and output default to stdin and stdout
+    respectively.
 
     """
     import argparse
@@ -157,18 +191,20 @@ def main(args=()):
     parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
                         default=sys.stdout)
 
-    streams = parser.parse_args(args)
+    streams = parser.parse_args(arguments)
 
     for line in streams.infile:
         streams.outfile.write(' '.join(segment(line)))
         streams.outfile.write(os.linesep)
 
+
 if __name__ == '__main__':
     main(sys.argv[1:])
 
+
 __title__ = 'wordsegment'
-__version__ = '0.8.0'
-__build__ = 0x000800
+__version__ = '1.0.0'
+__build__ = 0x010000
 __author__ = 'Grant Jenks'
 __license__ = 'Apache 2.0'
-__copyright__ = 'Copyright 2016 Grant Jenks'
+__copyright__ = 'Copyright 2017 Grant Jenks'
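
For reference, a worked instance of the *stupid backoff* scoring that Segmenter.score implements above. The counts below are made up for illustration; the real data is loaded by load() with the fixed total of 1024908267229.0 shown in the diff:

    from wordsegment import Segmenter

    # Tiny illustrative counts, not the shipped unigram/bigram data.
    segmenter = Segmenter(
        unigrams={'a': 120.0, 'test': 50.0},
        bigrams={'a test': 20.0},
        total=400.0,
    )

    # Unigram case: count / total.
    assert segmenter.score('test') == 50.0 / 400.0

    # Bigram case: bigram count / total, divided by the previous word's
    # score. Not a true conditional probability, but it works in practice.
    assert segmenter.score('test', 'a') == (20.0 / 400.0) / (120.0 / 400.0)

    # Unknown words are penalized by length: 10 / (total * 10 ** len(word)).
    assert segmenter.score('qzx') == 10.0 / (400.0 * 10 ** 3)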

wordsegment_data/__init__.py

Whitespace-only changes.
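
Taken together, the refactor keeps the module-level API intact while exposing the new class. A short usage sketch (the sample strings are hypothetical, but segment, isegment, load, and the limit attribute all appear in the wordsegment.py diff above):

    import wordsegment

    # The first call builds and caches a Segmenter via load().
    print(wordsegment.segment('thisisatest'))
    # e.g. ['this', 'is', 'a', 'test']

    # isegment() yields words lazily, which suits very long inputs.
    for word in wordsegment.isegment('anotherexample'):
        print(word)

    # load() returns the cached instance for direct reuse or inspection.
    segmenter = wordsegment.load()
    print(segmenter.limit)  # length of the longest known unigram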
