Skip to content

Commit 7e0037d

Browse files
committed
Refactor module to use object-oriented Segmenter class
1 parent 541f94e commit 7e0037d

File tree

1 file changed

+46
-39
lines changed

1 file changed

+46
-39
lines changed

wordsegment/__init__.py

Lines changed: 46 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
For humans, it's relatively easy to parse. This module makes it easy for
66
machines too. Use `segment` to parse a phrase into its parts:
77
8-
>>> from wordsegment import segment
8+
>>> from wordsegment import load, segment
9+
>>> load()
910
>>> segment('thisisatest')
1011
['this', 'is', 'a', 'test']
1112
@@ -32,14 +33,45 @@
3233

3334

3435
class Segmenter(object):
    """Segmenter

    Support for object-oriented programming and customization.

    """
    # Directory holding the bundled count files, resolved once instead
    # of repeating op.dirname(op.realpath(__file__)) per filename.
    _DATA_DIRECTORY = op.dirname(op.realpath(__file__))

    # Characters kept by `clean`; all other characters are stripped.
    ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789')
    UNIGRAMS_FILENAME = op.join(_DATA_DIRECTORY, 'unigrams.txt')
    BIGRAMS_FILENAME = op.join(_DATA_DIRECTORY, 'bigrams.txt')
    # Defaults copied onto the instance by `load`: the count total used
    # for scoring and the word-length limit (presumably the longest
    # word in the unigram data -- TODO confirm against unigrams.txt).
    TOTAL = 1024908267229.0
    LIMIT = 24

    def __init__(self):
        """Create an empty segmenter; call `load` before segmenting."""
        self.unigrams = {}
        self.bigrams = {}
        self.total = 0.0
        self.limit = 0

    def load(self):
        """Load unigram and bigram counts from disk.

        Updates the existing dicts in place (so module-level aliases of
        `unigrams`/`bigrams` keep observing the loaded data) and copies
        the class-level TOTAL and LIMIT defaults onto the instance.
        """
        self.unigrams.update(self.parse(self.UNIGRAMS_FILENAME))
        self.bigrams.update(self.parse(self.BIGRAMS_FILENAME))
        self.total = self.TOTAL
        self.limit = self.LIMIT

    @staticmethod
    def parse(filename):
        """Read `filename` and parse tab-separated word/count pairs.

        Returns a dict mapping each word to its count as a float.
        """
        with io.open(filename, encoding='utf-8') as reader:
            pairs = (line.split('\t') for line in reader)
            # float() tolerates the trailing newline left on the count.
            return {word: float(number) for word, number in pairs}
4375

4476

4577
def score(self, word, previous=None):
@@ -136,43 +168,18 @@ def divide(self, text):
136168
@classmethod
def clean(cls, text):
    """Return `text` lower-cased with non-alphanumeric characters removed.

    Keeps only the characters present in `cls.ALPHABET` (ascii
    lowercase letters and digits); everything else is dropped.
    """
    allowed = cls.ALPHABET
    lowered = text.lower()
    return ''.join(char for char in lowered if char in allowed)
143175

144176

145-
def segment(text):
146-
"Return list of words that is the best segmenation of `text`."
147-
segmenter = load()
148-
return segmenter.segment(text)
149-
150-
151-
def isegment(text):
152-
"Return iterator of words that is the best segmenation of `text`."
153-
segmenter = load()
154-
return segmenter.isegment(text)
155-
156-
157-
_cache = {}
158-
159-
160-
def load():
161-
"Load unigram and bigram counts from disk and cache Segmenter instance."
162-
if 'segmenter' not in _cache:
163-
directory = op.dirname(op.realpath(__file__))
164-
unigrams = _parse(op.join(directory, 'unigrams.txt'))
165-
bigrams = _parse(op.join(directory, 'bigrams.txt'))
166-
_cache['segmenter'] = Segmenter(unigrams, bigrams, 1024908267229.0)
167-
168-
return _cache['segmenter']
169-
170-
171-
def _parse(filename):
172-
"Read `filename` and parse tab-separated file of word and count pairs."
173-
with io.open(filename, encoding='utf-8') as reader:
174-
lines = (line.split('\t') for line in reader)
175-
return dict((word, float(number)) for word, number in lines)
177+
# Functional API: one shared Segmenter instance backs the module-level
# helpers.  `bigrams` and `unigrams` alias the instance's count dicts;
# `load` updates those dicts in place, so the aliases see loaded data.
_segmenter = Segmenter()

load = _segmenter.load
segment = _segmenter.segment
isegment = _segmenter.isegment
unigrams = _segmenter.unigrams
bigrams = _segmenter.bigrams
176183

177184

178185
def main(arguments=()):

0 commit comments

Comments
 (0)