|
5 | 5 | For humans, it's relatively easy to parse. This module makes it easy for
|
6 | 6 | machines too. Use `segment` to parse a phrase into its parts:
|
7 | 7 |
|
8 |
| ->>> from wordsegment import segment |
| 8 | +>>> from wordsegment import load, segment |
| 9 | +>>> load() |
9 | 10 | >>> segment('thisisatest')
|
10 | 11 | ['this', 'is', 'a', 'test']
|
11 | 12 |
|
|
32 | 33 |
|
33 | 34 |
|
34 | 35 | class Segmenter(object):
|
35 |
| - alphabet = set('abcdefghijklmnopqrstuvwxyz0123456789') |
| 36 | + """Segmenter |
36 | 37 |
|
| 38 | + Support for object-oriented programming and customization. |
37 | 39 |
|
38 |
| - def __init__(self, unigrams, bigrams, total): |
39 |
| - self.unigrams = dict(unigrams) |
40 |
| - self.bigrams = dict(bigrams) |
41 |
| - self.total = float(total) |
42 |
| - self.limit = max(map(len, self.unigrams)) |
| 40 | + """ |
| 41 | + ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789') |
| 42 | + UNIGRAMS_FILENAME = op.join( |
| 43 | + op.dirname(op.realpath(__file__)), |
| 44 | + 'unigrams.txt', |
| 45 | + ) |
| 46 | + BIGRAMS_FILENAME = op.join( |
| 47 | + op.dirname(op.realpath(__file__)), |
| 48 | + 'bigrams.txt', |
| 49 | + ) |
| 50 | + TOTAL = 1024908267229.0 |
| 51 | + LIMIT = 24 |
| 52 | + |
| 53 | + |
| 54 | + def __init__(self): |
| 55 | + self.unigrams = {} |
| 56 | + self.bigrams = {} |
| 57 | + self.total = 0.0 |
| 58 | + self.limit = 0 |
| 59 | + |
| 60 | + |
| 61 | + def load(self): |
| 62 | + "Load unigram and bigram counts from disk." |
| 63 | + self.unigrams.update(self.parse(self.UNIGRAMS_FILENAME)) |
| 64 | + self.bigrams.update(self.parse(self.BIGRAMS_FILENAME)) |
| 65 | + self.total = self.TOTAL |
| 66 | + self.limit = self.LIMIT |
| 67 | + |
| 68 | + |
| 69 | + @staticmethod |
| 70 | + def parse(filename): |
| 71 | + "Read `filename` and parse tab-separated file of word and count pairs." |
| 72 | + with io.open(filename, encoding='utf-8') as reader: |
| 73 | + lines = (line.split('\t') for line in reader) |
| 74 | + return dict((word, float(number)) for word, number in lines) |
43 | 75 |
|
44 | 76 |
|
45 | 77 | def score(self, word, previous=None):
|
@@ -136,43 +168,18 @@ def divide(self, text):
|
136 | 168 | @classmethod
|
137 | 169 | def clean(cls, text):
|
138 | 170 | "Return `text` lower-cased with non-alphanumeric characters removed."
|
139 |
| - alphabet = cls.alphabet |
| 171 | + alphabet = cls.ALPHABET |
140 | 172 | text_lower = text.lower()
|
141 | 173 | letters = (letter for letter in text_lower if letter in alphabet)
|
142 | 174 | return ''.join(letters)
|
143 | 175 |
|
144 | 176 |
|
145 |
| -def segment(text): |
146 |
| - "Return list of words that is the best segmenation of `text`." |
147 |
| - segmenter = load() |
148 |
| - return segmenter.segment(text) |
149 |
| - |
150 |
| - |
151 |
| -def isegment(text): |
152 |
| - "Return iterator of words that is the best segmenation of `text`." |
153 |
| - segmenter = load() |
154 |
| - return segmenter.isegment(text) |
155 |
| - |
156 |
| - |
157 |
| -_cache = {} |
158 |
| - |
159 |
| - |
160 |
| -def load(): |
161 |
| - "Load unigram and bigram counts from disk and cache Segmenter instance." |
162 |
| - if 'segmenter' not in _cache: |
163 |
| - directory = op.dirname(op.realpath(__file__)) |
164 |
| - unigrams = _parse(op.join(directory, 'unigrams.txt')) |
165 |
| - bigrams = _parse(op.join(directory, 'bigrams.txt')) |
166 |
| - _cache['segmenter'] = Segmenter(unigrams, bigrams, 1024908267229.0) |
167 |
| - |
168 |
| - return _cache['segmenter'] |
169 |
| - |
170 |
| - |
171 |
| -def _parse(filename): |
172 |
| - "Read `filename` and parse tab-separated file of word and count pairs." |
173 |
| - with io.open(filename, encoding='utf-8') as reader: |
174 |
| - lines = (line.split('\t') for line in reader) |
175 |
| - return dict((word, float(number)) for word, number in lines) |
| 177 | +_segmenter = Segmenter() |
| 178 | +bigrams = _segmenter.bigrams |
| 179 | +isegment = _segmenter.isegment |
| 180 | +load = _segmenter.load |
| 181 | +segment = _segmenter.segment |
| 182 | +unigrams = _segmenter.unigrams |
176 | 183 |
|
177 | 184 |
|
178 | 185 | def main(arguments=()):
|
|
0 commit comments