import os.path as op
import sys

-ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789')
-BIGRAMS = None
-DATADIR = op.join(op.dirname(op.realpath(__file__)), 'wordsegment_data')
-TOTAL = 1024908267229.0
-UNIGRAMS = None

-def clean(text):
-    "Return `text` lower-cased with non-alphanumeric characters removed."
-    return ''.join(letter for letter in text.lower() if letter in ALPHABET)
+class Segmenter(object):
+    alphabet = set('abcdefghijklmnopqrstuvwxyz0123456789')

-def divide(text, limit=24):
-    """Yield `(prefix, suffix)` pairs from `text` with `len(prefix)` not
-    exceeding `limit`.
-
-    """
-    for pos in range(1, min(len(text), limit) + 1):
-        yield (text[:pos], text[pos:])
+    def __init__(self, unigrams, bigrams, total):
+        self.unigrams = dict(unigrams)
+        self.bigrams = dict(bigrams)
+        self.total = float(total)
+        self.limit = max(map(len, self.unigrams))
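+        # `limit` is the length of the longest word in the unigram
+        # vocabulary; `divide` below uses it to cap candidate prefix
+        # length, replacing the old hard-coded limit of 24.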

-def load():
-    "Load unigram and bigram counts from disk."
-    global UNIGRAMS, BIGRAMS  # pylint: disable=global-statement
-    UNIGRAMS = parse_file(op.join(DATADIR, 'unigrams.txt'))
-    BIGRAMS = parse_file(op.join(DATADIR, 'bigrams.txt'))

-def parse_file(filename):
-    "Read `filename` and parse tab-separated file of (word, count) pairs."
-    with io.open(filename, encoding='utf-8') as reader:
-        lines = (line.split('\t') for line in reader)
-        return dict((word, float(number)) for word, number in lines)
+    def score(self, word, previous=None):
+        "Score `word` in the context of `previous` word."
+        unigrams = self.unigrams
+        bigrams = self.bigrams
+        total = self.total

-def score(word, prev=None):
-    "Score a `word` in the context of the previous word, `prev`."
-    if UNIGRAMS is None and BIGRAMS is None:
-        load()
+        if previous is None:
+            if word in unigrams:

-    if prev is None:
-        if word in UNIGRAMS:
+                # Probability of the given word.

-            # Probability of the given word.
+                return unigrams[word] / total
+            else:
+                # Penalize words not found in the unigrams according
+                # to their length, a crucial heuristic.
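+                # For example, an unseen 5-letter word scores
+                # 10.0 / (total * 10 ** 5), so each extra letter costs
+                # another factor of ten.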

-            return UNIGRAMS[word] / TOTAL
+                return 10.0 / (total * 10 ** len(word))
        else:
-            # Penalize words not found in the unigrams according
-            # to their length, a crucial heuristic.
+            bigram = '{0} {1}'.format(previous, word)

-            return 10.0 / (TOTAL * 10 ** len(word))
-    else:
-        bigram = '{0} {1}'.format(prev, word)
+            if bigram in bigrams and previous in unigrams:

-        if bigram in BIGRAMS and prev in UNIGRAMS:
+                # Conditional probability of the word given the previous
+                # word. The technical name is *stupid backoff* and it's
+                # not a probability distribution but it works well in
+                # practice.
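+                # For example, score('is', previous='this') evaluates
+                # bigrams['this is'] / total / score('this'), boosting
+                # word pairs that co-occur in the corpus.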

-            # Conditional probability of the word given the previous
-            # word. The technical name is *stupid backoff* and it's
-            # not a probability distribution but it works well in
-            # practice.
+                return bigrams[bigram] / total / self.score(previous)
+            else:
+                # Fall back to using the unigram probability.

-            return BIGRAMS[bigram] / TOTAL / score(prev)
-        else:
-            # Fall back to using the unigram probability.
+                return self.score(word)

-            return score(word)

-def isegment(text):
-    "Return iterator of words that is the best segmentation of `text`."
+    def isegment(self, text):
+        "Return iterator of words that is the best segmentation of `text`."
+        memo = dict()

-    memo = dict()
+        def search(text, previous='<s>'):
+            "Return max of candidates matching `text` given `previous` word."
+            if text == '':
+                return 0.0, []

-    def search(text, prev='<s>'):
-        "Return max of candidates matching `text` given previous word, `prev`."
-        if text == '':
-            return 0.0, []
+            def candidates():
+                "Generator of (score, words) pairs for all divisions of text."
+                for prefix, suffix in self.divide(text):
+                    prefix_score = math.log10(self.score(prefix, previous))

-        def candidates():
-            "Generator of (score, words) pairs for all divisions of text."
-            for prefix, suffix in divide(text):
-                prefix_score = math.log10(score(prefix, prev))
+                    pair = (suffix, prefix)
+                    if pair not in memo:
+                        memo[pair] = search(suffix, prefix)
+                    suffix_score, suffix_words = memo[pair]
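+                    # Memoizing on the (suffix, prefix) pair makes the
+                    # recursive search a dynamic program: each division
+                    # of the remaining text is scored at most once.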

-                pair = (suffix, prefix)
-                if pair not in memo:
-                    memo[pair] = search(suffix, prefix)
-                suffix_score, suffix_words = memo[pair]
+                    yield (prefix_score + suffix_score, [prefix] + suffix_words)

-                yield (prefix_score + suffix_score, [prefix] + suffix_words)
+            return max(candidates())

-        return max(candidates())
+        # Avoid recursion limit issues by dividing text into chunks, segmenting
+        # those chunks and combining the results together. Chunks may divide
+        # words in the middle so prefix chunks with the last five words of the
+        # previous result.
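+        # With size = 250 and single-character words at worst, the
+        # recursion depth stays well below CPython's default limit of
+        # 1000.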

-    # Avoid recursion limit issues by dividing text into chunks, segmenting
-    # those chunks and combining the results together. Chunks may divide words
-    # in the middle so prefix chunks with the last five words of the previous
-    # result.
+        clean_text = self.clean(text)
+        size = 250
+        prefix = ''

-    clean_text = clean(text)
-    size = 250
-    prefix = ''
+        for offset in range(0, len(clean_text), size):
+            chunk = clean_text[offset:(offset + size)]
+            _, chunk_words = search(prefix + chunk)
+            prefix = ''.join(chunk_words[-5:])
+            del chunk_words[-5:]
+            for word in chunk_words:
+                yield word

-    for offset in range(0, len(clean_text), size):
-        chunk = clean_text[offset:(offset + size)]
-        _, chunk_words = search(prefix + chunk)
-        prefix = ''.join(chunk_words[-5:])
-        del chunk_words[-5:]
-        for word in chunk_words:
+        _, prefix_words = search(prefix)
+
+        for word in prefix_words:
            yield word

-    _, prefix_words = search(prefix)

-    for word in prefix_words:
-        yield word
+    def segment(self, text):
+        "Return list of words that is the best segmentation of `text`."
+        return list(self.isegment(text))
+
+
+    def divide(self, text):
+        "Yield `(prefix, suffix)` pairs from `text`."
+        for pos in range(1, min(len(text), self.limit) + 1):
+            yield (text[:pos], text[pos:])
+
+
+    @classmethod
+    def clean(cls, text):
+        "Return `text` lower-cased with non-alphanumeric characters removed."
+        alphabet = cls.alphabet
+        text_lower = text.lower()
+        letters = (letter for letter in text_lower if letter in alphabet)
+        return ''.join(letters)
+

def segment(text):
-    "Return a list of words that is the best segmentation of `text`."
-    return list(isegment(text))
+    "Return list of words that is the best segmentation of `text`."
+    segmenter = load()
+    return segmenter.segment(text)
+
+
+def isegment(text):
+    "Return iterator of words that is the best segmentation of `text`."
+    segmenter = load()
+    return segmenter.isegment(text)
+
+
+_cache = {}
+
+
+def load():
+    "Load unigram and bigram counts from disk and cache Segmenter instance."
+    if 'segmenter' not in _cache:
+        directory = op.dirname(op.realpath(__file__))
+        unigrams = _parse(op.join(directory, 'unigrams.txt'))
+        bigrams = _parse(op.join(directory, 'bigrams.txt'))
+        _cache['segmenter'] = Segmenter(unigrams, bigrams, 1024908267229.0)
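+        # The total, 1024908267229, is the token count of the Google
+        # Web Trillion Word Corpus from which the bundled unigram and
+        # bigram counts are derived.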
+
+    return _cache['segmenter']

-def main(args=()):
-    """Command-line entry-point. Parses `args` into in-file and out-file then
-    reads lines from in-file, segments the lines, and writes the result to
-    out-file. Input and output default to stdin and stdout respectively.
+
+def _parse(filename):
+    "Read `filename` and parse tab-separated file of word and count pairs."
+    with io.open(filename, encoding='utf-8') as reader:
+        lines = (line.split('\t') for line in reader)
+        return dict((word, float(number)) for word, number in lines)
+
+
+def main(arguments=()):
+    """Command-line interface (CLI) entry-point. Parse `arguments` into in-file
+    and out-file then read lines from in-file, segment the lines, and write the
+    result to out-file. Input and output default to stdin and stdout
+    respectively.

    """
    import argparse
@@ -157,18 +191,20 @@ def main(args=()):
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout)

-    streams = parser.parse_args(args)
+    streams = parser.parse_args(arguments)

    for line in streams.infile:
        streams.outfile.write(' '.join(segment(line)))
        streams.outfile.write(os.linesep)
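+        # Example invocation (illustrative):
+        #   $ echo thisisatest | python -m wordsegment
+        #   this is a test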

+
if __name__ == '__main__':
    main(sys.argv[1:])

+
__title__ = 'wordsegment'
-__version__ = '0.8.0'
-__build__ = 0x000800
+__version__ = '1.0.0'
+__build__ = 0x010000

__author__ = 'Grant Jenks'
__license__ = 'Apache 2.0'
-__copyright__ = 'Copyright 2016 Grant Jenks'
+__copyright__ = 'Copyright 2017 Grant Jenks'