1
- # -*- coding: utf-8 -*-
2
-
3
- """
4
- English Word Segmentation in Python
1
+ """English Word Segmentation in Python
5
2
6
3
Word segmentation is the process of dividing a phrase without spaces back
7
4
into its constituent parts. For example, consider a phrase like "thisisatest".
18
15
http://norvig.com/ngrams/ under the names count_1w.txt and count_2w.txt
19
16
respectively.
20
17
21
- Copyright (c) 2015 by Grant Jenks
18
+ Copyright (c) 2016 by Grant Jenks
22
19
23
20
Based on code from the chapter "Natural Language Corpus Data"
24
21
from the book "Beautiful Data" (Segaran and Hammerbacher, 2009)
25
22
http://oreilly.com/catalog/9780596157111/
26
23
27
24
Original Copyright (c) 2008-2009 by Peter Norvig
25
+
28
26
"""
29
27
28
+ import io
29
+ import math
30
+ import os .path as op
30
31
import sys
31
- import codecs
32
- from os .path import join , dirname , realpath
33
- from math import log10
34
- from functools import wraps
35
-
36
32
37
33
# Characters kept by `clean`; every other character is stripped from input.
ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789')
# Bigram counts; None until lazily loaded from disk by `load`.
BIGRAMS = None
# Directory containing the unigrams.txt / bigrams.txt data files.
DATADIR = op.join(op.dirname(op.realpath(__file__)), 'wordsegment_data')
# Total token count of the source corpus; used as the probability denominator.
TOTAL = 1024908267229.0
# Unigram counts; None until lazily loaded from disk by `load`.
UNIGRAMS = None
38
38
39
- if sys .hexversion < 0x03000000 :
40
- range = xrange
41
-
42
- def parse_file (filename ):
43
- "Read `filename` and parse tab-separated file of (word, count) pairs."
44
- with codecs .open (filename , 'r' , 'utf-8' ) as fptr :
45
- lines = (line .split ('\t ' ) for line in fptr )
46
- return dict ((word , float (number )) for word , number in lines )
47
-
48
- basepath = join (dirname (realpath (__file__ )), 'wordsegment_data' )
49
- unigram_counts = None
50
- bigram_counts = None
39
def clean(text):
    "Return `text` lower-cased with non-alphanumeric characters removed."
    lowered = text.lower()
    return ''.join(filter(ALPHABET.__contains__, lowered))
51
42
52
43
def divide(text, limit=24):
    """Yield `(prefix, suffix)` pairs from `text` with `len(prefix)` not
    exceeding `limit`.

    """
    longest = min(len(text), limit)
    pos = 1
    while pos <= longest:
        yield text[:pos], text[pos:]
        pos += 1
59
50
60
- TOTAL = 1024908267229.0
51
def load():
    "Load unigram and bigram counts from disk."
    global UNIGRAMS, BIGRAMS  # pylint: disable=global-statement
    unigram_path = op.join(DATADIR, 'unigrams.txt')
    bigram_path = op.join(DATADIR, 'bigrams.txt')
    UNIGRAMS = parse_file(unigram_path)
    BIGRAMS = parse_file(bigram_path)
56
+
57
def parse_file(filename):
    "Read `filename` and parse tab-separated file of (word, count) pairs."
    counts = {}
    with io.open(filename, encoding='utf-8') as reader:
        for line in reader:
            word, number = line.split('\t')
            counts[word] = float(number)
    return counts
61
62
62
63
def score (word , prev = None ):
63
64
"Score a `word` in the context of the previous word, `prev`."
64
- global unigram_counts , bigram_counts
65
-
66
- if unigram_counts is None :
67
- unigram_counts = parse_file (join (basepath , 'unigrams.txt' ))
68
-
69
- if bigram_counts is None :
70
- bigram_counts = parse_file (join (basepath , 'bigrams.txt' ))
65
+ if UNIGRAMS is None and BIGRAMS is None :
66
+ load ()
71
67
72
68
if prev is None :
73
- if word in unigram_counts :
69
+ if word in UNIGRAMS :
74
70
75
71
# Probability of the given word.
76
72
77
- return unigram_counts [word ] / TOTAL
73
+ return UNIGRAMS [word ] / TOTAL
78
74
else :
79
75
# Penalize words not found in the unigrams according
80
76
# to their length, a crucial heuristic.
@@ -83,35 +79,33 @@ def score(word, prev=None):
83
79
else :
84
80
bigram = '{0} {1}' .format (prev , word )
85
81
86
- if bigram in bigram_counts and prev in unigram_counts :
82
+ if bigram in BIGRAMS and prev in UNIGRAMS :
87
83
88
84
# Conditional probability of the word given the previous
89
85
# word. The technical name is *stupid backoff* and it's
90
86
# not a probability distribution but it works well in
91
87
# practice.
92
88
93
- return bigram_counts [bigram ] / TOTAL / score (prev )
89
+ return BIGRAMS [bigram ] / TOTAL / score (prev )
94
90
else :
95
91
# Fall back to using the unigram probability.
96
92
97
93
return score (word )
98
94
99
- def clean (text ):
100
- "Return `text` lower-cased with non-alphanumeric characters removed."
101
- return '' .join (letter for letter in text .lower () if letter in ALPHABET )
102
-
103
95
def segment (text ):
104
96
"Return a list of words that is the best segmenation of `text`."
105
97
106
98
memo = dict ()
107
99
108
100
def search (text , prev = '<s>' ):
101
+ "Return max of candidates matching `text` given previous word, `prev`."
109
102
if text == '' :
110
103
return 0.0 , []
111
104
112
105
def candidates ():
106
+ "Generator of (score, words) pairs for all divisions of text."
113
107
for prefix , suffix in divide (text ):
114
- prefix_score = log10 (score (prefix , prev ))
108
+ prefix_score = math . log10 (score (prefix , prev ))
115
109
116
110
pair = (suffix , prefix )
117
111
if pair not in memo :
@@ -122,30 +116,30 @@ def candidates():
122
116
123
117
return max (candidates ())
124
118
125
- result_score , result_words = search (clean (text ))
119
+ _ , result_words = search (clean (text ))
126
120
127
121
return result_words
128
122
129
def main(args=()):
    """Command-line entry-point. Parses `args` into in-file and out-file then
    reads lines from in-file, segments the lines, and writes the result to
    out-file. Input and output default to stdin and stdout respectively.

    """
    import argparse

    parser = argparse.ArgumentParser(description='English Word Segmentation')
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin)
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout)
    streams = parser.parse_args(args)

    for line in streams.infile:
        streams.outfile.write(' '.join(segment(line)))
        # Write '\n' and let the text-mode stream translate it to the
        # platform separator; writing os.linesep directly would be
        # double-translated to '\r\r\n' on Windows.
        streams.outfile.write('\n')
149
143
150
144
if __name__ == '__main__':
    # Strip the program name before handing arguments to the parser.
    main(sys.argv[1:])
@@ -155,4 +149,4 @@ def main(args=''):
155
149
# NOTE(review): presumably a hex-encoded version number (0.6.1) — confirm
# against the package's __version__ declaration.
__build__ = 0x000601
__author__ = 'Grant Jenks'
__license__ = 'Apache 2.0'
__copyright__ = 'Copyright 2016 Grant Jenks'
0 commit comments