Skip to content

Commit 0f51bb2

Browse files
committed
Refactor with pylint fixes
1 parent 62d67dc commit 0f51bb2

File tree

1 file changed

+48
-54
lines changed

1 file changed

+48
-54
lines changed

wordsegment.py

Lines changed: 48 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
1-
# -*- coding: utf-8 -*-
2-
3-
"""
4-
English Word Segmentation in Python
1+
"""English Word Segmentation in Python
52
63
Word segmentation is the process of dividing a phrase without spaces back
74
into its constituent parts. For example, consider a phrase like "thisisatest".
@@ -18,63 +15,62 @@
1815
http://norvig.com/ngrams/ under the names count_1w.txt and count_2w.txt
1916
respectively.
2017
21-
Copyright (c) 2015 by Grant Jenks
18+
Copyright (c) 2016 by Grant Jenks
2219
2320
Based on code from the chapter "Natural Language Corpus Data"
2421
from the book "Beautiful Data" (Segaran and Hammerbacher, 2009)
2522
http://oreilly.com/catalog/9780596157111/
2623
2724
Original Copyright (c) 2008-2009 by Peter Norvig
25+
2826
"""
2927

28+
import io
29+
import math
30+
import os.path as op
3031
import sys
31-
import codecs
32-
from os.path import join, dirname, realpath
33-
from math import log10
34-
from functools import wraps
35-
3632

3733
ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789')
34+
BIGRAMS = None
35+
DATADIR = op.join(op.dirname(op.realpath(__file__)), 'wordsegment_data')
36+
TOTAL = 1024908267229.0
37+
UNIGRAMS = None
3838

39-
if sys.hexversion < 0x03000000:
40-
range = xrange
41-
42-
def parse_file(filename):
43-
"Read `filename` and parse tab-separated file of (word, count) pairs."
44-
with codecs.open(filename, 'r', 'utf-8') as fptr:
45-
lines = (line.split('\t') for line in fptr)
46-
return dict((word, float(number)) for word, number in lines)
47-
48-
basepath = join(dirname(realpath(__file__)), 'wordsegment_data')
49-
unigram_counts = None
50-
bigram_counts = None
39+
def clean(text):
40+
"Return `text` lower-cased with non-alphanumeric characters removed."
41+
return ''.join(letter for letter in text.lower() if letter in ALPHABET)
5142

5243
def divide(text, limit=24):
53-
"""
54-
Yield `(prefix, suffix)` pairs from `text` with `len(prefix)` not
44+
"""Yield `(prefix, suffix)` pairs from `text` with `len(prefix)` not
5545
exceeding `limit`.
46+
5647
"""
5748
for pos in range(1, min(len(text), limit) + 1):
5849
yield (text[:pos], text[pos:])
5950

60-
TOTAL = 1024908267229.0
51+
def load():
52+
"Load unigram and bigram counts from disk."
53+
global UNIGRAMS, BIGRAMS # pylint: disable=global-statement
54+
UNIGRAMS = parse_file(op.join(DATADIR, 'unigrams.txt'))
55+
BIGRAMS = parse_file(op.join(DATADIR, 'bigrams.txt'))
56+
57+
def parse_file(filename):
58+
"Read `filename` and parse tab-separated file of (word, count) pairs."
59+
with io.open(filename, encoding='utf-8') as reader:
60+
lines = (line.split('\t') for line in reader)
61+
return dict((word, float(number)) for word, number in lines)
6162

6263
def score(word, prev=None):
6364
"Score a `word` in the context of the previous word, `prev`."
64-
global unigram_counts, bigram_counts
65-
66-
if unigram_counts is None:
67-
unigram_counts = parse_file(join(basepath, 'unigrams.txt'))
68-
69-
if bigram_counts is None:
70-
bigram_counts = parse_file(join(basepath, 'bigrams.txt'))
65+
if UNIGRAMS is None and BIGRAMS is None:
66+
load()
7167

7268
if prev is None:
73-
if word in unigram_counts:
69+
if word in UNIGRAMS:
7470

7571
# Probability of the given word.
7672

77-
return unigram_counts[word] / TOTAL
73+
return UNIGRAMS[word] / TOTAL
7874
else:
7975
# Penalize words not found in the unigrams according
8076
# to their length, a crucial heuristic.
@@ -83,35 +79,33 @@ def score(word, prev=None):
8379
else:
8480
bigram = '{0} {1}'.format(prev, word)
8581

86-
if bigram in bigram_counts and prev in unigram_counts:
82+
if bigram in BIGRAMS and prev in UNIGRAMS:
8783

8884
# Conditional probability of the word given the previous
8985
# word. The technical name is *stupid backoff* and it's
9086
# not a probability distribution but it works well in
9187
# practice.
9288

93-
return bigram_counts[bigram] / TOTAL / score(prev)
89+
return BIGRAMS[bigram] / TOTAL / score(prev)
9490
else:
9591
# Fall back to using the unigram probability.
9692

9793
return score(word)
9894

99-
def clean(text):
100-
"Return `text` lower-cased with non-alphanumeric characters removed."
101-
return ''.join(letter for letter in text.lower() if letter in ALPHABET)
102-
10395
def segment(text):
10496
"Return a list of words that is the best segmentation of `text`."
10597

10698
memo = dict()
10799

108100
def search(text, prev='<s>'):
101+
"Return max of candidates matching `text` given previous word, `prev`."
109102
if text == '':
110103
return 0.0, []
111104

112105
def candidates():
106+
"Generator of (score, words) pairs for all divisions of text."
113107
for prefix, suffix in divide(text):
114-
prefix_score = log10(score(prefix, prev))
108+
prefix_score = math.log10(score(prefix, prev))
115109

116110
pair = (suffix, prefix)
117111
if pair not in memo:
@@ -122,30 +116,30 @@ def candidates():
122116

123117
return max(candidates())
124118

125-
result_score, result_words = search(clean(text))
119+
_, result_words = search(clean(text))
126120

127121
return result_words
128122

129-
def main(args=''):
130-
"""
131-
Command-line entry-point. Parses args into in-file and out-file then
132-
reads lines from infile, segments the lines, and writes the result
133-
to outfile. Input and output default to stdin and stdout respectively.
123+
def main(args=()):
124+
"""Command-line entry-point. Parses `args` into in-file and out-file then
125+
reads lines from in-file, segments the lines, and writes the result to
126+
out-file. Input and output default to stdin and stdout respectively.
127+
134128
"""
135-
import os, argparse
129+
import argparse
130+
import os
136131

137132
parser = argparse.ArgumentParser(description='English Word Segmentation')
138-
139133
parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
140134
default=sys.stdin)
141135
parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
142136
default=sys.stdout)
143137

144-
args = parser.parse_args(args)
138+
streams = parser.parse_args(args)
145139

146-
for line in args.infile:
147-
args.outfile.write(' '.join(segment(line)))
148-
args.outfile.write(os.linesep)
140+
for line in streams.infile:
141+
streams.outfile.write(' '.join(segment(line)))
142+
streams.outfile.write(os.linesep)
149143

150144
if __name__ == '__main__':
151145
main(sys.argv[1:])
@@ -155,4 +149,4 @@ def main(args=''):
155149
__build__ = 0x000601
156150
__author__ = 'Grant Jenks'
157151
__license__ = 'Apache 2.0'
158-
__copyright__ = 'Copyright 2015 Grant Jenks'
152+
__copyright__ = 'Copyright 2016 Grant Jenks'

0 commit comments

Comments
 (0)