import os.path as op
import sys

-ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789')
-BIGRAMS = None
-DATADIR = op.join(op.dirname(op.realpath(__file__)), 'wordsegment_data')
-TOTAL = 1024908267229.0
-UNIGRAMS = None

-def clean(text):
-    "Return `text` lower-cased with non-alphanumeric characters removed."
-    return ''.join(letter for letter in text.lower() if letter in ALPHABET)
+class Segmenter(object):
+    alphabet = set('abcdefghijklmnopqrstuvwxyz0123456789')

-def divide(text, limit=24):
-    """Yield `(prefix, suffix)` pairs from `text` with `len(prefix)` not
-    exceeding `limit`.
-
-    """
-    for pos in range(1, min(len(text), limit) + 1):
-        yield (text[:pos], text[pos:])
+    def __init__(self, unigrams, bigrams, total):
+        self.unigrams = dict(unigrams)
+        self.bigrams = dict(bigrams)
+        self.total = float(total)
+        self.limit = max(map(len, self.unigrams))
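+        # `limit` is the length of the longest word in the unigram
+        # vocabulary; `divide` below uses it to cap candidate prefix
+        # length, replacing the old hard-coded limit of 24.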

-def load():
-    "Load unigram and bigram counts from disk."
-    global UNIGRAMS, BIGRAMS  # pylint: disable=global-statement
-    UNIGRAMS = parse_file(op.join(DATADIR, 'unigrams.txt'))
-    BIGRAMS = parse_file(op.join(DATADIR, 'bigrams.txt'))

-def parse_file(filename):
-    "Read `filename` and parse tab-separated file of (word, count) pairs."
-    with io.open(filename, encoding='utf-8') as reader:
-        lines = (line.split('\t') for line in reader)
-        return dict((word, float(number)) for word, number in lines)
+    def score(self, word, previous=None):
+        "Score `word` in the context of `previous` word."
+        unigrams = self.unigrams
+        bigrams = self.bigrams
+        total = self.total

-def score(word, prev=None):
-    "Score a `word` in the context of the previous word, `prev`."
-    if UNIGRAMS is None and BIGRAMS is None:
-        load()
+        if previous is None:
+            if word in unigrams:

-    if prev is None:
-        if word in UNIGRAMS:
+                # Probability of the given word.

-            # Probability of the given word.
+                return unigrams[word] / total
+            else:
+                # Penalize words not found in the unigrams according
+                # to their length, a crucial heuristic.
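+                # For example, an unseen 5-letter word scores
+                # 10.0 / (total * 10 ** 5), so each extra letter costs
+                # another factor of ten.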

-            return UNIGRAMS[word] / TOTAL
+                return 10.0 / (total * 10 ** len(word))
        else:
-            # Penalize words not found in the unigrams according
-            # to their length, a crucial heuristic.
+            bigram = '{0} {1}'.format(previous, word)

-            return 10.0 / (TOTAL * 10 ** len(word))
-    else:
-        bigram = '{0} {1}'.format(prev, word)
+            if bigram in bigrams and previous in unigrams:

-        if bigram in BIGRAMS and prev in UNIGRAMS:
+                # Conditional probability of the word given the previous
+                # word. The technical name is *stupid backoff* and it's
+                # not a probability distribution but it works well in
+                # practice.
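+                # For example, score('is', previous='this') evaluates
+                # bigrams['this is'] / total / score('this'), boosting
+                # word pairs that co-occur in the corpus.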

-            # Conditional probability of the word given the previous
-            # word. The technical name is *stupid backoff* and it's
-            # not a probability distribution but it works well in
-            # practice.
+                return bigrams[bigram] / total / self.score(previous)
+            else:
+                # Fall back to using the unigram probability.

-            return BIGRAMS[bigram] / TOTAL / score(prev)
-        else:
-            # Fall back to using the unigram probability.
+                return self.score(word)

-            return score(word)

-def isegment(text):
-    "Return iterator of words that is the best segmentation of `text`."
+    def isegment(self, text):
+        "Return iterator of words that is the best segmentation of `text`."
+        memo = dict()

-    memo = dict()
+        def search(text, previous='<s>'):
+            "Return max of candidates matching `text` given `previous` word."
+            if text == '':
+                return 0.0, []

-    def search(text, prev='<s>'):
-        "Return max of candidates matching `text` given previous word, `prev`."
-        if text == '':
-            return 0.0, []
+            def candidates():
+                "Generator of (score, words) pairs for all divisions of text."
+                for prefix, suffix in self.divide(text):
+                    prefix_score = math.log10(self.score(prefix, previous))

-        def candidates():
-            "Generator of (score, words) pairs for all divisions of text."
-            for prefix, suffix in divide(text):
-                prefix_score = math.log10(score(prefix, prev))
+                    pair = (suffix, prefix)
+                    if pair not in memo:
+                        memo[pair] = search(suffix, prefix)
+                    suffix_score, suffix_words = memo[pair]
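+                    # Memoizing on the (suffix, prefix) pair makes the
+                    # recursive search a dynamic program: each division
+                    # of the remaining text is scored at most once.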

-                pair = (suffix, prefix)
-                if pair not in memo:
-                    memo[pair] = search(suffix, prefix)
-                suffix_score, suffix_words = memo[pair]
+                    yield (prefix_score + suffix_score, [prefix] + suffix_words)

-                yield (prefix_score + suffix_score, [prefix] + suffix_words)
+            return max(candidates())

-        return max(candidates())
+        # Avoid recursion limit issues by dividing text into chunks, segmenting
+        # those chunks and combining the results together. Chunks may divide
+        # words in the middle so prefix chunks with the last five words of the
+        # previous result.
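+        # With size = 250 and single-character words at worst, the
+        # recursion depth stays well below CPython's default limit of
+        # 1000.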

-    # Avoid recursion limit issues by dividing text into chunks, segmenting
-    # those chunks and combining the results together. Chunks may divide words
-    # in the middle so prefix chunks with the last five words of the previous
-    # result.
+        clean_text = self.clean(text)
+        size = 250
+        prefix = ''

-    clean_text = clean(text)
-    size = 250
-    prefix = ''
+        for offset in range(0, len(clean_text), size):
+            chunk = clean_text[offset:(offset + size)]
+            _, chunk_words = search(prefix + chunk)
+            prefix = ''.join(chunk_words[-5:])
+            del chunk_words[-5:]
+            for word in chunk_words:
+                yield word

-    for offset in range(0, len(clean_text), size):
-        chunk = clean_text[offset:(offset + size)]
-        _, chunk_words = search(prefix + chunk)
-        prefix = ''.join(chunk_words[-5:])
-        del chunk_words[-5:]
-        for word in chunk_words:
+        _, prefix_words = search(prefix)
+
+        for word in prefix_words:
            yield word

-    _, prefix_words = search(prefix)

-    for word in prefix_words:
-        yield word
+    def segment(self, text):
+        "Return list of words that is the best segmentation of `text`."
+        return list(self.isegment(text))
+
+
+    def divide(self, text):
+        "Yield `(prefix, suffix)` pairs from `text`."
+        for pos in range(1, min(len(text), self.limit) + 1):
+            yield (text[:pos], text[pos:])
+
+
+    @classmethod
+    def clean(cls, text):
+        "Return `text` lower-cased with non-alphanumeric characters removed."
+        alphabet = cls.alphabet
+        text_lower = text.lower()
+        letters = (letter for letter in text_lower if letter in alphabet)
+        return ''.join(letters)
+

def segment(text):
-    "Return a list of words that is the best segmentation of `text`."
-    return list(isegment(text))
+    "Return list of words that is the best segmentation of `text`."
+    segmenter = load()
+    return segmenter.segment(text)
+
+
+def isegment(text):
+    "Return iterator of words that is the best segmentation of `text`."
+    segmenter = load()
+    return segmenter.isegment(text)
+
+
+_cache = {}
+
+
+def load():
+    "Load unigram and bigram counts from disk and cache Segmenter instance."
+    if 'segmenter' not in _cache:
+        directory = op.dirname(op.realpath(__file__))
+        unigrams = _parse(op.join(directory, 'unigrams.txt'))
+        bigrams = _parse(op.join(directory, 'bigrams.txt'))
+        _cache['segmenter'] = Segmenter(unigrams, bigrams, 1024908267229.0)
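+        # The total, 1024908267229, is the token count of the Google
+        # Web Trillion Word Corpus from which the bundled unigram and
+        # bigram counts are derived.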
+
+    return _cache['segmenter']

-def main(args=()):
-    """Command-line entry-point. Parses `args` into in-file and out-file then
-    reads lines from in-file, segments the lines, and writes the result to
-    out-file. Input and output default to stdin and stdout respectively.
+
+def _parse(filename):
+    "Read `filename` and parse tab-separated file of word and count pairs."
+    with io.open(filename, encoding='utf-8') as reader:
+        lines = (line.split('\t') for line in reader)
+        return dict((word, float(number)) for word, number in lines)
+
+
+def main(arguments=()):
+    """Command-line interface (CLI) entry-point. Parse `arguments` into in-file
+    and out-file then read lines from in-file, segment the lines, and write the
+    result to out-file. Input and output default to stdin and stdout
+    respectively.

    """
    import argparse
@@ -157,18 +191,20 @@ def main(args=()):
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout)

-    streams = parser.parse_args(args)
+    streams = parser.parse_args(arguments)

    for line in streams.infile:
        streams.outfile.write(' '.join(segment(line)))
        streams.outfile.write(os.linesep)
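+        # Example invocation (illustrative):
+        #   $ echo thisisatest | python -m wordsegment
+        #   this is a test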

+
if __name__ == '__main__':
    main(sys.argv[1:])

+
__title__ = 'wordsegment'
-__version__ = '0.8.0'
-__build__ = 0x000800
+__version__ = '1.0.0'
+__build__ = 0x010000

__author__ = 'Grant Jenks'
__license__ = 'Apache 2.0'
-__copyright__ = 'Copyright 2016 Grant Jenks'
+__copyright__ = 'Copyright 2017 Grant Jenks'