diff --git a/tests/test_coverage.py b/tests/test_coverage.py
index 4dccf48..745ed7e 100644
--- a/tests/test_coverage.py
+++ b/tests/test_coverage.py
@@ -93,6 +93,30 @@ def test_segment_12():
     ]
     assert segment(''.join(result)) == result
 
+# ignore_digits: plain words and bare numbers
+def test_segment_13():
+    result = [
+        'in', '1864', 'lincoln', 'wrote', '4', 'score', 'and', '7', 'years', 'ago', 'our', 'fathers', 'brought'
+    ]
+    # join with spaces: digit-bearing tokens must be whitespace-delimited
+    assert segment(' '.join(result), True) == result
+
+# ignore_digits: currency and mixed alphanumeric tokens
+def test_segment_14():
+    result = [
+        'this', '$5,000', 'is', 'a', '2019', 'test', 'test1', 'asdf1'
+    ]
+    # join with spaces: digit-bearing tokens must be whitespace-delimited
+    assert segment(' '.join(result), True) == result
+
+# ignore_digits: decimals and percentages
+def test_segment_15():
+    result = [
+        'increased', '$55', 'million', 'or', '23.8%', 'for'
+    ]
+    # join with spaces: digit-bearing tokens must be whitespace-delimited
+    assert segment(' '.join(result), True) == result
+
 def test_main():
     main(['tests/test.txt'])
     result = os.linesep.join(('choose spain', 'this is a test')) + os.linesep
diff --git a/wordsegment/__init__.py b/wordsegment/__init__.py
index 1db0776..879e80a 100644
--- a/wordsegment/__init__.py
+++ b/wordsegment/__init__.py
@@ -30,6 +30,8 @@
 import math
 import os.path as op
 import sys
+import re
+import string
 
 
 class Segmenter(object):
@@ -161,10 +163,29 @@ def candidates():
         for word in prefix_words:
             yield word
 
 
-    def segment(self, text):
+    def segment_ignore_digits(self, text):
+        "Segment `text`, keeping tokens that contain digits unchanged."
+        # The capture group makes re.split keep the matched tokens, so the
+        # pieces alternate between plain text and whole whitespace-delimited
+        # tokens containing a digit or a period.
+        pieces = re.split(r'((?=\S*[\d.])\S*)', text)
+        has_digit = re.compile(r'\d')
+        results = []
+        for piece in pieces:
+            if has_digit.search(piece) is not None:
+                # Digit-bearing token: emit verbatim, never segment it.
+                results.append(piece)
+            else:
+                results.extend(self.isegment(piece))
+        return results
+
+
+    def segment(self, text, ignore_digits=False):
         "Return list of words that is the best segmenation of `text`."
-        return list(self.isegment(text))
+        if ignore_digits:
+            return self.segment_ignore_digits(text)
+        return list(self.isegment(text))
 
 
     def divide(self, text):