Hotfix/long word fix (#58)

barrust · web-flow · commit 9e298a50ad6c · 2019-11-25T17:32:54.000-05:00
* handle really long words without doing all the work
* update changelog
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # pyspellchecker
 
+## Version 0.5.3
+* Handle memory issues when trying to correct or find candidates for extremely long words
+
+## Version 0.5.2
+* Ensure input is encoded correctly; resolves [#53](https://github.com/barrust/pyspellchecker/issues/53)
+
+## Version 0.5.1
+* Handle Windows encoding issues [#48](https://github.com/barrust/pyspellchecker/issues/48)
+* Deterministic order to corrections [#47](https://github.com/barrust/pyspellchecker/issues/47)
+
 ## Version 0.5.0
 * Add tokenizer to the Spell object
 * Add Support for local dictionaries to be case sensitive
diff --git a/spellchecker/info.py b/spellchecker/info.py
@@ -5,7 +5,7 @@
 __maintainer__ = "Tyler Barrus"
 __email__ = "barrust@gmail.com"
 __license__ = "MIT"
-__version__ = "0.5.2"
+__version__ = "0.5.3"
 __credits__ = ["Peter Norvig"]
 __url__ = "https://github.com/barrust/pyspellchecker"
 __bugtrack_url__ = "{0}/issues".format(__url__)
diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py
@@ -159,6 +159,10 @@ def candidates(self, word):
         word = ENSURE_UNICODE(word)
         if self.known([word]):  # short-cut if word is correct already
             return {word}
+
+        if not self._check_if_should_check(word):
+            return {word}
+
         # get edit distance 1...
         res = [x for x in self.edit_distance_1(word)]
         tmp = self.known(res)
@@ -186,7 +190,7 @@ def known(self, words):
             w
             for w in tmp
             if w in self._word_frequency.dictionary
-            or not self._check_if_should_check(w)
+            and self._check_if_should_check(w)
         )
 
     def unknown(self, words):
@@ -215,7 +219,7 @@ def edit_distance_1(self, word):
             Returns:
                 set: The set of strings that are edit distance one from the \
                 provided word """
-        word = ENSURE_UNICODE(word).lower()
+        word = ENSURE_UNICODE(word).lower() if not self._case_sensitive else ENSURE_UNICODE(word)
         if self._check_if_should_check(word) is False:
             return {word}
         letters = self._word_frequency.letters
@@ -235,7 +239,7 @@ def edit_distance_2(self, word):
             Returns:
                 set: The set of strings that are edit distance two from the \
                 provided word """
-        word = ENSURE_UNICODE(word).lower()
+        word = ENSURE_UNICODE(word).lower() if not self._case_sensitive else ENSURE_UNICODE(word)
         return [
             e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)
         ]
@@ -257,10 +261,11 @@ def __edit_distance_alt(self, words):
         ]
         return [e2 for e1 in tmp for e2 in self.edit_distance_1(e1)]
 
-    @staticmethod
-    def _check_if_should_check(word):
+    def _check_if_should_check(self, word):
         if len(word) == 1 and word in string.punctuation:
             return False
+        if len(word) > self._word_frequency.longest_word_length + 3:  # magic number to allow removal of up to 2 letters.
+            return False
         try:  # check if it is a number (int, float, etc)
             float(word)
             return False
@@ -281,6 +286,7 @@ class WordFrequency(object):
         "_letters",
         "_tokenizer",
         "_case_sensitive",
+        "_longest_word_length"
     ]
 
     def __init__(self, tokenizer=None, case_sensitive=False):
@@ -289,6 +295,7 @@ def __init__(self, tokenizer=None, case_sensitive=False):
         self._unique_words = 0
         self._letters = set()
         self._case_sensitive = case_sensitive
+        self._longest_word_length = 0
 
         self._tokenizer = _parse_into_words
         if tokenizer is not None:
@@ -351,6 +358,14 @@ def letters(self):
                 Not settable """
         return self._letters
 
+    @property
+    def longest_word_length(self):
+        """ int: The longest word length in the dictionary
+
+            Note:
+                Not settable """
+        return self._longest_word_length
+
     def tokenize(self, text):
         """ Tokenize the provided string object into individual words
 
@@ -486,8 +501,11 @@ def remove_by_threshold(self, threshold=5):
 
     def _update_dictionary(self):
         """ Update the word frequency object """
+        self._longest_word_length = 0
         self._total_words = sum(self._dictionary.values())
         self._unique_words = len(self._dictionary.keys())
         self._letters = set()
         for key in self._dictionary:
+            if len(key) > self._longest_word_length:
+                self._longest_word_length = len(key)
             self._letters.update(key)
diff --git a/tests/spellchecker_test.py b/tests/spellchecker_test.py
@@ -68,8 +68,8 @@ def test_word_known(self):
         self.assertEqual(spell.known(['sherlock']), {'sherlock'})
         self.assertEqual(spell.known(['holmes']), {'holmes'})
         self.assertEqual(spell.known(['known']), {'known'})
-        self.assertEqual(spell.known(['-']), {'-'})
 
+        self.assertEqual(spell.known(['-']), set())
         self.assertEqual(spell.known(['foobar']), set())
         self.assertEqual(spell.known(['ths']), set())
         self.assertEqual(spell.known(['ergos']), set())
@@ -281,6 +281,29 @@ def test_capitalization_when_case_sensitive_defaults_to_false(self):
         self.assertEqual(spell.candidates('BB'), {'bob', 'bab'})
         self.assertEqual(spell.correction('BB'), 'bob')
 
+    def test_large_words(self):
+        ''' test checking for words that are clearly larger than the largest dictionary word '''
+        spell = SpellChecker(language=None, distance=2)
+        spell.word_frequency.add('Bob')
+
+        words = ['Bb', 'bb', 'BB']
+        self.assertEqual(spell.unknown(words), {'bb'})
+
+        known_words = ['BOB', 'bOb']
+        self.assertEqual(spell.known(known_words), {'bob'})
+
+        self.assertEqual(spell.correction('bobs'), 'bob')
+        self.assertEqual(spell.correction('bobb'), 'bob')
+        self.assertEqual(spell.correction('bobby'), 'bob')
+        self.assertEqual(spell.word_frequency.longest_word_length, 3)
+        self.assertEqual(spell.correction('bobbys'), 'bobbys')
+
+    def test_extremely_large_words(self):
+        ''' test when a word is just extremely large '''
+        spell = SpellChecker()
+        horrible_word = 'thisisnotarealisticwordthisisnotarealisticwordthisisnotarealisticwordthisisnotarealisticword'
+        self.assertEqual(spell.correction(horrible_word), horrible_word)
+
     def test_capitalization_when_case_sensitive_true(self):
         ''' test that capitalization affects comparisons '''
         spell = SpellChecker(language=None, case_sensitive=True)