@@ -159,6 +159,10 @@ def candidates(self, word):
159159 word = ENSURE_UNICODE (word )
160160 if self .known ([word ]): # short-cut if word is correct already
161161 return {word }
162+
163+ if not self ._check_if_should_check (word ):
164+ return {word }
165+
162166 # get edit distance 1...
163167 res = [x for x in self .edit_distance_1 (word )]
164168 tmp = self .known (res )
@@ -186,7 +190,7 @@ def known(self, words):
186190 w
187191 for w in tmp
188192 if w in self ._word_frequency .dictionary
189- or not self ._check_if_should_check (w )
193+ and self ._check_if_should_check (w )
190194 )
191195
192196 def unknown (self , words ):
@@ -215,7 +219,7 @@ def edit_distance_1(self, word):
215219 Returns:
216220 set: The set of strings that are edit distance one from the \
217221 provided word """
218- word = ENSURE_UNICODE (word ).lower ()
222+ word = ENSURE_UNICODE (word ).lower () if not self . _case_sensitive else ENSURE_UNICODE ( word )
219223 if self ._check_if_should_check (word ) is False :
220224 return {word }
221225 letters = self ._word_frequency .letters
@@ -235,7 +239,7 @@ def edit_distance_2(self, word):
235239 Returns:
236240 set: The set of strings that are edit distance two from the \
237241 provided word """
238- word = ENSURE_UNICODE (word ).lower ()
242+ word = ENSURE_UNICODE (word ).lower () if not self . _case_sensitive else ENSURE_UNICODE ( word )
239243 return [
240244 e2 for e1 in self .edit_distance_1 (word ) for e2 in self .edit_distance_1 (e1 )
241245 ]
@@ -257,10 +261,11 @@ def __edit_distance_alt(self, words):
257261 ]
258262 return [e2 for e1 in tmp for e2 in self .edit_distance_1 (e1 )]
259263
260- @staticmethod
261- def _check_if_should_check (word ):
264+ def _check_if_should_check (self , word ):
262265 if len (word ) == 1 and word in string .punctuation :
263266 return False
267+ if len (word ) > self ._word_frequency .longest_word_length + 3 : # magic number to allow removal of up to 2 letters.
268+ return False
264269 try : # check if it is a number (int, float, etc)
265270 float (word )
266271 return False
@@ -281,6 +286,7 @@ class WordFrequency(object):
281286 "_letters" ,
282287 "_tokenizer" ,
283288 "_case_sensitive" ,
289+ "_longest_word_length"
284290 ]
285291
286292 def __init__ (self , tokenizer = None , case_sensitive = False ):
@@ -289,6 +295,7 @@ def __init__(self, tokenizer=None, case_sensitive=False):
289295 self ._unique_words = 0
290296 self ._letters = set ()
291297 self ._case_sensitive = case_sensitive
298+ self ._longest_word_length = 0
292299
293300 self ._tokenizer = _parse_into_words
294301 if tokenizer is not None :
@@ -351,6 +358,14 @@ def letters(self):
351358 Not settable """
352359 return self ._letters
353360
361+ @property
362+ def longest_word_length (self ):
363+ """ int: The number of characters in the longest word in the dictionary
364+
365+ Note:
366+ Not settable; the value is recomputed by `_update_dictionary` """
367+ return self ._longest_word_length
368+
354369 def tokenize (self , text ):
355370 """ Tokenize the provided string object into individual words
356371
@@ -486,8 +501,11 @@ def remove_by_threshold(self, threshold=5):
486501
487502 def _update_dictionary (self ):
488503 """ Update the word frequency object """
504+ self ._longest_word_length = 0 # reset so the maximum is rebuilt from the current keys only
489505 self ._total_words = sum (self ._dictionary .values ())
490506 self ._unique_words = len (self ._dictionary .keys ())
491507 self ._letters = set ()
492508 for key in self ._dictionary :
509+ if len (key ) > self ._longest_word_length : # running maximum of key lengths
510+ self ._longest_word_length = len (key )
493511 self ._letters .update (key )
0 commit comments