Skip to content

Commit 9e298a5

Browse files
authored
Hotfix/long word fix (#58)
* handle really long words without doing all the work * update changelog
1 parent d27baf5 commit 9e298a5

File tree

4 files changed

+58
-7
lines changed

4 files changed

+58
-7
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
# pyspellchecker
22

3+
## Version 0.5.3
4+
* Handle memory issues when trying to correct or find candidates for extremely long words
5+
6+
## Version 0.5.2
7+
* Ensure input is encoded correctly; resolves [#53](https://github.com/barrust/pyspellchecker/issues/53)
8+
9+
## Version 0.5.1
10+
* Handle windows encoding issues [#48](https://github.com/barrust/pyspellchecker/issues/48)
11+
* Deterministic order to corrections [#47](https://github.com/barrust/pyspellchecker/issues/47)
12+
313
## Version 0.5.0
414
* Add tokenizer to the Spell object
515
* Add Support for local dictionaries to be case sensitive

spellchecker/info.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
__maintainer__ = "Tyler Barrus"
66
__email__ = "barrust@gmail.com"
77
__license__ = "MIT"
8-
__version__ = "0.5.2"
8+
__version__ = "0.5.3"
99
__credits__ = ["Peter Norvig"]
1010
__url__ = "https://github.com/barrust/pyspellchecker"
1111
__bugtrack_url__ = "{0}/issues".format(__url__)

spellchecker/spellchecker.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,10 @@ def candidates(self, word):
159159
word = ENSURE_UNICODE(word)
160160
if self.known([word]): # short-cut if word is correct already
161161
return {word}
162+
163+
if not self._check_if_should_check(word):
164+
return {word}
165+
162166
# get edit distance 1...
163167
res = [x for x in self.edit_distance_1(word)]
164168
tmp = self.known(res)
@@ -186,7 +190,7 @@ def known(self, words):
186190
w
187191
for w in tmp
188192
if w in self._word_frequency.dictionary
189-
or not self._check_if_should_check(w)
193+
and self._check_if_should_check(w)
190194
)
191195

192196
def unknown(self, words):
@@ -215,7 +219,7 @@ def edit_distance_1(self, word):
215219
Returns:
216220
set: The set of strings that are edit distance one from the \
217221
provided word """
218-
word = ENSURE_UNICODE(word).lower()
222+
word = ENSURE_UNICODE(word).lower() if not self._case_sensitive else ENSURE_UNICODE(word)
219223
if self._check_if_should_check(word) is False:
220224
return {word}
221225
letters = self._word_frequency.letters
@@ -235,7 +239,7 @@ def edit_distance_2(self, word):
235239
Returns:
236240
set: The set of strings that are edit distance two from the \
237241
provided word """
238-
word = ENSURE_UNICODE(word).lower()
242+
word = ENSURE_UNICODE(word).lower() if not self._case_sensitive else ENSURE_UNICODE(word)
239243
return [
240244
e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)
241245
]
@@ -257,10 +261,11 @@ def __edit_distance_alt(self, words):
257261
]
258262
return [e2 for e1 in tmp for e2 in self.edit_distance_1(e1)]
259263

260-
@staticmethod
261-
def _check_if_should_check(word):
264+
def _check_if_should_check(self, word):
262265
if len(word) == 1 and word in string.punctuation:
263266
return False
267+
if len(word) > self._word_frequency.longest_word_length + 3: # magic number to allow removal of up to 2 letters.
268+
return False
264269
try: # check if it is a number (int, float, etc)
265270
float(word)
266271
return False
@@ -281,6 +286,7 @@ class WordFrequency(object):
281286
"_letters",
282287
"_tokenizer",
283288
"_case_sensitive",
289+
"_longest_word_length"
284290
]
285291

286292
def __init__(self, tokenizer=None, case_sensitive=False):
@@ -289,6 +295,7 @@ def __init__(self, tokenizer=None, case_sensitive=False):
289295
self._unique_words = 0
290296
self._letters = set()
291297
self._case_sensitive = case_sensitive
298+
self._longest_word_length = 0
292299

293300
self._tokenizer = _parse_into_words
294301
if tokenizer is not None:
@@ -351,6 +358,14 @@ def letters(self):
351358
Not settable """
352359
return self._letters
353360

361+
@property
362+
def longest_word_length(self):
363+
""" int: The longest word length in the dictionary
364+
365+
Note:
366+
Not settable """
367+
return self._longest_word_length
368+
354369
def tokenize(self, text):
355370
""" Tokenize the provided string object into individual words
356371
@@ -486,8 +501,11 @@ def remove_by_threshold(self, threshold=5):
486501

487502
def _update_dictionary(self):
488503
""" Update the word frequency object """
504+
self._longest_word_length = 0
489505
self._total_words = sum(self._dictionary.values())
490506
self._unique_words = len(self._dictionary.keys())
491507
self._letters = set()
492508
for key in self._dictionary:
509+
if len(key) > self._longest_word_length:
510+
self._longest_word_length = len(key)
493511
self._letters.update(key)

tests/spellchecker_test.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ def test_word_known(self):
6868
self.assertEqual(spell.known(['sherlock']), {'sherlock'})
6969
self.assertEqual(spell.known(['holmes']), {'holmes'})
7070
self.assertEqual(spell.known(['known']), {'known'})
71-
self.assertEqual(spell.known(['-']), {'-'})
7271

72+
self.assertEqual(spell.known(['-']), set())
7373
self.assertEqual(spell.known(['foobar']), set())
7474
self.assertEqual(spell.known(['ths']), set())
7575
self.assertEqual(spell.known(['ergos']), set())
@@ -281,6 +281,29 @@ def test_capitalization_when_case_sensitive_defaults_to_false(self):
281281
self.assertEqual(spell.candidates('BB'), {'bob', 'bab'})
282282
self.assertEqual(spell.correction('BB'), 'bob')
283283

284+
def test_large_words(self):
285+
''' test checking for words that are clearly larger than the largest dictionary word '''
286+
spell = SpellChecker(language=None, distance=2)
287+
spell.word_frequency.add('Bob')
288+
289+
words = ['Bb', 'bb', 'BB']
290+
self.assertEqual(spell.unknown(words), {'bb'})
291+
292+
known_words = ['BOB', 'bOb']
293+
self.assertEqual(spell.known(known_words), {'bob'})
294+
295+
self.assertEqual(spell.correction('bobs'), 'bob')
296+
self.assertEqual(spell.correction('bobb'), 'bob')
297+
self.assertEqual(spell.correction('bobby'), 'bob')
298+
self.assertEqual(spell.word_frequency.longest_word_length, 3)
299+
self.assertEqual(spell.correction('bobbys'), 'bobbys')
300+
301+
def test_extremely_large_words(self):
302+
''' test when a word is just extremely large '''
303+
spell = SpellChecker()
304+
horrible_word = 'thisisnotarealisticwordthisisnotarealisticwordthisisnotarealisticwordthisisnotarealisticword'
305+
self.assertEqual(spell.correction(horrible_word), horrible_word)
306+
284307
def test_capitalization_when_case_sensitive_true(self):
285308
''' test that capitalization affects comparisons '''
286309
spell = SpellChecker(language=None, case_sensitive=True)

0 commit comments

Comments
 (0)