Skip to content

Commit c8b7ac5

Browse files
committed
Turn off difflib "autojunk" heuristic in fuzzy matching
difflib has an "autojunk" heuristic that used to make fuzzy matching unreliable for strings longer than 200 characters. See python/cpython#90825. Fixes #969.
1 parent 63bb71a commit c8b7ac5

File tree

2 files changed

+46
-1
lines changed

2 files changed

+46
-1
lines changed

babel/messages/catalog.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@
1414
from collections import OrderedDict
1515
from collections.abc import Iterable, Iterator
1616
from copy import copy
17-
from difflib import get_close_matches
17+
from difflib import SequenceMatcher
1818
from email import message_from_string
19+
from heapq import nlargest
1920
from typing import TYPE_CHECKING
2021

2122
from babel import __version__ as VERSION
@@ -31,6 +32,31 @@
3132

3233
__all__ = ['Message', 'Catalog', 'TranslationError']
3334

35+
def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Return the candidates from ``possibilities`` most similar to ``word``.

    A modified version of ``difflib.get_close_matches``. The functional
    difference is that ``autojunk=False`` is passed to the
    ``SequenceMatcher``, to work around
    https://github.com/python/cpython/issues/90825.

    :param word: the sequence (typically a string) to find matches for
    :param possibilities: an iterable of candidate sequences to compare
                          against ``word``
    :param n: the maximum number of matches to return; must be > 0
    :param cutoff: the minimum similarity ratio, in ``[0.0, 1.0]``, a
                   candidate must reach to be returned
    :return: a list of at most ``n`` candidates, most similar first
    :raises ValueError: if ``n`` is not positive or ``cutoff`` is outside
                        ``[0.0, 1.0]``
    """
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

    # autojunk=False is the whole point of this copy: it disables the
    # heuristic that makes matching unreliable for long sequences.
    matcher = SequenceMatcher(autojunk=False)
    matcher.set_seq2(word)

    scored = []
    for candidate in possibilities:
        matcher.set_seq1(candidate)
        # Run the cheaper checks first so that the full ratio is only
        # computed for candidates that pass them.
        if matcher.real_quick_ratio() < cutoff:
            continue
        if matcher.quick_ratio() < cutoff:
            continue
        # Compute the full ratio once and reuse it for both the cutoff
        # check and the ranking.
        score = matcher.ratio()
        if score >= cutoff:
            scored.append((score, candidate))

    # Keep the n best scorers, best first, and strip the scores.
    return [candidate for _score, candidate in nlargest(n, scored)]
59+
3460

3561
PYTHON_FORMAT = re.compile(r'''
3662
\%

tests/messages/test_catalog.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,25 @@ def test_update_fuzzy_matching_no_cascading(self):
209209
assert cat['fooo'].string == 'Vohe'
210210
assert cat['fooo'].fuzzy is True
211211

212+
def test_update_fuzzy_matching_long_string(self):
    # A message id well over 200 characters long: the length at which
    # fuzzy matching used to become unreliable.
    lipsum = (
        "Lorem Ipsum is simply dummy text of the printing and typesetting "
        "industry. Lorem Ipsum has been the industry's standard dummy text ever "
        "since the 1500s, when an unknown printer took a galley of type and "
        "scrambled it to make a type specimen book. It has survived not only "
        "five centuries, but also the leap into electronic typesetting, "
        "remaining essentially unchanged. It was popularised in the 1960s with "
        "the release of Letraset sheets containing Lorem Ipsum passages, and "
        "more recently with desktop publishing software like Aldus PageMaker "
        "including versions of Lorem Ipsum."
    )
    updated_id = lipsum + " ZZZZZZ"

    # The existing catalog holds a translated message with a marker prefix.
    existing = catalog.Catalog()
    existing.add("ZZZZZZ " + lipsum, "foo")

    # The template holds the same long text with the marker as a suffix.
    template = catalog.Catalog()
    template.add(updated_id)

    existing.update(template)

    # The template's id must be present and flagged fuzzy, with nothing
    # left in the obsolete collection.
    assert existing[updated_id].fuzzy is True
    assert len(existing.obsolete) == 0
230+
212231
def test_update_without_fuzzy_matching(self):
213232
cat = catalog.Catalog()
214233
cat.add('fo', 'Voh')

0 commit comments

Comments
 (0)