Skip to content

Commit 19b7238

Browse files
authored
Merge pull request #84 from juliangilbey/split-damerau-levenshtein
Add new DamerauLevenshtein... classes
2 parents c9fbf57 + efd915c commit 19b7238

File tree

4 files changed

+88
-13
lines changed

4 files changed

+88
-13
lines changed

tests/test_edit/test_damerau_levenshtein.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,9 @@
44
# project
55
import textdistance
66

7-
87
ALG = textdistance.DamerauLevenshtein
98

10-
11-
@pytest.mark.parametrize('left, right, expected', [
9+
COMMON = [
1210
('test', 'text', 1),
1311
('test', 'tset', 1),
1412
('test', 'qwy', 4),
@@ -24,15 +22,35 @@
2422
('ab', 'ba', 1),
2523
('ab', 'cde', 3),
2624
('ab', 'ac', 1),
27-
('ab', 'ba', 1),
2825
('ab', 'bc', 2),
26+
]
27+
28+
29+
@pytest.mark.parametrize('left, right, expected', COMMON + [
30+
('ab', 'bca', 3),
31+
('abcd', 'bdac', 4),
2932
])
30-
def test_distance(left, right, expected):
33+
def test_distance_restricted(left, right, expected):
3134
actual = ALG(external=False)(left, right)
3235
assert actual == expected
3336

3437
actual = ALG(external=True)(left, right)
3538
assert actual == expected
3639

37-
actual = ALG()._pure_python(left, right)
40+
actual = ALG()._pure_python_restricted(left, right)
41+
assert actual == expected
42+
43+
44+
@pytest.mark.parametrize('left, right, expected', COMMON + [
    ('ab', 'bca', 2),
    ('abcd', 'bdac', 3),
])
def test_distance_unrestricted(left, right, expected):
    """Check the unrestricted distance through three different call paths.

    The parametrized cases are the shared COMMON cases plus two pairs whose
    expected value here (2 and 3) differs from the restricted test (3 and 4).
    """
    # Instance built with external=False and restricted=False.
    internal_only = ALG(external=False, restricted=False)
    assert internal_only(left, right) == expected

    # Instance built with external=True and restricted=False.
    with_external = ALG(external=True, restricted=False)
    assert with_external(left, right) == expected

    # Direct call to the pure-Python unrestricted routine on a default instance.
    assert ALG()._pure_python_unrestricted(left, right) == expected

textdistance/algorithms/edit_based.py

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,10 @@ class DamerauLevenshtein(_Base):
148148
* substitution: ABC -> ABE, ADC, FBC..
149149
* transposition: ABC -> ACB, BAC
150150
151+
If `restricted=False`, it will calculate the unrestricted distance,
152+
where the same character can be touched more than once.
153+
So the distance between BA and ACB is 2: BA -> AB -> ACB.
154+
151155
https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
152156
"""
153157

@@ -156,10 +160,12 @@ def __init__(
156160
qval: int = 1,
157161
test_func: TestFunc | None = None,
158162
external: bool = True,
163+
restricted: bool = True,
159164
) -> None:
160165
self.qval = qval
161166
self.test_func = test_func or self._ident
162167
self.external = external
168+
self.restricted = restricted
163169

164170
def _numpy(self, s1: Sequence[T], s2: Sequence[T]) -> int:
165171
# TODO: doesn't pass tests, needs improvement
@@ -194,11 +200,52 @@ def _numpy(self, s1: Sequence[T], s2: Sequence[T]) -> int:
194200

195201
return d[len(s1) - 1][len(s2) - 1]
196202

197-
def _pure_python(self, s1: Sequence[T], s2: Sequence[T]) -> int:
203+
def _pure_python_unrestricted(self, s1: Sequence[T], s2: Sequence[T]) -> int:
204+
"""https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
205+
"""
206+
d: dict[tuple[int, int], int] = {}
207+
da: dict[T, int] = {}
208+
209+
len1 = len(s1)
210+
len2 = len(s2)
211+
212+
maxdist = len1 + len2
213+
d[-1, -1] = maxdist
214+
215+
# matrix
216+
for i in range(len(s1) + 1):
217+
d[i, -1] = maxdist
218+
d[i, 0] = i
219+
for j in range(len(s2) + 1):
220+
d[-1, j] = maxdist
221+
d[0, j] = j
222+
223+
for i, cs1 in enumerate(s1, start=1):
224+
db = 0
225+
for j, cs2 in enumerate(s2, start=1):
226+
i1 = da.get(cs2, 0)
227+
j1 = db
228+
if self.test_func(cs1, cs2):
229+
cost = 0
230+
db = j
231+
else:
232+
cost = 1
233+
234+
d[i, j] = min(
235+
d[i - 1, j - 1] + cost, # substitution
236+
d[i, j - 1] + 1, # insertion
237+
d[i - 1, j] + 1, # deletion
238+
d[i1 - 1, j1 - 1] + (i - i1) - 1 + (j - j1), # transposition
239+
)
240+
da[cs1] = i
241+
242+
return d[len1, len2]
243+
244+
def _pure_python_restricted(self, s1: Sequence[T], s2: Sequence[T]) -> int:
198245
"""
199246
https://www.guyrutenberg.com/2008/12/15/damerau-levenshtein-distance-in-python/
200247
"""
201-
d = {}
248+
d: dict[tuple[int, int], int] = {}
202249

203250
# matrix
204251
for i in range(-1, len(s1) + 1):
@@ -241,7 +288,9 @@ def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> int:
241288
# if numpy:
242289
# return self._numpy(s1, s2)
243290
# else:
244-
return self._pure_python(s1, s2)
291+
if self.restricted:
292+
return self._pure_python_restricted(s1, s2)
293+
return self._pure_python_unrestricted(s1, s2)
245294

246295

247296
class JaroWinkler(_BaseSimilarity):

textdistance/libraries.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
{
22
"DamerauLevenshtein": [
3+
[
4+
"rapidfuzz.distance.OSA",
5+
"distance"
6+
],
37
[
48
"rapidfuzz.distance.DamerauLevenshtein",
59
"distance"

textdistance/libraries.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -166,10 +166,14 @@ class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
166166
reg = prototype.register
167167

168168
alg = 'DamerauLevenshtein'
169-
reg(alg, LibraryBase('abydos.distance', 'DamerauLevenshtein', presets={}, attr='dist_abs'))
170-
reg(alg, LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance'))
171-
reg(alg, TextLibrary('jellyfish', 'damerau_levenshtein_distance'))
172-
reg(alg, LibraryBase('rapidfuzz.distance.DamerauLevenshtein', 'distance'))
169+
# Every backend is registered with a `conditions` mapping on `restricted`,
# stating which variant (restricted=True or restricted=False) it is tagged for.
reg(alg, LibraryBase(
    'abydos.distance',
    'DamerauLevenshtein',
    presets={},
    attr='dist_abs',
    conditions={'restricted': False},
))
reg(alg, LibraryBase(
    'pyxdameraulevenshtein',
    'damerau_levenshtein_distance',
    conditions={'restricted': True},
))
reg(alg, TextLibrary(
    'jellyfish',
    'damerau_levenshtein_distance',
    conditions={'restricted': False},
))
reg(alg, LibraryBase(
    'rapidfuzz.distance.DamerauLevenshtein',
    'distance',
    conditions={'restricted': False},
))
reg(alg, LibraryBase(
    'rapidfuzz.distance.OSA',
    'distance',
    conditions={'restricted': True},
))
173177

174178
alg = 'Hamming'
175179
reg(alg, LibraryBase('abydos.distance', 'Hamming', presets={}, attr='dist_abs'))

0 commit comments

Comments
 (0)