Skip to content

Commit f6578ff

Browse files
author
Eirik Lorgen Tanberg
committed
[TEXT-235] Add Damerau-Levenshtein distance
1 parent c0599f4 commit f6578ff

File tree

2 files changed

+419
-0
lines changed

2 files changed

+419
-0
lines changed
Lines changed: 298 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,298 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* https://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.commons.text.similarity;
18+
19+
/**
20+
* An algorithm for measuring the difference between two character sequences using the
21+
* <a href="https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance">Damerau-Levenshtein Distance</a>.
22+
*
23+
* <p>
24+
* This is the number of changes needed to change one sequence into another, where each change is a single character
25+
* modification (deletion, insertion, substitution, or transposition of two adjacent characters).
26+
* </p>
27+
* <p>
28+
* This implementation uses the optimal string alignment distance variant of the Damerau-Levenshtein distance,
29+
* which uses O(min(m,n)) space complexity with rolling arrays.
30+
* </p>
31+
*
32+
* @see <a href="https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance">Damerau-Levenshtein Distance on Wikipedia</a>
33+
* @since 1.0
34+
*/
35+
public class DamerauLevenshteinDistance implements EditDistance<Integer> {
36+
37+
/**
38+
* The singleton instance.
39+
*/
40+
private static final DamerauLevenshteinDistance INSTANCE = new DamerauLevenshteinDistance();
41+
42+
/**
43+
* Gets the default instance.
44+
*
45+
* @return The default instance.
46+
*/
47+
public static DamerauLevenshteinDistance getDefaultInstance() {
48+
return INSTANCE;
49+
}
50+
51+
/**
52+
* Finds the Damerau-Levenshtein distance between two CharSequences if it's less than or equal to a given threshold.
53+
*
54+
* @param left the first SimilarityInput, must not be null.
55+
* @param right the second SimilarityInput, must not be null.
56+
* @param threshold the target threshold, must not be negative.
57+
* @return result distance, or -1 if distance exceeds threshold
58+
*/
59+
private static <E> int limitedCompare(SimilarityInput<E> left, SimilarityInput<E> right, final int threshold) {
60+
if (left == null || right == null) {
61+
throw new IllegalArgumentException("CharSequences must not be null");
62+
}
63+
if (threshold < 0) {
64+
throw new IllegalArgumentException("Threshold must not be negative");
65+
}
66+
67+
int leftLength = left.length();
68+
int rightLength = right.length();
69+
70+
// if one string is empty, the edit distance is necessarily the length of the other
71+
if (leftLength == 0) {
72+
return rightLength <= threshold ? rightLength : -1;
73+
}
74+
if (rightLength == 0) {
75+
return leftLength <= threshold ? leftLength : -1;
76+
}
77+
78+
// the edit distance cannot be less than the length difference
79+
if (Math.abs(leftLength - rightLength) > threshold) {
80+
return -1;
81+
}
82+
83+
// Ensure left is the shorter string to minimize memory usage
84+
if (leftLength > rightLength) {
85+
SimilarityInput<E> temp = left;
86+
left = right;
87+
right = temp;
88+
leftLength = left.length();
89+
rightLength = right.length();
90+
}
91+
92+
// Need 3 rows for transposition: current, previous, and before previous
93+
int[] prevPrev = new int[rightLength + 1];
94+
int[] prev = new int[rightLength + 1];
95+
int[] curr = new int[rightLength + 1];
96+
97+
// Initialize the first row: transforming empty string to right[0..j] requires j insertions
98+
for (int j = 0; j <= rightLength; j++) {
99+
prev[j] = j;
100+
}
101+
102+
for (int i = 1; i <= leftLength; i++) {
103+
// Transforming left[0..i] to empty string requires i deletions
104+
curr[0] = i;
105+
106+
int minInRow = curr[0]; // Track minimum value in current row for early termination
107+
108+
for (int j = 1; j <= rightLength; j++) {
109+
int cost = left.at(i - 1).equals(right.at(j - 1)) ? 0 : 1;
110+
111+
curr[j] = Math.min(
112+
Math.min(
113+
prev[j] + 1, // Delete from left
114+
curr[j - 1] + 1 // Insert into left
115+
),
116+
prev[j - 1] + cost // Substitute (or match)
117+
);
118+
119+
// Check for transposition of adjacent characters
120+
if (i > 1 && j > 1 &&
121+
left.at(i - 1).equals(right.at(j - 2)) &&
122+
left.at(i - 2).equals(right.at(j - 1))) {
123+
curr[j] = Math.min(curr[j], prevPrev[j - 2] + cost);
124+
}
125+
126+
minInRow = Math.min(minInRow, curr[j]);
127+
}
128+
129+
// Early termination: if minimum value in current row exceeds threshold,
130+
// the final result will definitely exceed threshold
131+
if (minInRow > threshold) {
132+
return -1;
133+
}
134+
135+
// Rotate arrays for next iteration: prevPrev <- prev <- curr
136+
int[] temp = prevPrev;
137+
prevPrev = prev;
138+
prev = curr;
139+
curr = temp;
140+
}
141+
142+
return prev[rightLength] <= threshold ? prev[rightLength] : -1;
143+
}
144+
145+
/**
146+
* Finds the Damerau-Levenshtein distance between two inputs using optimal string alignment.
147+
*
148+
* @param left the first CharSequence, must not be null.
149+
* @param right the second CharSequence, must not be null.
150+
* @return result distance.
151+
* @throws IllegalArgumentException if either CharSequence input is {@code null}.
152+
*/
153+
private static <E> int unlimitedCompare(SimilarityInput<E> left, SimilarityInput<E> right) {
154+
if (left == null || right == null) {
155+
throw new IllegalArgumentException("CharSequences must not be null");
156+
}
157+
158+
int leftLength = left.length();
159+
int rightLength = right.length();
160+
161+
if (leftLength == 0) {
162+
return rightLength;
163+
}
164+
if (rightLength == 0) {
165+
return leftLength;
166+
}
167+
168+
// Ensure left is the shorter string to minimize memory usage
169+
if (leftLength > rightLength) {
170+
SimilarityInput<E> temp = left;
171+
left = right;
172+
right = temp;
173+
leftLength = left.length();
174+
rightLength = right.length();
175+
}
176+
177+
// Need 3 rows for transposition: current, previous, and before previous
178+
// prevPrev[j] = min ops to transform left[0..i-2] to right[0..j]
179+
// prev[j] = min ops to transform left[0..i-1] to right[0..j]
180+
// curr[j] = min ops to transform left[0..i] to right[0..j]
181+
int[] prevPrev = new int[rightLength + 1];
182+
int[] prev = new int[rightLength + 1];
183+
int[] curr = new int[rightLength + 1];
184+
185+
// Initialize the first row: transforming empty string to right[0..j] requires j insertions
186+
for (int j = 0; j <= rightLength; j++) {
187+
prev[j] = j;
188+
}
189+
190+
for (int i = 1; i <= leftLength; i++) {
191+
// Transforming left[0..i] to empty string requires i deletions
192+
curr[0] = i;
193+
194+
for (int j = 1; j <= rightLength; j++) {
195+
int cost = left.at(i - 1).equals(right.at(j - 1)) ? 0 : 1;
196+
197+
curr[j] = Math.min(
198+
Math.min(
199+
prev[j] + 1, // Delete from left
200+
curr[j - 1] + 1 // Insert into left
201+
),
202+
prev[j - 1] + cost // Substitute (or match)
203+
);
204+
205+
// Check for transposition of adjacent characters
206+
if (i > 1 && j > 1 &&
207+
left.at(i - 1).equals(right.at(j - 2)) &&
208+
left.at(i - 2).equals(right.at(j - 1))) {
209+
curr[j] = Math.min(curr[j], prevPrev[j - 2] + cost);
210+
}
211+
}
212+
213+
// Rotate arrays for next iteration: prevPrev <- prev <- curr
214+
int[] temp = prevPrev;
215+
prevPrev = prev;
216+
prev = curr;
217+
curr = temp;
218+
}
219+
220+
return prev[rightLength];
221+
}
222+
223+
/**
224+
* Threshold.
225+
*/
226+
private final Integer threshold;
227+
228+
/**
229+
* Constructs a default instance that uses a version of the algorithm that does not use a threshold parameter.
230+
*
231+
* @see DamerauLevenshteinDistance#getDefaultInstance()
232+
* @deprecated Use {@link #getDefaultInstance()}.
233+
*/
234+
@Deprecated
235+
public DamerauLevenshteinDistance() {
236+
this(null);
237+
}
238+
239+
/**
240+
* Constructs a new instance. If the threshold is not null, distance calculations will be limited to a maximum length.
241+
* If the threshold is null, the unlimited version of the algorithm will be used.
242+
*
243+
* @param threshold If this is null then distances calculations will not be limited. This may not be negative.
244+
*/
245+
public DamerauLevenshteinDistance(final Integer threshold) {
246+
if (threshold != null && threshold < 0) {
247+
throw new IllegalArgumentException("Threshold must not be negative");
248+
}
249+
this.threshold = threshold;
250+
}
251+
252+
/**
253+
* Computes the Damerau-Levenshtein distance between two Strings.
254+
*
255+
* <p>
256+
* A higher score indicates a greater distance.
257+
* </p>
258+
*
259+
* @param left the first input, must not be null.
260+
* @param right the second input, must not be null.
261+
* @return result distance, or -1 if threshold is exceeded.
262+
* @throws IllegalArgumentException if either String input {@code null}.
263+
*/
264+
@Override
265+
public Integer apply(final CharSequence left, final CharSequence right) {
266+
return apply(SimilarityInput.input(left), SimilarityInput.input(right));
267+
}
268+
269+
/**
270+
* Computes the Damerau-Levenshtein distance between two inputs.
271+
*
272+
* <p>
273+
* A higher score indicates a greater distance.
274+
* </p>
275+
*
276+
* @param <E> The type of similarity score unit.
277+
* @param left the first input, must not be null.
278+
* @param right the second input, must not be null.
279+
* @return result distance, or -1 if threshold is exceeded.
280+
* @throws IllegalArgumentException if either String input {@code null}.
281+
* @since 1.13.0
282+
*/
283+
public <E> Integer apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
284+
if (threshold != null) {
285+
return limitedCompare(left, right, threshold);
286+
}
287+
return unlimitedCompare(left, right);
288+
}
289+
290+
/**
291+
* Gets the distance threshold.
292+
*
293+
* @return The distance threshold.
294+
*/
295+
public Integer getThreshold() {
296+
return threshold;
297+
}
298+
}

0 commit comments

Comments
 (0)