Skip to content

Commit f742a54

Browse files
author
Eirik Lorgen Tanberg
committed
[TEXT-235] Implement Damerau-Levenshtein distance
1 parent c0599f4 commit f742a54

File tree

2 files changed

+563
-0
lines changed

2 files changed

+563
-0
lines changed
Lines changed: 346 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,346 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* https://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.commons.text.similarity;
18+
19+
/**
20+
* An algorithm for measuring the difference between two character sequences using the
21+
* <a href="https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance">Damerau-Levenshtein Distance</a>.
22+
*
23+
* <p>
24+
* This is the number of changes needed to change one sequence into another, where each change is a single character
25+
* modification (deletion, insertion, substitution, or transposition of two adjacent characters).
26+
* </p>
27+
*
28+
* @see <a href="https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance">Damerau-Levenshtein Distance on Wikipedia</a>
29+
* @since 1.15.0
30+
*/
31+
public class DamerauLevenshteinDistance implements EditDistance<Integer> {
32+
33+
/**
34+
* The singleton instance.
35+
*/
36+
private static final DamerauLevenshteinDistance INSTANCE = new DamerauLevenshteinDistance();
37+
38+
/**
39+
* Gets the default instance.
40+
*
41+
* @return The default instance.
42+
*/
43+
public static DamerauLevenshteinDistance getDefaultInstance() {
44+
return INSTANCE;
45+
}
46+
47+
/**
48+
* Utility function to ensure distance is valid according to threshold
49+
*
50+
* @param distance The distance value
51+
* @param threshold The threshold value
52+
* @return The distance value, or {@code -1} if distance is greater than threshold
53+
*/
54+
private static int clampDistance(final int distance, final int threshold) {
55+
return distance > threshold ? -1 : distance;
56+
}
57+
58+
/**
59+
* Finds the Damerau-Levenshtein distance between two CharSequences if it's less than or equal to a given threshold.
60+
*
61+
* @param left the first SimilarityInput, must not be null.
62+
* @param right the second SimilarityInput, must not be null.
63+
* @param threshold the target threshold, must not be negative.
64+
* @return result distance, or -1 if distance exceeds threshold
65+
*/
66+
private static <E> int limitedCompare(SimilarityInput<E> left, SimilarityInput<E> right, final int threshold) {
67+
if (left == null || right == null) {
68+
throw new IllegalArgumentException("Left/right inputs must not be null");
69+
}
70+
71+
if (threshold < 0) {
72+
throw new IllegalArgumentException("Threshold can not be negative");
73+
}
74+
75+
/*
76+
* Implementation based on https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
77+
*/
78+
79+
int leftLength = left.length();
80+
int rightLength = right.length();
81+
82+
if (leftLength == 0) {
83+
return clampDistance(rightLength, threshold);
84+
}
85+
86+
if (rightLength == 0) {
87+
return clampDistance(leftLength, threshold);
88+
}
89+
90+
// Inspired by LevenshteinDistance impl; swap the input strings to consume less memory
91+
if (rightLength > leftLength) {
92+
final SimilarityInput<E> tmp = left;
93+
left = right;
94+
right = tmp;
95+
leftLength = rightLength;
96+
rightLength = right.length();
97+
}
98+
99+
// If the difference between the lengths of the strings is greater than the threshold, we must at least do
100+
// threshold operations so we can return early
101+
if (leftLength - rightLength > threshold) {
102+
return -1;
103+
}
104+
105+
// Use three arrays of minimum possible size to reduce memory usage. This avoids having to create a 2D
106+
// array of size leftLength * rightLength
107+
int[] curr = new int[rightLength + 1];
108+
int[] prev = new int[rightLength + 1];
109+
int[] prevPrev = new int[rightLength + 1];
110+
int[] temp; // Temp variable use to shuffle arrays at the end of each iteration
111+
112+
int rightIndex, leftIndex, cost, minCost;
113+
114+
// Changing empty sequence to [0..i] requires i insertions
115+
for (rightIndex = 0; rightIndex <= rightLength; rightIndex++) {
116+
prev[rightIndex] = rightIndex;
117+
}
118+
119+
// Calculate how many operations it takes to change right[0..rightIndex] into left[0..leftIndex]
120+
// For each iteration
121+
// - curr[i] contains the cost of changing right[0..i] into left[0..leftIndex]
122+
// (computed in current iteration)
123+
// - prev[i] contains the cost of changing right[0..i] into left[0..leftIndex - 1]
124+
// (computed in previous iteration)
125+
// - prevPrev[i] contains the cost of changing right[0..i] into left[0..leftIndex - 2]
126+
// (computed in iteration before previous)
127+
for (leftIndex = 1; leftIndex <= leftLength; leftIndex++) {
128+
// For right[0..0] we must insert leftIndex characters, which means the cost is always leftIndex
129+
curr[0] = leftIndex;
130+
131+
minCost = Integer.MAX_VALUE;
132+
133+
for (rightIndex = 1; rightIndex <= rightLength; rightIndex++) {
134+
cost = (left.at(leftIndex - 1) == right.at(rightIndex - 1)) ? 0 : 1;
135+
136+
// Select cheapest operation
137+
curr[rightIndex] = Math.min(
138+
Math.min(
139+
prev[rightIndex] + 1, // Delete current character
140+
curr[rightIndex - 1] + 1 // Insert current character
141+
),
142+
prev[rightIndex - 1] + cost // Replace (or no cost if same character)
143+
);
144+
145+
// Check if adjacent characters are the same -> transpose if cheaper
146+
if (leftIndex > 1
147+
&& rightIndex > 1
148+
&& left.at(leftIndex - 1) == right.at(rightIndex - 2)
149+
&& left.at(leftIndex - 2) == right.at(rightIndex - 1)) {
150+
// Use cost here, to properly handle two subsequent equal letters
151+
curr[rightIndex] = Math.min(curr[rightIndex], prevPrev[rightIndex - 2] + cost);
152+
}
153+
154+
minCost = Math.min(curr[rightIndex], minCost);
155+
}
156+
157+
// If there was no total cost for this entire iteration to transform right to left[0..leftIndex], there
158+
// can not be a way to do it below threshold. This is because we have no way to reduce the overall cost
159+
// in later operations.
160+
if (minCost > threshold) {
161+
return -1;
162+
}
163+
164+
// Rotate arrays for next iteration
165+
temp = prevPrev;
166+
prevPrev = prev;
167+
prev = curr;
168+
curr = temp;
169+
}
170+
171+
// Prev contains the value computed in the latest iteration
172+
return clampDistance(prev[rightLength], threshold);
173+
}
174+
175+
/**
176+
* Finds the Damerau-Levenshtein distance between two inputs using optimal string alignment.
177+
*
178+
* @param left the first CharSequence, must not be null.
179+
* @param right the second CharSequence, must not be null.
180+
* @return result distance.
181+
* @throws IllegalArgumentException if either CharSequence input is {@code null}.
182+
*/
183+
private static <E> int unlimitedCompare(SimilarityInput<E> left, SimilarityInput<E> right) {
184+
if (left == null || right == null) {
185+
throw new IllegalArgumentException("Left/right inputs must not be null");
186+
}
187+
188+
/*
189+
* Implementation based on https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
190+
*/
191+
192+
int leftLength = left.length();
193+
int rightLength = right.length();
194+
195+
if (leftLength == 0) {
196+
return rightLength;
197+
}
198+
199+
if (rightLength == 0) {
200+
return leftLength;
201+
}
202+
203+
// Inspired by LevenshteinDistance impl; swap the input strings to consume less memory
204+
if (rightLength > leftLength) {
205+
final SimilarityInput<E> tmp = left;
206+
left = right;
207+
right = tmp;
208+
leftLength = rightLength;
209+
rightLength = right.length();
210+
}
211+
212+
// Use three arrays of minimum possible size to reduce memory usage. This avoids having to create a 2D
213+
// array of size leftLength * rightLength
214+
int[] curr = new int[rightLength + 1];
215+
int[] prev = new int[rightLength + 1];
216+
int[] prevPrev = new int[rightLength + 1];
217+
int[] temp; // Temp variable use to shuffle arrays at the end of each iteration
218+
219+
int rightIndex, leftIndex, cost;
220+
221+
// Changing empty sequence to [0..i] requires i insertions
222+
for (rightIndex = 0; rightIndex <= rightLength; rightIndex++) {
223+
prev[rightIndex] = rightIndex;
224+
}
225+
226+
// Calculate how many operations it takes to change right[0..rightIndex] into left[0..leftIndex]
227+
// For each iteration
228+
// - curr[i] contains the cost of changing right[0..i] into left[0..leftIndex]
229+
// (computed in current iteration)
230+
// - prev[i] contains the cost of changing right[0..i] into left[0..leftIndex - 1]
231+
// (computed in previous iteration)
232+
// - prevPrev[i] contains the cost of changing right[0..i] into left[0..leftIndex - 2]
233+
// (computed in iteration before previous)
234+
for (leftIndex = 1; leftIndex <= leftLength; leftIndex++) {
235+
// For right[0..0] we must insert leftIndex characters, which means the cost is always leftIndex
236+
curr[0] = leftIndex;
237+
238+
for (rightIndex = 1; rightIndex <= rightLength; rightIndex++) {
239+
cost = (left.at(leftIndex - 1) == right.at(rightIndex - 1)) ? 0 : 1;
240+
241+
// Select cheapest operation
242+
curr[rightIndex] = Math.min(
243+
Math.min(
244+
prev[rightIndex] + 1, // Delete current character
245+
curr[rightIndex - 1] + 1 // Insert current character
246+
),
247+
prev[rightIndex - 1] + cost // Replace (or no cost if same character)
248+
);
249+
250+
// Check if adjacent characters are the same -> transpose if cheaper
251+
if (leftIndex > 1
252+
&& rightIndex > 1
253+
&& left.at(leftIndex - 1) == right.at(rightIndex - 2)
254+
&& left.at(leftIndex - 2) == right.at(rightIndex - 1)) {
255+
// Use cost here, to properly handle two subsequent equal letters
256+
curr[rightIndex] = Math.min(curr[rightIndex], prevPrev[rightIndex - 2] + cost);
257+
}
258+
}
259+
260+
// Rotate arrays for next iteration
261+
temp = prevPrev;
262+
prevPrev = prev;
263+
prev = curr;
264+
curr = temp;
265+
}
266+
267+
// Prev contains the value computed in the latest iteration
268+
return prev[rightLength];
269+
}
270+
271+
/**
272+
* Threshold.
273+
*/
274+
private final Integer threshold;
275+
276+
/**
277+
* Constructs a default instance that uses a version of the algorithm that does not use a threshold parameter.
278+
*
279+
* @see DamerauLevenshteinDistance#getDefaultInstance()
280+
* @deprecated Use {@link #getDefaultInstance()}.
281+
*/
282+
@Deprecated
283+
public DamerauLevenshteinDistance() {
284+
this(null);
285+
}
286+
287+
/**
288+
* Constructs a new instance. If the threshold is not null, distance calculations will be limited to a maximum length.
289+
* If the threshold is null, the unlimited version of the algorithm will be used.
290+
*
291+
* @param threshold If this is null then distances calculations will not be limited. This may not be negative.
292+
*/
293+
public DamerauLevenshteinDistance(final Integer threshold) {
294+
if (threshold != null && threshold < 0) {
295+
throw new IllegalArgumentException("Threshold must not be negative");
296+
}
297+
this.threshold = threshold;
298+
}
299+
300+
/**
301+
* Computes the Damerau-Levenshtein distance between two Strings.
302+
*
303+
* <p>
304+
* A higher score indicates a greater distance.
305+
* </p>
306+
*
307+
* @param left the first input, must not be null.
308+
* @param right the second input, must not be null.
309+
* @return result distance, or -1 if threshold is exceeded.
310+
* @throws IllegalArgumentException if either String input {@code null}.
311+
*/
312+
@Override
313+
public Integer apply(final CharSequence left, final CharSequence right) {
314+
return apply(SimilarityInput.input(left), SimilarityInput.input(right));
315+
}
316+
317+
/**
318+
* Computes the Damerau-Levenshtein distance between two inputs.
319+
*
320+
* <p>
321+
* A higher score indicates a greater distance.
322+
* </p>
323+
*
324+
* @param <E> The type of similarity score unit.
325+
* @param left the first input, must not be null.
326+
* @param right the second input, must not be null.
327+
* @return result distance, or -1 if threshold is exceeded.
328+
* @throws IllegalArgumentException if either String input {@code null}.
329+
* @since 1.13.0
330+
*/
331+
public <E> Integer apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
332+
if (threshold != null) {
333+
return limitedCompare(left, right, threshold);
334+
}
335+
return unlimitedCompare(left, right);
336+
}
337+
338+
/**
339+
* Gets the distance threshold.
340+
*
341+
* @return The distance threshold.
342+
*/
343+
public Integer getThreshold() {
344+
return threshold;
345+
}
346+
}

0 commit comments

Comments
 (0)