Skip to content

Commit 43bd2ad

Browse files
committed
Catch up to upstream 1.2.0 release by adding early termination support to Levenshtein and WeightedLevenshtein
1 parent 551e431 commit 43bd2ad

File tree

4 files changed

+78
-4
lines changed

4 files changed

+78
-4
lines changed

src/F23.StringSimilarity/Levenshtein.cs

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,17 @@ namespace F23.StringSimilarity
3434
/// change one string into the other.
3535
public class Levenshtein : IMetricStringDistance
3636
{
37+
/// <summary>
38+
/// Convenience overload that delegates to Distance(s1, s2, Int32.MaxValue).
39+
/// </summary>
40+
/// <param name="s1">The first string to compare.</param>
41+
/// <param name="s2">The second string to compare.</param>
42+
/// <returns>The Levenshtein distance between the two strings, as returned by the three-argument overload.</returns>
43+
public double Distance(string s1, string s2)
44+
{
45+
return Distance(s1, s2, int.MaxValue);
46+
}
47+
3748
/// <summary>
3849
/// The Levenshtein distance, or edit distance, between two words is the
3950
/// minimum number of single-character edits (insertions, deletions or
@@ -56,9 +67,14 @@ public class Levenshtein : IMetricStringDistance
5667
/// </summary>
5768
/// <param name="s1">The first string to compare.</param>
5869
/// <param name="s2">The second string to compare.</param>
70+
/// <param name="limit">The distance threshold for early termination. As soon as
71+
/// every entry in the current row is at least this value, the
72+
/// calculation stops and the limit itself is returned. Use it when you
73+
/// only care about strings within a certain distance of each other.
74+
/// Set this to Int32.MaxValue to run the calculation to completion in every case.</param>
5975
/// <returns>The Levenshtein distance between strings</returns>
6076
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
61-
public double Distance(string s1, string s2)
77+
public double Distance(string s1, string s2, int limit)
6278
{
6379
if (s1 == null)
6480
{
@@ -105,6 +121,8 @@ public double Distance(string s1, string s2)
105121
// edit distance is delete (i+1) chars from s to match empty t
106122
v1[0] = i + 1;
107123

124+
int minv1 = v1[0];
125+
108126
// use formula to fill in the rest of the row
109127
for (int j = 0; j < s2.Length; j++)
110128
{
@@ -118,10 +136,17 @@ public double Distance(string s1, string s2)
118136
Math.Min(
119137
v0[j + 1] + 1, // Cost of remove
120138
v0[j] + cost)); // Cost of substitution
139+
140+
minv1 = Math.Min(minv1, v1[j + 1]);
141+
}
142+
143+
if (minv1 >= limit)
144+
{
145+
return limit;
121146
}
122147

123148
// copy v1 (current row) to v0 (previous row) for next iteration
124-
//System.arraycopy(v1, 0, v0, 0, v0.length);
149+
// System.arraycopy(v1, 0, v0, 0, v0.length);
125150

126151
// Flip references to current and previous row
127152
vtemp = v0;

src/F23.StringSimilarity/WeightedLevenshtein.cs

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,30 @@ public WeightedLevenshtein(ICharacterSubstitution characterSubstitution,
5959
_characterInsDel = characterInsDel;
6060
}
6161

62+
/// <summary>
63+
/// Convenience overload that delegates to Distance(s1, s2, Double.MaxValue).
64+
/// </summary>
65+
/// <param name="s1">The first string to compare.</param>
66+
/// <param name="s2">The second string to compare.</param>
67+
/// <returns>The computed weighted Levenshtein distance, as returned by the three-argument overload.</returns>
68+
public double Distance(string s1, string s2)
69+
{
70+
return Distance(s1, s2, double.MaxValue);
71+
}
72+
6273
/// <summary>
6374
/// Compute Levenshtein distance using provided weights for substitution.
6475
/// </summary>
6576
/// <param name="s1">The first string to compare.</param>
6677
/// <param name="s2">The second string to compare.</param>
78+
/// <param name="limit">The distance threshold for early termination. As soon as
79+
/// every entry in the current row is at least this value, the
80+
/// calculation stops and the limit itself is returned. Use it when you
81+
/// only care about strings within a certain distance of each other.
82+
/// Set this to Double.MaxValue to run the calculation to completion in every case.</param>
6783
/// <returns>The computed weighted Levenshtein distance.</returns>
6884
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
69-
public double Distance(string s1, string s2)
85+
public double Distance(string s1, string s2, double limit)
7086
{
7187
if (s1 == null)
7288
{
@@ -93,7 +109,7 @@ public double Distance(string s1, string s2)
93109
return s1.Length;
94110
}
95111

96-
// create two work vectors of integer distances
112+
// create two work vectors of floating point (i.e. weighted) distances
97113
double[] v0 = new double[s2.Length + 1];
98114
double[] v1 = new double[s2.Length + 1];
99115
double[] vtemp;
@@ -118,6 +134,8 @@ public double Distance(string s1, string s2)
118134
// to match empty t.
119135
v1[0] = v0[0] + deletionCost;
120136

137+
double minv1 = v1[0];
138+
121139
// use formula to fill in the rest of the row
122140
for (int j = 0; j < s2.Length; j++)
123141
{
@@ -136,6 +154,13 @@ public double Distance(string s1, string s2)
136154
Math.Min(
137155
v0[j + 1] + deletionCost, // Cost of deletion
138156
v0[j] + cost)); // Cost of substitution
157+
158+
minv1 = Math.Min(minv1, v1[j + 1]);
159+
}
160+
161+
if (minv1 >= limit)
162+
{
163+
return limit;
139164
}
140165

141166
// copy v1 (current row) to v0 (previous row) for next iteration

test/F23.StringSimilarity.Tests/LevenshteinTest.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ public void TestDistance()
4242
Assert.Equal(expected: 2.0, actual: instance.Distance("My string", "M string2"));
4343
Assert.Equal(expected: 1.0, actual: instance.Distance("My string", "My $tring"));
4444

45+
// With limits.
46+
Assert.Equal(2.0, instance.Distance("My string", "M string2", 4));
47+
Assert.Equal(2.0, instance.Distance("My string", "M string2", 2));
48+
Assert.Equal(1.0, instance.Distance("My string", "M string2", 1));
49+
4550
NullEmptyTests.TestDistance(instance);
4651
}
4752
}

test/F23.StringSimilarity.Tests/WeightedLevenshteinTest.cs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@ public void TestDistance()
4242
Assert.Equal(1.0, instance.Distance("Strng", "String"), 1);
4343
Assert.Equal(1.0, instance.Distance("String", "Strng"), 1);
4444

45+
// With limits.
46+
Assert.Equal(0.0, instance.Distance("String1", "String1", double.MaxValue), 1);
47+
Assert.Equal(0.0, instance.Distance("String1", "String1", 2.0), 1);
48+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", double.MaxValue), 1);
49+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", 2.0), 1);
50+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", 1.5), 1);
51+
Assert.Equal(1.0, instance.Distance("String1", "Srring2", 1.0), 1);
52+
Assert.Equal(4.0, instance.Distance("String1", "Potato", 4.0), 1);
53+
4554
NullEmptyTests.TestDistance(instance);
4655
}
4756

@@ -54,6 +63,7 @@ public void TestDistanceCharacterInsDelInterface()
5463
Assert.Equal(0.0, instance.Distance("String1", "String1"), 1);
5564
Assert.Equal(0.5, instance.Distance("String1", "Srring1"), 1);
5665
Assert.Equal(1.5, instance.Distance("String1", "Srring2"), 1);
66+
5767
// Cost of insert of 'i' is less than normal, so these scores are
5868
// different than testDistance above. Note that the cost of delete
5969
// has been set differently than the cost of insert, so the distance
@@ -63,6 +73,15 @@ public void TestDistanceCharacterInsDelInterface()
6373
Assert.Equal(1.0, instance.Distance("Strig", "String"), 1);
6474
Assert.Equal(1.0, instance.Distance("String", "Strig"), 1);
6575

76+
// Same as above with limits.
77+
Assert.Equal(0.0, instance.Distance("String1", "String1", double.MaxValue), 1);
78+
Assert.Equal(0.0, instance.Distance("String1", "String1", 2.0), 1);
79+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", double.MaxValue), 1);
80+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", 2.0), 1);
81+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", 1.5), 1);
82+
Assert.Equal(1.0, instance.Distance("String1", "Srring2", 1.0), 1);
83+
Assert.Equal(4.0, instance.Distance("String1", "Potato", 4.0), 1);
84+
6685
NullEmptyTests.TestDistance(instance);
6786
}
6887

0 commit comments

Comments
 (0)