Skip to content

Commit 4e75331

Browse files
authored
Merge pull request #10 from paulirwin/java-catchup
Catch up code to upstream 0.20 release (3797290)
2 parents 6718a54 + 7ae78ca commit 4e75331

File tree

16 files changed

+397
-467
lines changed

16 files changed

+397
-467
lines changed

src/F23.StringSimilarity/Cosine.cs

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
*/
2424

2525
using System;
26+
using System.Collections.Generic;
2627
using F23.StringSimilarity.Interfaces;
28+
// ReSharper disable LoopCanBeConvertedToQuery
2729

2830
namespace F23.StringSimilarity
2931
{
@@ -47,17 +49,16 @@ public Cosine(int k) : base(k) { }
4749
/// Default k is 3.
4850
/// </summary>
4951
public Cosine() { }
50-
52+
5153
public double Similarity(string s1, string s2)
5254
{
5355
if (s1.Length < k || s2.Length < k)
5456
{
5557
return 0;
5658
}
5759

58-
KShingling ks = new KShingling(k);
59-
int[] profile1 = ks.GetArrayProfile(s1);
60-
int[] profile2 = ks.GetArrayProfile(s2);
60+
var profile1 = GetProfile(s1);
61+
var profile2 = GetProfile(s2);
6162

6263
return DotProduct(profile1, profile2) / (Norm(profile1) * Norm(profile2));
6364
}
@@ -68,31 +69,49 @@ public double Similarity(string s1, string s2)
6869
* @param profile
6970
* @return L2 norm
7071
*/
71-
private static double Norm(int[] profile)
72+
private static double Norm(IDictionary<string, int> profile)
7273
{
7374
double agg = 0;
7475

75-
foreach (var v in profile)
76+
foreach (var entry in profile)
7677
{
77-
agg += (double)v * v;
78+
agg += 1.0 * entry.Value * entry.Value;
7879
}
7980

8081
return Math.Sqrt(agg);
8182
}
8283

83-
private static double DotProduct(int[] profile1, int[] profile2)
84+
private static double DotProduct(IDictionary<string, int> profile1,
85+
IDictionary<string, int> profile2)
8486
{
85-
int length = Math.Min(profile1.Length, profile2.Length);
87+
// Loop over the smallest map
88+
var small_profile = profile2;
89+
var large_profile = profile1;
90+
91+
if (profile1.Count < profile2.Count)
92+
{
93+
small_profile = profile1;
94+
large_profile = profile2;
95+
}
8696

8797
double agg = 0;
88-
for (int i = 0; i < length; i++)
98+
foreach (var entry in small_profile)
8999
{
90-
agg += (double)profile1[i] * profile2[i];
100+
int i;
101+
102+
if (!large_profile.TryGetValue(entry.Key, out i)) continue;
103+
104+
agg += 1.0 * entry.Value * i;
91105
}
106+
92107
return agg;
93108
}
94109

95110
public double Distance(string s1, string s2)
96111
=> 1.0 - Similarity(s1, s2);
112+
113+
public double Similarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
114+
=> DotProduct(profile1, profile2)
115+
/ (Norm(profile1) * Norm(profile2));
97116
}
98117
}

src/F23.StringSimilarity/Damerau.cs

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,12 @@ public double Distance(string s1, string s2)
5252

5353
for (int d = 0; d < s1.Length; d++)
5454
{
55-
if (!da.ContainsKey(s1[d]))
56-
{
57-
da[s1[d]] = 0;
58-
}
55+
da[s1[d]] = 0;
5956
}
6057

6158
for (int d = 0; d < s2.Length; d++)
6259
{
63-
if (!da.ContainsKey(s2[d]))
64-
{
65-
da[s2[d]] = 0;
66-
}
60+
da[s2[d]] = 0;
6761
}
6862

6963
// Create the distance matrix H[0 .. s1.length+1][0 .. s2.length+1]

src/F23.StringSimilarity/Experimental/Sift4.cs

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2016 feature[23]
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
using System;
26+
using System.Collections.Generic;
27+
using F23.StringSimilarity.Interfaces;
28+
29+
namespace F23.StringSimilarity.Experimental
30+
{
31+
/// <summary>
32+
/// Sift4 - a general purpose string distance algorithm inspired by JaroWinkler
33+
/// and Longest Common Subsequence.
34+
/// Original JavaScript algorithm by siderite, Java port by Nathan Fischer, 2016.
35+
/// https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
36+
/// </summary>
37+
public class Sift4 : IStringDistance
38+
{
39+
private const int DEFAULT_MAX_OFFSET = 10;
40+
41+
/// <summary>
42+
/// Gets or sets the maximum distance to search for character transposition.
43+
/// Computational cost of the algorithm is O(n * MaxOffset).
44+
/// </summary>
45+
public int MaxOffset { get; set; } = DEFAULT_MAX_OFFSET;
46+
47+
/// <summary>
48+
/// Used to store relation between same character in different positions
49+
/// c1 and c2 in the input strings.
50+
/// </summary>
51+
/// <remarks>
52+
/// .NET port notes: should this be a struct instead?
53+
/// </remarks>
54+
private class Offset
55+
{
56+
internal readonly int c1;
57+
internal readonly int c2;
58+
internal bool trans;
59+
60+
internal Offset(int c1, int c2, bool trans)
61+
{
62+
this.c1 = c1;
63+
this.c2 = c2;
64+
this.trans = trans;
65+
}
66+
}
67+
68+
/// <summary>
69+
/// Sift4 - a general purpose string distance algorithm inspired by JaroWinkler
70+
/// and Longest Common Subsequence.
71+
/// Original JavaScript algorithm by siderite, Java port by Nathan Fischer, 2016.
72+
/// https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
73+
/// </summary>
74+
/// <param name="s1"></param>
75+
/// <param name="s2"></param>
76+
/// <returns></returns>
77+
public double Distance(string s1, string s2)
78+
{
79+
if (string.IsNullOrEmpty(s1))
80+
{
81+
if (s2 == null)
82+
{
83+
return 0;
84+
}
85+
86+
return s2.Length;
87+
}
88+
89+
if (string.IsNullOrEmpty(s2))
90+
{
91+
return s1.Length;
92+
}
93+
94+
int l1 = s1.Length;
95+
int l2 = s2.Length;
96+
97+
int c1 = 0; //cursor for string 1
98+
int c2 = 0; //cursor for string 2
99+
int lcss = 0; //largest common subsequence
100+
int local_cs = 0; //local common substring
101+
int trans = 0; //number of transpositions ('ab' vs 'ba')
102+
103+
// offset pair array, for computing the transpositions
104+
var offset_arr = new List<Offset>();
105+
106+
while ((c1 < l1) && (c2 < l2))
107+
{
108+
if (s1[c1] == s2[c2])
109+
{
110+
local_cs++;
111+
bool is_trans = false;
112+
// see if current match is a transposition
113+
int i = 0;
114+
while (i < offset_arr.Count)
115+
{
116+
Offset ofs = offset_arr[i];
117+
if (c1 <= ofs.c1 || c2 <= ofs.c2)
118+
{
119+
// when two matches cross, the one considered a
120+
// transposition is the one with the largest difference
121+
// in offsets
122+
is_trans = Math.Abs(c2 - c1) >= Math.Abs(ofs.c2 - ofs.c1);
123+
124+
if (is_trans)
125+
{
126+
trans++;
127+
}
128+
else
129+
{
130+
if (!ofs.trans)
131+
{
132+
ofs.trans = true;
133+
trans++;
134+
}
135+
}
136+
137+
break;
138+
}
139+
else
140+
{
141+
if (c1 > ofs.c2 && c2 > ofs.c1)
142+
{
143+
offset_arr.RemoveAt(i);
144+
}
145+
else
146+
{
147+
i++;
148+
}
149+
}
150+
}
151+
152+
offset_arr.Add(new Offset(c1, c2, is_trans));
153+
}
154+
else
155+
{
156+
// s1.charAt(c1) != s2.charAt(c2)
157+
lcss += local_cs;
158+
local_cs = 0;
159+
if (c1 != c2)
160+
{
161+
//using min allows the computation of transpositions
162+
c1 = Math.Min(c1, c2);
163+
c2 = c1;
164+
}
165+
166+
// if matching characters are found, remove 1 from both cursors
167+
// (they get incremented at the end of the loop)
168+
// so that we can have only one code block handling matches
169+
for (int i = 0;
170+
i < MaxOffset && (c1 + i < l1 || c2 + i < l2);
171+
i++)
172+
{
173+
if ((c1 + i < l1) && (s1[c1 + i] == s2[c2]))
174+
{
175+
c1 += i - 1;
176+
c2--;
177+
break;
178+
}
179+
180+
if ((c2 + i < l2) && (s1[c1] == s2[c2 + i]))
181+
{
182+
c1--;
183+
c2 += i - 1;
184+
break;
185+
}
186+
}
187+
}
188+
c1++;
189+
c2++;
190+
// this covers the case where the last match is on the last token
191+
// in list, so that it can compute transpositions correctly
192+
if ((c1 >= l1) || (c2 >= l2))
193+
{
194+
lcss += local_cs;
195+
local_cs = 0;
196+
c1 = Math.Min(c1, c2);
197+
c2 = c1;
198+
}
199+
}
200+
201+
lcss += local_cs;
202+
203+
// add the cost of transpositions to the final result
204+
return Math.Round((double) (Math.Max(l1, l2) - lcss + trans));
205+
}
206+
}
207+
}

src/F23.StringSimilarity/F23.StringSimilarity.csproj

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
<ItemGroup>
4343
<Compile Include="Cosine.cs" />
4444
<Compile Include="Damerau.cs" />
45+
<Compile Include="Experimental\Sift4.cs" />
4546
<Compile Include="ICharacterSubstitution.cs" />
4647
<Compile Include="Interfaces\IMetricStringDistance.cs" />
4748
<Compile Include="Interfaces\INormalizedStringDistance.cs" />
@@ -50,7 +51,6 @@
5051
<Compile Include="Interfaces\IStringSimilarity.cs" />
5152
<Compile Include="Jaccard.cs" />
5253
<Compile Include="JaroWinkler.cs" />
53-
<Compile Include="KShingling.cs" />
5454
<Compile Include="Levenshtein.cs" />
5555
<Compile Include="LongestCommonSubsequence.cs" />
5656
<Compile Include="MetricLCS.cs" />
@@ -61,8 +61,6 @@
6161
<Compile Include="QGram.cs" />
6262
<Compile Include="ShingleBased.cs" />
6363
<Compile Include="SorensenDice.cs" />
64-
<Compile Include="StringProfile.cs" />
65-
<Compile Include="StringSet.cs" />
6664
<Compile Include="Support\ArrayExtensions.cs" />
6765
<Compile Include="Utils\SparseBooleanVector.cs" />
6866
<Compile Include="Utils\SparseIntegerVector.cs" />

src/F23.StringSimilarity/Jaccard.cs

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@
2323
*/
2424

2525
using System;
26+
using System.Collections.Generic;
2627
using F23.StringSimilarity.Interfaces;
2728
using F23.StringSimilarity.Support;
29+
// ReSharper disable LoopCanBeConvertedToQuery
2830

2931
namespace F23.StringSimilarity
3032
{
@@ -42,32 +44,22 @@ public Jaccard() { }
4244
/// <returns>Similarity</returns>
4345
public double Similarity(string s1, string s2)
4446
{
45-
KShingling ks = new KShingling(k);
46-
int[] profile1 = ks.GetArrayProfile(s1);
47-
int[] profile2 = ks.GetArrayProfile(s2);
47+
var profile1 = GetProfile(s1);
48+
var profile2 = GetProfile(s2);
4849

49-
int length = Math.Max(profile1.Length, profile2.Length);
50-
51-
profile1 = profile1.WithPadding(length);
52-
profile2 = profile2.WithPadding(length);
50+
var union = new HashSet<string>();
51+
union.UnionWith(profile1.Keys);
52+
union.UnionWith(profile2.Keys);
5353

5454
int inter = 0;
55-
int union = 0;
5655

57-
for (int i = 0; i < length; i++)
56+
foreach (var key in union)
5857
{
59-
if (profile1[i] > 0 || profile2[i] > 0)
60-
{
61-
union++;
62-
63-
if (profile1[i] > 0 && profile2[i] > 0)
64-
{
65-
inter++;
66-
}
67-
}
58+
if (profile1.ContainsKey(key) && profile2.ContainsKey(key))
59+
inter++;
6860
}
6961

70-
return 1.0 * inter / union;
62+
return 1.0 * inter / union.Count;
7163
}
7264

7365

0 commit comments

Comments
 (0)