Skip to content

Commit 7ae78ca

Browse files
committed
Catch .NET port up to upstream 0.20 release
1 parent d4bf691 commit 7ae78ca

File tree

8 files changed

+266
-16
lines changed

8 files changed

+266
-16
lines changed

src/F23.StringSimilarity/Cosine.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,11 @@ private static double DotProduct(IDictionary<string, int> profile1,
9797
double agg = 0;
9898
foreach (var entry in small_profile)
9999
{
100-
if (!large_profile.ContainsKey(entry.Key)) continue;
100+
int i;
101+
102+
if (!large_profile.TryGetValue(entry.Key, out i)) continue;
101103

102-
agg += 1.0 * entry.Value * large_profile[entry.Key];
104+
agg += 1.0 * entry.Value * i;
103105
}
104106

105107
return agg;

src/F23.StringSimilarity/Damerau.cs

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,12 @@ public double Distance(string s1, string s2)
5252

5353
for (int d = 0; d < s1.Length; d++)
5454
{
55-
if (!da.ContainsKey(s1[d]))
56-
{
57-
da[s1[d]] = 0;
58-
}
55+
da[s1[d]] = 0;
5956
}
6057

6158
for (int d = 0; d < s2.Length; d++)
6259
{
63-
if (!da.ContainsKey(s2[d]))
64-
{
65-
da[s2[d]] = 0;
66-
}
60+
da[s2[d]] = 0;
6761
}
6862

6963
// Create the distance matrix H[0 .. s1.length+1][0 .. s2.length+1]
src/F23.StringSimilarity/Experimental/Sift4.cs

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2016 feature[23]
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
using System;
26+
using System.Collections.Generic;
27+
using F23.StringSimilarity.Interfaces;
28+
29+
namespace F23.StringSimilarity.Experimental
30+
{
31+
/// <summary>
32+
/// Sift4 - a general purpose string distance algorithm inspired by JaroWinkler
33+
/// and Longest Common Subsequence.
34+
/// Original JavaScript algorithm by siderite, Java port by Nathan Fischer 2016.
35+
/// https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
36+
/// </summary>
37+
public class Sift4 : IStringDistance
38+
{
39+
private const int DEFAULT_MAX_OFFSET = 10;
40+
41+
/// <summary>
42+
/// Gets or sets the maximum distance to search for character transposition.
43+
/// The computational cost of the algorithm is O(n * MaxOffset).
44+
/// </summary>
45+
public int MaxOffset { get; set; } = DEFAULT_MAX_OFFSET;
46+
47+
/// <summary>
48+
/// Used to store relation between same character in different positions
49+
/// c1 and c2 in the input strings.
50+
/// </summary>
51+
/// <remarks>
52+
/// .NET port notes: should this be a struct instead?
53+
/// </remarks>
54+
private class Offset
55+
{
56+
internal readonly int c1;
57+
internal readonly int c2;
58+
internal bool trans;
59+
60+
internal Offset(int c1, int c2, bool trans)
61+
{
62+
this.c1 = c1;
63+
this.c2 = c2;
64+
this.trans = trans;
65+
}
66+
}
67+
68+
/// <summary>
69+
/// Sift4 - a general purpose string distance algorithm inspired by JaroWinkler
70+
/// and Longest Common Subsequence.
71+
/// Original JavaScript algorithm by siderite, Java port by Nathan Fischer 2016.
72+
/// https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
73+
/// </summary>
74+
/// <param name="s1">The first string to compare; null is handled the same way as an empty string.</param>
75+
/// <param name="s2">The second string to compare; null is handled the same way as an empty string.</param>
76+
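/// <returns>The Sift4 distance: the length of the longer string, minus the number of matched characters, plus the number of transpositions. If either string is null or empty, the length of the other string (0 when both are).</returns>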
77+
public double Distance(string s1, string s2)
78+
{
79+
if (string.IsNullOrEmpty(s1))
80+
{
81+
if (s2 == null)
82+
{
83+
return 0;
84+
}
85+
86+
return s2.Length;
87+
}
88+
89+
if (string.IsNullOrEmpty(s2))
90+
{
91+
return s1.Length;
92+
}
93+
94+
int l1 = s1.Length;
95+
int l2 = s2.Length;
96+
97+
int c1 = 0; //cursor for string 1
98+
int c2 = 0; //cursor for string 2
99+
int lcss = 0; //largest common subsequence
100+
int local_cs = 0; //local common substring
101+
int trans = 0; //number of transpositions ('ab' vs 'ba')
102+
103+
// offset pair array, for computing the transpositions
104+
var offset_arr = new List<Offset>();
105+
106+
while ((c1 < l1) && (c2 < l2))
107+
{
108+
if (s1[c1] == s2[c2])
109+
{
110+
local_cs++;
111+
bool is_trans = false;
112+
// see if current match is a transposition
113+
int i = 0;
114+
while (i < offset_arr.Count)
115+
{
116+
Offset ofs = offset_arr[i];
117+
if (c1 <= ofs.c1 || c2 <= ofs.c2)
118+
{
119+
// when two matches cross, the one considered a
120+
// transposition is the one with the largest difference
121+
// in offsets
122+
is_trans = Math.Abs(c2 - c1) >= Math.Abs(ofs.c2 - ofs.c1);
123+
124+
if (is_trans)
125+
{
126+
trans++;
127+
}
128+
else
129+
{
130+
if (!ofs.trans)
131+
{
132+
ofs.trans = true;
133+
trans++;
134+
}
135+
}
136+
137+
break;
138+
}
139+
else
140+
{
141+
if (c1 > ofs.c2 && c2 > ofs.c1)
142+
{
143+
offset_arr.RemoveAt(i);
144+
}
145+
else
146+
{
147+
i++;
148+
}
149+
}
150+
}
151+
152+
offset_arr.Add(new Offset(c1, c2, is_trans));
153+
}
154+
else
155+
{
156+
// s1[c1] != s2[c2]
157+
lcss += local_cs;
158+
local_cs = 0;
159+
if (c1 != c2)
160+
{
161+
//using min allows the computation of transpositions
162+
c1 = Math.Min(c1, c2);
163+
c2 = c1;
164+
}
165+
166+
// if matching characters are found, remove 1 from both cursors
167+
// (they get incremented at the end of the loop)
168+
// so that we can have only one code block handling matches
169+
for (int i = 0;
170+
i < MaxOffset && (c1 + i < l1 || c2 + i < l2);
171+
i++)
172+
{
173+
if ((c1 + i < l1) && (s1[c1 + i] == s2[c2]))
174+
{
175+
c1 += i - 1;
176+
c2--;
177+
break;
178+
}
179+
180+
if ((c2 + i < l2) && (s1[c1] == s2[c2 + i]))
181+
{
182+
c1--;
183+
c2 += i - 1;
184+
break;
185+
}
186+
}
187+
}
188+
c1++;
189+
c2++;
190+
// this covers the case where the last match is on the last token
191+
// in list, so that it can compute transpositions correctly
192+
if ((c1 >= l1) || (c2 >= l2))
193+
{
194+
lcss += local_cs;
195+
local_cs = 0;
196+
c1 = Math.Min(c1, c2);
197+
c2 = c1;
198+
}
199+
}
200+
201+
lcss += local_cs;
202+
203+
// add the cost of transpositions to the final result
204+
return Math.Round((double) (Math.Max(l1, l2) - lcss + trans));
205+
}
206+
}
207+
}

src/F23.StringSimilarity/F23.StringSimilarity.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
<ItemGroup>
4343
<Compile Include="Cosine.cs" />
4444
<Compile Include="Damerau.cs" />
45+
<Compile Include="Experimental\Sift4.cs" />
4546
<Compile Include="ICharacterSubstitution.cs" />
4647
<Compile Include="Interfaces\IMetricStringDistance.cs" />
4748
<Compile Include="Interfaces\INormalizedStringDistance.cs" />

src/F23.StringSimilarity/QGram.cs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,14 +83,18 @@ public double Distance(string s1, string s2)
8383
int v1 = 0;
8484
int v2 = 0;
8585

86-
if (profile1.ContainsKey(key))
86+
int iv1;
87+
88+
if (profile1.TryGetValue(key, out iv1))
8789
{
88-
v1 = profile1[key];
90+
v1 = iv1;
8991
}
9092

91-
if (profile2.ContainsKey(key))
93+
int iv2;
94+
95+
if (profile2.TryGetValue(key, out iv2))
9296
{
93-
v2 = profile2[key];
97+
v2 = iv2;
9498
}
9599

96100
agg += Math.Abs(v1 - v2);

src/F23.StringSimilarity/ShingleBased.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,11 @@ protected IDictionary<string, int> GetProfile(string s)
7272
{
7373
var shingle = string_no_space.Substring(i, k);
7474

75-
if (shingles.ContainsKey(shingle))
75+
int old;
76+
77+
if (shingles.TryGetValue(shingle, out old))
7678
{
77-
shingles[shingle] += 1;
79+
shingles[shingle] = old + 1;
7880
}
7981
else
8082
{
test/F23.StringSimilarity.Tests/Experimental/Sift4Test.cs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
using System;
2+
using F23.StringSimilarity.Experimental;
3+
using Xunit;
4+
5+
namespace F23.StringSimilarity.Tests.Experimental
6+
{
7+
public class Sift4Test
8+
{
9+
[Fact]
10+
public void TestDistance()
11+
{
12+
string s1 = "This is the first string";
13+
string s2 = "And this is another string";
14+
15+
var sift4 = new Sift4
16+
{
17+
MaxOffset = 5
18+
};
19+
20+
double result = sift4.Distance(s1, s2);
21+
22+
Assert.Equal(
23+
expected: 11.0,
24+
actual: result,
25+
precision: 1); // 0.0
26+
27+
sift4.MaxOffset = 10;
28+
29+
result = sift4.Distance(
30+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
31+
"Amet Lorm ispum dolor sit amet, consetetur adixxxpiscing elit.");
32+
33+
Assert.Equal(
34+
expected: 12.0,
35+
actual: result,
36+
precision: 1); // 0.0
37+
}
38+
}
39+
}

test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
</Choose>
6666
<ItemGroup>
6767
<Compile Include="DamerauTest.cs" />
68+
<Compile Include="Experimental\Sift4Test.cs" />
6869
<Compile Include="NGramTest.cs" />
6970
<Compile Include="OptimalStringAlignmentTest.cs" />
7071
<Compile Include="Properties\AssemblyInfo.cs" />

0 commit comments

Comments
 (0)