Skip to content

Commit d4bf691

Browse files
committed
Catch up code to upstream 0.19 release (d9439d6)
1 parent 845d4f1 commit d4bf691

File tree

13 files changed

+139
-459
lines changed

13 files changed

+139
-459
lines changed

src/F23.StringSimilarity/Cosine.cs

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
*/
2424

2525
using System;
26+
using System.Collections.Generic;
2627
using F23.StringSimilarity.Interfaces;
28+
// ReSharper disable LoopCanBeConvertedToQuery
2729

2830
namespace F23.StringSimilarity
2931
{
@@ -47,17 +49,16 @@ public Cosine(int k) : base(k) { }
4749
/// Default k is 3.
4850
/// </summary>
4951
public Cosine() { }
50-
52+
5153
public double Similarity(string s1, string s2)
5254
{
5355
if (s1.Length < k || s2.Length < k)
5456
{
5557
return 0;
5658
}
5759

58-
KShingling ks = new KShingling(k);
59-
int[] profile1 = ks.GetArrayProfile(s1);
60-
int[] profile2 = ks.GetArrayProfile(s2);
60+
var profile1 = GetProfile(s1);
61+
var profile2 = GetProfile(s2);
6162

6263
return DotProduct(profile1, profile2) / (Norm(profile1) * Norm(profile2));
6364
}
@@ -68,31 +69,47 @@ public double Similarity(string s1, string s2)
6869
* @param profile
6970
* @return L2 norm
7071
*/
71-
private static double Norm(int[] profile)
72+
private static double Norm(IDictionary<string, int> profile)
7273
{
7374
double agg = 0;
7475

75-
foreach (var v in profile)
76+
foreach (var entry in profile)
7677
{
77-
agg += (double)v * v;
78+
agg += 1.0 * entry.Value * entry.Value;
7879
}
7980

8081
return Math.Sqrt(agg);
8182
}
8283

83-
private static double DotProduct(int[] profile1, int[] profile2)
84+
private static double DotProduct(IDictionary<string, int> profile1,
85+
IDictionary<string, int> profile2)
8486
{
85-
int length = Math.Min(profile1.Length, profile2.Length);
87+
// Loop over the smallest map
88+
var small_profile = profile2;
89+
var large_profile = profile1;
90+
91+
if (profile1.Count < profile2.Count)
92+
{
93+
small_profile = profile1;
94+
large_profile = profile2;
95+
}
8696

8797
double agg = 0;
88-
for (int i = 0; i < length; i++)
98+
foreach (var entry in small_profile)
8999
{
90-
agg += (double)profile1[i] * profile2[i];
100+
if (!large_profile.ContainsKey(entry.Key)) continue;
101+
102+
agg += 1.0 * entry.Value * large_profile[entry.Key];
91103
}
104+
92105
return agg;
93106
}
94107

95108
public double Distance(string s1, string s2)
96109
=> 1.0 - Similarity(s1, s2);
110+
111+
public double Similarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
112+
=> DotProduct(profile1, profile2)
113+
/ (Norm(profile1) * Norm(profile2));
97114
}
98115
}

src/F23.StringSimilarity/F23.StringSimilarity.csproj

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050
<Compile Include="Interfaces\IStringSimilarity.cs" />
5151
<Compile Include="Jaccard.cs" />
5252
<Compile Include="JaroWinkler.cs" />
53-
<Compile Include="KShingling.cs" />
5453
<Compile Include="Levenshtein.cs" />
5554
<Compile Include="LongestCommonSubsequence.cs" />
5655
<Compile Include="MetricLCS.cs" />
@@ -61,8 +60,6 @@
6160
<Compile Include="QGram.cs" />
6261
<Compile Include="ShingleBased.cs" />
6362
<Compile Include="SorensenDice.cs" />
64-
<Compile Include="StringProfile.cs" />
65-
<Compile Include="StringSet.cs" />
6663
<Compile Include="Support\ArrayExtensions.cs" />
6764
<Compile Include="Utils\SparseBooleanVector.cs" />
6865
<Compile Include="Utils\SparseIntegerVector.cs" />

src/F23.StringSimilarity/Jaccard.cs

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@
2323
*/
2424

2525
using System;
26+
using System.Collections.Generic;
2627
using F23.StringSimilarity.Interfaces;
2728
using F23.StringSimilarity.Support;
29+
// ReSharper disable LoopCanBeConvertedToQuery
2830

2931
namespace F23.StringSimilarity
3032
{
@@ -42,32 +44,22 @@ public Jaccard() { }
4244
/// <returns>Similarity</returns>
4345
public double Similarity(string s1, string s2)
4446
{
45-
KShingling ks = new KShingling(k);
46-
int[] profile1 = ks.GetArrayProfile(s1);
47-
int[] profile2 = ks.GetArrayProfile(s2);
47+
var profile1 = GetProfile(s1);
48+
var profile2 = GetProfile(s2);
4849

49-
int length = Math.Max(profile1.Length, profile2.Length);
50-
51-
profile1 = profile1.WithPadding(length);
52-
profile2 = profile2.WithPadding(length);
50+
var union = new HashSet<string>();
51+
union.UnionWith(profile1.Keys);
52+
union.UnionWith(profile2.Keys);
5353

5454
int inter = 0;
55-
int union = 0;
5655

57-
for (int i = 0; i < length; i++)
56+
foreach (var key in union)
5857
{
59-
if (profile1[i] > 0 || profile2[i] > 0)
60-
{
61-
union++;
62-
63-
if (profile1[i] > 0 && profile2[i] > 0)
64-
{
65-
inter++;
66-
}
67-
}
58+
if (profile1.ContainsKey(key) && profile2.ContainsKey(key))
59+
inter++;
6860
}
6961

70-
return 1.0 * inter / union;
62+
return 1.0 * inter / union.Count;
7163
}
7264

7365

src/F23.StringSimilarity/KShingling.cs

Lines changed: 0 additions & 163 deletions
This file was deleted.

src/F23.StringSimilarity/NGram.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,11 @@ public double Distance(string s0, string s1)
6060
{
6161
if (sl == tl)
6262
{
63-
return 1;
63+
return 0;
6464
}
6565
else
6666
{
67-
return 0;
67+
return 1;
6868
}
6969
}
7070

@@ -128,7 +128,7 @@ public double Distance(string s0, string s1)
128128
}
129129
else
130130
{
131-
t_j = s1.Substring(j - n, j).ToCharArray();
131+
t_j = s1.Substring(j - n, n).ToCharArray();
132132
}
133133
d[0] = j;
134134
for (i = 1; i <= sl; i++)
@@ -160,7 +160,7 @@ public double Distance(string s0, string s1)
160160

161161
// Our last action in the above loop was to switch d and p, so p now
162162
// actually has the most recent cost counts
163-
return 1.0 - (p[sl] / Math.Max(tl, sl));
163+
return p[sl] / Math.Max(tl, sl);
164164
}
165165
}
166166
}

0 commit comments

Comments
 (0)