Skip to content

Commit e7f31aa

Browse files
committed
Catch up to 2.0.0 release of upstream project as of commit 1924ab8. Implement Ratcliff-Obershelp algorithm. Add missing SIFT4 example to README. Fix documentation issue #17.
1 parent b1b6399 commit e7f31aa

File tree

6 files changed

+306
-5
lines changed

6 files changed

+306
-5
lines changed
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
2-
<s:Boolean x:Key="/Default/UserDictionary/Words/=Levenshtein/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>
2+
<s:Boolean x:Key="/Default/UserDictionary/Words/=Levenshtein/@EntryIndexedValue">True</s:Boolean>
3+
<s:Boolean x:Key="/Default/UserDictionary/Words/=Obershelp/@EntryIndexedValue">True</s:Boolean>
4+
<s:Boolean x:Key="/Default/UserDictionary/Words/=Ratcliff/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>

README.md

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ A library implementing different string similarity and distance measures. A doze
2525
* [Cosine similarity](#shingle-n-gram-based-algorithms)
2626
* [Jaccard index](#shingle-n-gram-based-algorithms)
2727
* [Sorensen-Dice coefficient](#shingle-n-gram-based-algorithms)
28+
* [Ratcliff-Obershelp](#ratcliff-obershelp)
29+
* [Experimental](#experimental)
30+
* [SIFT4](#sift4)
2831
* [License](#license)
2932

3033
## Download
@@ -52,6 +55,7 @@ The main characteristics of each implemented algorithm are presented below. The
5255
| [Cosine similarity](#cosine-similarity) |similarity<br>distance | Yes | No | Profile | O(m+n) |
5356
| [Jaccard index](#jaccard-index) |similarity<br>distance | Yes | Yes | Set | O(m+n) |
5457
| [Sorensen-Dice coefficient](#sorensen-dice-coefficient) |similarity<br>distance | Yes | No | Set | O(m+n) |
58+
| [Ratcliff-Obershelp](#ratcliff-obershelp) |similarity<br>distance | Yes | No | | ? | |
5559

5660
[1] In this library, Levenshtein edit distance, LCS distance and their siblings are computed using the **dynamic programming** method, which has a cost of O(m.n). For Levenshtein distance, the algorithm is sometimes called **Wagner-Fischer algorithm** ("The string-to-string correction problem", 1974). The original algorithm uses a matrix of size m x n to store the Levenshtein distance between string prefixes.
5761

@@ -335,7 +339,7 @@ public class Program
335339
{
336340
public static void Main(string[] args)
337341
{
338-
// produces 0.416666
342+
// produces 0.583333
339343
var twogram = new NGram(2);
340344
Console.WriteLine(twogram.Distance("ABCD", "ABTUIO"));
341345

@@ -427,6 +431,64 @@ Similar to Jaccard index, but this time the similarity is computed as 2 * |V1 in
427431

428432
Distance is computed as 1 - cosine similarity.
429433

434+
## Ratcliff-Obershelp
435+
Ratcliff/Obershelp Pattern Recognition, also known as Gestalt Pattern Matching, is a string-matching algorithm for determining the similarity of two strings. It was developed in 1983 by John W. Ratcliff and John A. Obershelp and published in Dr. Dobb's Journal in July 1988.
436+
437+
Ratcliff/Obershelp computes the similarity between 2 strings, and the returned value lies in the interval [0.0, 1.0].
438+
439+
The distance is computed as 1 - Ratcliff/Obershelp similarity.
440+
441+
```cs
442+
using System;
443+
using F23.StringSimilarity;
444+
445+
public class Program
446+
{
447+
public static void Main(string[] args)
448+
{
449+
var ro = new RatcliffObershelp();
450+
451+
// substitution of s and t
452+
Console.WriteLine(ro.Similarity("My string", "My tsring"));
453+
454+
// substitution of s and n
455+
Console.WriteLine(ro.Similarity("My string", "My ntrisg"));
456+
}
457+
}
458+
```
459+
460+
will produce:
461+
462+
```
463+
0.8888888888888888
464+
0.7777777777777778
465+
```
466+
467+
## Experimental
468+
469+
### SIFT4
470+
SIFT4 is a general purpose string distance algorithm inspired by JaroWinkler and Longest Common Subsequence. It was developed to produce a distance measure that matches as closely as possible the human perception of string distance. Hence it takes into account elements like character substitution, character distance, longest common subsequence etc. It was developed using experimental testing, and without theoretical background.
471+
472+
```cs
473+
using System;
474+
using System.Diagnostics;
475+
using F23.StringSimilarity.Experimental;
476+
477+
public class Program
478+
{
479+
public static void Main(string[] args)
480+
{
481+
var s1 = "This is the first string";
482+
var s2 = "And this is another string";
483+
var sift4 = new Sift4();
484+
sift4.MaxOffset = 5;
485+
double expResult = 11.0;
486+
double result = sift4.Distance(s1, s2);
487+
Debug.Assert(Math.Abs(result - expResult) < 0.1);
488+
}
489+
}
490+
```
491+
430492
## License
431493

432494
This code is licensed under the MIT license.

src/F23.StringSimilarity/Experimental/Sift4.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ namespace F23.StringSimilarity.Experimental
3232
/// Sift4 - a general purpose string distance algorithm inspired by JaroWinkler
3333
/// and Longest Common Subsequence.
3434
/// Original JavaScript algorithm by siderite, java port by Nathan Fischer 2016.
35-
/// https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
35+
/// https://siderite.dev/blog/super-fast-and-accurate-string-distance.html
36+
/// https://blackdoor.github.io/blog/sift4-java/
3637
/// </summary>
3738
public class Sift4 : IStringDistance
3839
{
@@ -69,7 +70,8 @@ internal Offset(int c1, int c2, bool trans)
6970
/// Sift4 - a general purpose string distance algorithm inspired by JaroWinkler
7071
/// and Longest Common Subsequence.
7172
/// Original JavaScript algorithm by siderite, java port by Nathan Fischer 2016.
72-
/// https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
73+
/// https://siderite.dev/blog/super-fast-and-accurate-string-distance.html
74+
/// https://blackdoor.github.io/blog/sift4-java/
7375
/// </summary>
7476
/// <param name="s1"></param>
7577
/// <param name="s2"></param>
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using F23.StringSimilarity.Interfaces;
4+
5+
namespace F23.StringSimilarity
6+
{
7+
/// <summary>
8+
/// Ratcliff/Obershelp pattern recognition
9+
///
10+
/// The Ratcliff/Obershelp algorithm computes the similarity of two strings as
11+
/// the doubled number of matching characters divided by the total number of
12+
/// characters in the two strings. Matching characters are those in the longest
13+
/// common substring plus, recursively, matching characters in the unmatched
14+
/// region on either side of the longest common substring.
15+
/// The Ratcliff/Obershelp distance is computed as 1 - Ratcliff/Obershelp
16+
/// similarity.
17+
///
18+
/// Author: Ligi https://github.com/dxpux (as a patch for fuzzystring)
19+
/// Ported to java from .net by denmase
20+
/// Ported back to .NET by paulirwin to retain compatibility with upstream Java project
21+
/// </summary>
22+
public class RatcliffObershelp : INormalizedStringSimilarity, INormalizedStringDistance
23+
{
24+
/// <summary>
25+
/// Compute the Ratcliff-Obershelp similarity between strings.
26+
/// </summary>
27+
/// <param name="s1">The first string to compare.</param>
28+
/// <param name="s2">The second string to compare.</param>
29+
/// <returns>The RatcliffObershelp similarity in the range [0, 1]</returns>
30+
/// <exception cref="System.ArgumentNullException">If s1 or s2 is null.</exception>
31+
public double Similarity(string s1, string s2)
32+
{
33+
if (s1 == null)
34+
{
35+
throw new ArgumentNullException(nameof(s1), "s1 must not be null");
36+
}
37+
38+
if (s2 == null)
39+
{
40+
throw new ArgumentNullException(nameof(s2), "s2 must not be null");
41+
}
42+
43+
if (s1.Equals(s2))
44+
{
45+
return 1.0d;
46+
}
47+
48+
var matches = GetMatchList(s1, s2);
49+
int sumOfMatches = 0;
50+
51+
foreach (var match in matches)
52+
{
53+
sumOfMatches += match.Length;
54+
}
55+
56+
return 2.0d * sumOfMatches / (s1.Length + s2.Length);
57+
}
58+
59+
/// <summary>
60+
/// Return 1 - similarity.
61+
/// </summary>
62+
/// <param name="s1">The first string to compare.</param>
63+
/// <param name="s2">The second string to compare.</param>
64+
/// <returns>1 - similarity</returns>
65+
/// <exception cref="System.ArgumentNullException">If s1 or s2 is null.</exception>
66+
public double Distance(string s1, string s2)
67+
{
68+
return 1.0d - Similarity(s1, s2);
69+
}
70+
71+
private static IList<string> GetMatchList(string s1, string s2)
72+
{
73+
var list = new List<string>();
74+
var match = FrontMaxMatch(s1, s2);
75+
76+
if (match.Length > 0)
77+
{
78+
var frontSource = s1.Substring(0, s1.IndexOf(match, StringComparison.Ordinal));
79+
var frontTarget = s2.Substring(0, s2.IndexOf(match, StringComparison.Ordinal));
80+
var frontQueue = GetMatchList(frontSource, frontTarget);
81+
82+
var endSource = s1.Substring(s1.IndexOf(match, StringComparison.Ordinal) + match.Length);
83+
var endTarget = s2.Substring(s2.IndexOf(match, StringComparison.Ordinal) + match.Length);
84+
var endQueue = GetMatchList(endSource, endTarget);
85+
86+
list.Add(match);
87+
list.AddRange(frontQueue);
88+
list.AddRange(endQueue);
89+
}
90+
91+
return list;
92+
}
93+
94+
private static string FrontMaxMatch(string s1, string s2)
95+
{
96+
int longest = 0;
97+
var longestSubstring = "";
98+
99+
for (int i = 0; i < s1.Length; ++i)
100+
{
101+
for (int j = i + 1; j <= s1.Length; ++j)
102+
{
103+
var substring = s1.Substring(i, j - i);
104+
if (s2.Contains(substring) && substring.Length > longest)
105+
{
106+
longest = substring.Length;
107+
longestSubstring = substring;
108+
}
109+
}
110+
}
111+
112+
return longestSubstring;
113+
}
114+
}
115+
}

test/F23.StringSimilarity.Tests/CosineTest.cs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,32 @@ public async Task TestLargeString()
9999
public void TestDistance()
100100
{
101101
var instance = new Cosine();
102+
103+
double result = instance.Distance("ABC", "ABCE");
104+
Assert.Equal(0.29, result, 2);
105+
102106
NullEmptyTests.TestDistance(instance);
107+
}
108+
109+
[Fact]
110+
public void TestDistanceSmallString()
111+
{
112+
var instance = new Cosine(3);
113+
double result = instance.Distance("AB", "ABCE");
114+
Assert.Equal(1, result, 5);
115+
}
116+
117+
[Fact]
118+
public async Task TestDistanceLargeString()
119+
{
120+
var cos = new Cosine();
121+
122+
// read from 2 text files
123+
var string1 = await ReadResourceFileAsync("71816-2.txt");
124+
var string2 = await ReadResourceFileAsync("11328-1.txt");
125+
double similarity = cos.Distance(string1, string2);
103126

104-
// TODO: regular (non-null/empty) distance tests
127+
Assert.Equal(0.1885, similarity, 3);
105128
}
106129

107130
[Fact]
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
using F23.StringSimilarity.Tests.TestUtil;
2+
using Xunit;
3+
4+
namespace F23.StringSimilarity.Tests
5+
{
6+
public class RatcliffObershelpTest
7+
{
8+
[Fact]
9+
public void TestSimilarity()
10+
{
11+
var instance = new RatcliffObershelp();
12+
13+
// test data from other algorithms
14+
// "My string" vs "My tsring"
15+
// Substrings:
16+
// "ring" ==> 4, "My " ==> 3, "s" ==> 1
17+
// Ratcliff-Obershelp = 2*(sum of substrings)/(length of s1 + length of s2)
18+
// = 2*(4 + 3 + 1) / (9 + 9)
19+
// = 16/18
20+
// = 0.888888
21+
// NOTE.NET: actual result is 0.8888888888 repeating, but Xunit rounds to 0.888889.
22+
// Modified assertion from upstream Java code to reflect rounding difference between assertion APIs.
23+
Assert.Equal(
24+
0.888889,
25+
instance.Similarity("My string", "My tsring"),
26+
6);
27+
28+
// test data from other algorithms
29+
// "My string" vs "My ntrisg"
30+
// Substrings:
31+
// "My " ==> 3, "tri" ==> 3, "g" ==> 1
32+
// Ratcliff-Obershelp = 2*(sum of substrings)/(length of s1 + length of s2)
33+
// = 2*(3 + 3 + 1) / (9 + 9)
34+
// = 14/18
35+
// = 0.777778
36+
Assert.Equal(
37+
0.777778,
38+
instance.Similarity("My string", "My ntrisg"),
39+
6);
40+
41+
// test data from essay by Ilya Ilyankou
42+
// "Comparison of Jaro-Winkler and Ratcliff/Obershelp algorithms
43+
// in spell check"
44+
// https://ilyankou.files.wordpress.com/2015/06/ib-extended-essay.pdf
45+
// p13, expected result is 0.857
46+
Assert.Equal(
47+
0.857,
48+
instance.Similarity("MATEMATICA", "MATHEMATICS"),
49+
3);
50+
51+
// test data from stringmetric
52+
// https://github.com/rockymadden/stringmetric
53+
// expected output is 0.7368421052631579
54+
Assert.Equal(
55+
0.736842,
56+
instance.Similarity("aleksander", "alexandre"),
57+
6);
58+
59+
// test data from stringmetric
60+
// https://github.com/rockymadden/stringmetric
61+
// expected output is 0.6666666666666666
62+
// NOTE.NET: actual result is 0.6666666666 repeating, but Xunit rounds to 0.666667.
63+
// Modified assertion from upstream Java code to reflect rounding difference between assertion APIs.
64+
Assert.Equal(
65+
0.666667,
66+
instance.Similarity("pennsylvania", "pencilvaneya"),
67+
6);
68+
69+
// test data from wikipedia
70+
// https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching
71+
// expected output is 14/18 = 0.7777777777777778
72+
Assert.Equal(
73+
0.777778,
74+
instance.Similarity("WIKIMEDIA", "WIKIMANIA"),
75+
6);
76+
77+
// test data from wikipedia
78+
// https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching
79+
// expected output is 24/40 = 0.6
80+
Assert.Equal(
81+
0.6,
82+
instance.Similarity("GESTALT PATTERN MATCHING", "GESTALT PRACTICE"),
83+
6);
84+
85+
NullEmptyTests.TestSimilarity(instance);
86+
}
87+
88+
[Fact]
89+
public void TestDistance()
90+
{
91+
var instance = new RatcliffObershelp();
92+
NullEmptyTests.TestDistance(instance);
93+
94+
// TODO: regular (non-null/empty) distance tests
95+
}
96+
}
97+
}

0 commit comments

Comments
 (0)