Skip to content

Commit cd48234

Browse files
committed
jaro winkler
1 parent 407da95 commit cd48234

File tree

3 files changed

+48
-9
lines changed

3 files changed

+48
-9
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
# string-metrics
22

3-
Ports some popular string algorithms for Crystal:
3+
String metrics and phonetic algorithms for Crystal:
44
* [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
55
* [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
66
* [Damerau–Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
7+
* [Jaro(-Winkler) distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
8+
79

810
## Installation
911

spec/string-metrics_spec.cr

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,4 +71,18 @@ describe "StringMetrics" do
7171
it "jaro both empty" do
7272
StringMetrics.jaro("", "").should eq(1)
7373
end
74+
75+
it "jaro winkler basic test" do
76+
StringMetrics.jaro_winkler("MARTHA", "MARHTA").round(2).should eq(0.96)
77+
StringMetrics.jaro_winkler("DIXON", "DICKSONX").round(2).should eq(0.81)
78+
end
79+
80+
it "jaro winkler one empty" do
81+
StringMetrics.jaro_winkler("", "MARHTA").should eq(0)
82+
end
83+
84+
it "jaro winkler both empty" do
85+
StringMetrics.jaro_winkler("", "").should eq(1)
86+
end
87+
7488
end

src/string-metrics.cr

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,15 +88,14 @@ module StringMetrics
8888
score[s1.size + 1][s2.size + 1]
8989
end
9090

91-
# ### Jaro distance/similarity
92-
# A measure of similarity between two strings based on matching characters
93-
# Based off https://rosettacode.org/wiki/Jaro_distance#Python
94-
def self.jaro(s1 : String, s2 : String)
91+
# Based on https://rosettacode.org/wiki/Jaro_distance#Python and
92+
# https://github.com/jamesturk/jellyfish/blob/master/jellyfish/_jellyfish.py
93+
private def self.reused_jaro_winkler(s1 : String, s2 : String, winkler = true, scaling_factor = 0.1)
9594
s1_size = s1.size
9695
s2_size = s2.size
9796
return 1 if [s1_size, s2_size].all? { |i| i == 0 }
98-
99-
match_distance = {s1_size, s2_size}.max / 2 - 1
97+
max_len = {s1_size, s2_size}.max
98+
match_distance = max_len / 2 - 1
10099

101100
s1_matches = [false] * s1_size
102101
s2_matches = [false] * s2_size
@@ -135,10 +134,34 @@ module StringMetrics
135134
end
136135
k += 1
137136
end
138-
((matches.fdiv s1_size) + (matches.fdiv s2_size) + ((matches - transpositions.fdiv 2 ).fdiv matches)).fdiv 3
137+
weight = ((matches.fdiv s1_size) + (matches.fdiv s2_size) + ((matches - transpositions.fdiv 2 ).fdiv matches)).fdiv 3
138+
139+
if winkler && weight > 0.7 && s1_size > 3 && s2_size > 3
140+
j = {max_len, 4}.min
141+
i = 0
142+
while i < j && s1_chars[i] == s2_chars[i]
143+
i += 1
144+
end
145+
if i > 0
146+
weight += i * scaling_factor * (1.0 - weight)
147+
end
148+
end
149+
weight
139150
end
140151

152+
# ### Jaro distance
153+
# A measure of similarity between two strings based on matching characters.
154+
# A score of 0 means no similarity, while 1 means an exact match.
155+
def self.jaro(s1 : String, s2 : String)
156+
reused_jaro_winkler(s1, s2, false)
157+
end
141158

142-
end
159+
# ### Jaro Winkler distance
160+
# Similar to regular Jaro, but gives a higher score to strings that share a
161+
# common prefix (matching characters at the beginning of both strings).
162+
def self.jaro_winkler(s1 : String, s2 : String, scaling_factor = 0.1)
163+
reused_jaro_winkler(s1, s2, scaling_factor: scaling_factor)
164+
end
143165

144166

167+
end

0 commit comments

Comments
 (0)