Skip to content

Commit 407da95

Browse files
committed
jaro
1 parent 0a2e099 commit 407da95

File tree

2 files changed

+67
-0
lines changed

2 files changed

+67
-0
lines changed

spec/string-metrics_spec.cr

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,4 +57,18 @@ describe "StringMetrics" do
5757
it "empty strings damerau" do
# Both arguments are empty strings; the expected result is 0.
5858
StringMetrics.damerau_levenshtein("", "").should eq(0)
5959
end
60+
61+
it "jaro basic test" do
# Each score is rounded to two decimal places before being compared with the expected value.
62+
StringMetrics.jaro("MARTHA", "MARHTA").round(2).should eq(0.94)
63+
StringMetrics.jaro("DIXON", "DICKSONX").round(2).should eq(0.77)
64+
StringMetrics.jaro("JELLYFISH", "SMELLYFISH").round(2).should eq(0.90)
65+
end
66+
67+
it "jaro one empty" do
# Only the first argument is empty; the expected score is 0.
68+
StringMetrics.jaro("", "MARHTA").should eq(0)
69+
end
70+
71+
it "jaro both empty" do
# Both arguments are empty; the expected score is 1.
72+
StringMetrics.jaro("", "").should eq(1)
73+
end
6074
end

src/string-metrics.cr

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,57 @@ module StringMetrics
8888
score[s1.size + 1][s2.size + 1]
8989
end
9090

91+
# ### Jaro distance/similarity
92+
# A measure of similarity between two strings, computed from the characters they share and how many of those are transposed; higher scores mean more similar strings.
93+
# Based off https://rosettacode.org/wiki/Jaro_distance#Python
94+
def self.jaro(s1 : String, s2 : String)
  # Returns the Jaro similarity of `s1` and `s2` as a Float64:
  # 1.0 when both strings are empty, 0.0 when exactly one is empty or when no
  # characters match, otherwise
  #   (m / |s1| + m / |s2| + (m - t / 2) / m) / 3
  # where m is the number of matched characters and t the number of matched
  # characters that appear in a different order in the two strings.
  s1_chars = s1.chars
  s2_chars = s2.chars
  s1_size = s1_chars.size
  s2_size = s2_chars.size

  return 1.0 if s1_size == 0 && s2_size == 0
  return 0.0 if s1_size == 0 || s2_size == 0

  # Characters only count as matching when their positions differ by at most
  # `match_distance`. The shift is an integer halving that stays an integer on
  # every Crystal version (`/` is float division on newer releases). The clamp
  # keeps the window from going negative when the longer string has a single
  # character; without it "A" vs "A" would score 0.
  longest = {s1_size, s2_size}.max
  match_distance = {(longest >> 1) - 1, 0}.max

  s1_matches = Array.new(s1_size, false)
  s2_matches = Array.new(s2_size, false)
  matches = 0

  # Greedily pair each character of s1 with the first unused equal character
  # of s2 that lies inside the window.
  s1_chars.each_with_index do |char, i|
    start = {i - match_distance, 0}.max
    ending = {i + match_distance + 1, s2_size}.min

    (start...ending).each do |j|
      next if s2_matches[j] || char != s2_chars[j]
      s1_matches[i] = true
      s2_matches[j] = true
      matches += 1
      break
    end
  end

  return 0.0 if matches == 0

  # Walk the matched characters of both strings in order; every position where
  # they disagree counts as one (half) transposition.
  transpositions = 0
  k = 0
  s1_chars.each_with_index do |char, i|
    next unless s1_matches[i]

    # Both strings hold the same number of matched characters, so `k` always
    # finds the next matched slot of s2 before running off the end.
    while !s2_matches[k]
      k += 1
    end

    transpositions += 1 if char != s2_chars[k]
    k += 1
  end

  m = matches.to_f
  (m / s1_size + m / s2_size + (m - transpositions / 2.0) / m) / 3
end
140+
141+
91142
end
143+
144+

0 commit comments

Comments
 (0)