File tree Expand file tree Collapse file tree 2 files changed +67
-0
lines changed
Expand file tree Collapse file tree 2 files changed +67
-0
lines changed Original file line number Diff line number Diff line change @@ -57,4 +57,18 @@ describe "StringMetrics" do
5757 it " empty strings damerau" do
5858 StringMetrics .damerau_levenshtein(" " , " " ).should eq(0 )
5959 end
60+
61+ it " jaro basic test" do
62+ StringMetrics .jaro(" MARTHA" , " MARHTA" ).round(2 ).should eq(0.94 )
63+ StringMetrics .jaro(" DIXON" , " DICKSONX" ).round(2 ).should eq(0.77 )
64+ StringMetrics .jaro(" JELLYFISH" , " SMELLYFISH" ).round(2 ).should eq(0.90 )
65+ end
66+
67+ it " jaro one empty" do
68+ StringMetrics .jaro(" " , " MARHTA" ).should eq(0 )
69+ end
70+
71+ it " jaro both empty" do
72+ StringMetrics .jaro(" " , " " ).should eq(1 )
73+ end
6074end
Original file line number Diff line number Diff line change @@ -88,4 +88,57 @@ module StringMetrics
8888 score[s1.size + 1 ][s2.size + 1 ]
8989 end
9090
# ### Jaro similarity
# Scores how alike two strings are, based on the characters they share and
# how many of those shared characters appear in a different order.
# Based off https://rosettacode.org/wiki/Jaro_distance#Python
#
# * `s1`, `s2` - the strings to compare.
#
# Returns a `Float64` between 0.0 (no matching characters) and 1.0
# (identical strings). Two empty strings are treated as identical (1.0);
# an empty string compared with a non-empty one scores 0.0.
def self.jaro(s1 : String, s2 : String) : Float64
  s1_chars = s1.chars
  s2_chars = s2.chars
  s1_size = s1_chars.size
  s2_size = s2_chars.size

  return 1.0 if s1_size == 0 && s2_size == 0

  # Two equal characters only count as a match when their positions differ
  # by at most this much. Floor division keeps the value an integer, and the
  # clamp stops the window from collapsing to nothing (distance -1) when the
  # longer string has a single character, so jaro("A", "A") is 1.0.
  match_distance = Math.max({s1_size, s2_size}.max // 2 - 1, 0)

  s1_matches = Array.new(s1_size, false)
  s2_matches = Array.new(s2_size, false)
  matches = 0

  # Pass 1: greedily pair each character of s1 with the first unused equal
  # character of s2 that lies inside the match window.
  s1_size.times do |i|
    window_start = Math.max(i - match_distance, 0)
    window_end = Math.min(i + match_distance + 1, s2_size)

    (window_start...window_end).each do |j|
      next if s2_matches[j]
      next if s1_chars[i] != s2_chars[j]

      s1_matches[i] = true
      s2_matches[j] = true
      matches += 1
      break
    end
  end

  return 0.0 if matches == 0

  # Pass 2: walk the matched characters of both strings in order and count
  # the positions where they disagree. Each transposition shows up as two
  # such disagreements, hence the division by 2 in the final formula.
  transpositions = 0
  k = 0
  s1_size.times do |i|
    next unless s1_matches[i]

    # Advance to the next matched character of s2. Both flag arrays hold
    # exactly `matches` true entries, so k never runs past the end.
    while !s2_matches[k]
      k += 1
    end

    transpositions += 1 if s1_chars[i] != s2_chars[k]
    k += 1
  end

  in_order_matches = matches - transpositions.fdiv(2)
  (matches.fdiv(s1_size) + matches.fdiv(s2_size) + in_order_matches.fdiv(matches)).fdiv(3)
end
140+
141+
91142end
143+
144+
You can’t perform that action at this time.
0 commit comments