@@ -88,15 +88,14 @@ module StringMetrics
8888 score[s1.size + 1 ][s2.size + 1 ]
8989 end
9090
91- # ### Jaro distance/similarity
92- # A measure of similarity between two strings based on matching characters
93- # Based off https://rosettacode.org/wiki/Jaro_distance#Python
94- def self.jaro (s1 : String , s2 : String )
91+ # Shared implementation behind `jaro` and `jaro_winkler`, based on https://rosettacode.org/wiki/Jaro_distance#Python and
92+ # https://github.com/jamesturk/jellyfish/blob/master/jellyfish/_jellyfish.py
93+ private def self.reused_jaro_winkler (s1 : String , s2 : String , winkler = true , scaling_factor = 0.1 )
9594 s1_size = s1.size
9695 s2_size = s2.size
9796 return 1 if [s1_size, s2_size].all? { |i | i == 0 }
98-
99- match_distance = {s1_size, s2_size}.max / 2 - 1
97+ max_len = {s1_size, s2_size}.max
98+ match_distance = max_len / 2 - 1
10099
101100 s1_matches = [false ] * s1_size
102101 s2_matches = [false ] * s2_size
@@ -135,10 +134,34 @@ module StringMetrics
135134 end
136135 k += 1
137136 end
138- ((matches.fdiv s1_size) + (matches.fdiv s2_size) + ((matches - transpositions.fdiv 2 ).fdiv matches)).fdiv 3
137+ weight = ((matches.fdiv s1_size) + (matches.fdiv s2_size) + ((matches - transpositions.fdiv 2 ).fdiv matches)).fdiv 3
138+
139+ if winkler && weight > 0.7 && s1_size > 3 && s2_size > 3
140+ j = {max_len, 4 }.min
141+ i = 0
142+ while i < j && s1_chars[i] == s2_chars[i]
143+ i += 1
144+ end
145+ if i > 0
146+ weight += i * scaling_factor * (1.0 - weight)
147+ end
148+ end
149+ weight
139150 end
140151
# ### Jaro distance
# Scores how alike two strings are from the characters they have in common.
# The score runs from 0 (no similarity) to 1 (exact match).
#
# Delegates to the shared Jaro/Jaro-Winkler routine with the Winkler prefix
# adjustment switched off.
def self.jaro(s1 : String, s2 : String)
  reused_jaro_winkler(s1, s2, winkler: false)
end
141158
142- end
# ### Jaro Winkler distance
# A variant of Jaro that rewards strings agreeing from their first character.
#
# The shared routine only applies the bonus when the plain Jaro score is above
# 0.7 and both strings are longer than 3 characters. The bonus is
# `prefix * scaling_factor * (1 - score)`, where `prefix` is the number of
# leading characters that match, capped at 4.
#
# NOTE(review): with the 4-character cap, a `scaling_factor` above 0.25 could
# push the result past 1 — confirm whether callers rely on a 0..1 range.
def self.jaro_winkler(s1 : String, s2 : String, scaling_factor = 0.1)
  reused_jaro_winkler(s1, s2, true, scaling_factor)
end
143165
144166
167+ end
0 commit comments