-
Notifications
You must be signed in to change notification settings - Fork 16
Description
Thanks for your convenient extension. It may be really useful in many cases. But it gives the wrong output for Jaro-Winkler score.
The problem is in Jaro part: you can't count transpositions correctly running only one loop. There are missing transpositions in a case when strings are offset, has different length etc. I was banging my head in denial of the second for loop too. And there is no way to avoid it, presumably.
You may want to check this implementation: https://www.rosettacode.org/wiki/Jaro-Winkler_distance#Swift, however it looks like there is a problem in common prefix count.
Also you may want to look into these implementations, prefix similarity is treated correct there: https://www.geeksforgeeks.org/jaro-and-jaro-winkler-similarity/, but there aren't Swift example.
Comparison of my take on Jaro-Winkler score vs yours vs Rosetta Code's
import Foundation
extension String {
func similarity(to: String, with algorithm: StringSimilarityAlgorithm = .jaroWinkler) -> Double {
stringSimilarityScore(self, to: to, with: algorithm)
}
}
// Reimplementation
func stringSimilarityScore(_ from: String, to: String, with algo: StringSimilarityAlgorithm = .jaroWinkler) -> Double {
let fromLen = from.count
let toLen = to.count
guard fromLen != 0, toLen != 0 else {
if fromLen == toLen { return 1 }
return 0
}
let matchDistance: Int = {
let d = (max(fromLen, toLen)/2) - 1
if d == 0 { return 1 }
return d
}()
var matches = 0
var fromMatches = Array<Bool>(repeating: false, count: fromLen)
var toMatches = Array<Bool>(repeating: false, count: toLen)
from.enumerated().forEach{ ee in
let seekStartIdx = max(0, ee.offset - matchDistance)
let seekEndIdx = min(ee.offset + matchDistance, toLen - 1)
guard seekStartIdx <= seekEndIdx else {
return
}
for si in seekStartIdx...seekEndIdx {
if toMatches[si] { continue }
let ti = to.index(to.startIndex, offsetBy: si)
let tc = to[ti]
if tc == ee.element {
matches += 1
fromMatches[ee.offset] = true
toMatches[si] = true
break
}
}
}
var transpositions = 0
var seenToIdx = 0
for i in 0..<fromLen {
if fromMatches[i] == false {
continue
}
if seenToIdx >= toLen {
break
}
while toMatches[seenToIdx] == false {
seenToIdx += 1
}
let fromStrIdx = from.index(from.startIndex, offsetBy: i)
let fromChar = from[fromStrIdx]
let toStrIdx = to.index(to.startIndex, offsetBy: seenToIdx)
let toChar = to[toStrIdx]
if fromChar != toChar {
transpositions += 1
}
seenToIdx += 1
}
let jaro: Double = {
if matches == 0 { return 0 }
let matchesDouble = Double(matches)
let mDivFrom = matchesDouble / Double(fromLen)
let mDivTo = matchesDouble / Double(toLen)
let halfTransp = Double(transpositions) / 2
return (mDivFrom + mDivTo + (matchesDouble - halfTransp) / matchesDouble) / 3
}()
switch algo {
case .jaro:
return jaro
case .jaroWinkler:
let commonPrefix = min(4, from.commonPrefix(with: to).count)
return (jaro + Double(commonPrefix) * 0.1 * (1.0 - jaro))
}
}
enum StringSimilarityAlgorithm {
case jaro, jaroWinkler
}
// from https://github.com/autozimu/StringMetric.swift
extension String {
/// Get Jaro-Winkler distance.
///
/// (Score is normalized such that 0 equates to no similarity and 1 is an exact match).
///
/// Reference <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
/// - Parameter target: The target `String`.
/// - Returns: The Jaro-Winkler distance between the receiver and `target`.
public func distanceJaroWinkler(between target: String) -> Double {
var stringOne = self
var stringTwo = target
if stringOne.count > stringTwo.count {
stringTwo = self
stringOne = target
}
let stringOneCount = stringOne.count
let stringTwoCount = stringTwo.count
if stringOneCount == 0 && stringTwoCount == 0 {
return 1.0
}
let matchingDistance = stringTwoCount / 2
var matchingCharactersCount: Double = 0
var transpositionsCount: Double = 0
var previousPosition = -1
// Count matching characters and transpositions.
for (i, stringOneChar) in stringOne.enumerated() {
for (j, stringTwoChar) in stringTwo.enumerated() {
if max(0, i - matchingDistance)..<min(stringTwoCount, i + matchingDistance) ~= j {
if stringOneChar == stringTwoChar {
matchingCharactersCount += 1
if previousPosition != -1 && j < previousPosition {
transpositionsCount += 1
}
previousPosition = j
break
}
}
}
}
if matchingCharactersCount == 0.0 {
return 0.0
}
// Count common prefix (up to a maximum of 4 characters)
let commonPrefixCount = min(max(Double(self.commonPrefix(with: target).count), 0), 4)
let jaroSimilarity = (matchingCharactersCount / Double(stringOneCount) + matchingCharactersCount / Double(stringTwoCount) + (matchingCharactersCount - transpositionsCount) / matchingCharactersCount) / 3
// Default is 0.1, should never exceed 0.25 (otherwise similarity score could exceed 1.0)
let commonPrefixScalingFactor = 0.1
return jaroSimilarity + commonPrefixCount * commonPrefixScalingFactor * (1 - jaroSimilarity)
}
}
// From rosettacode.org
func jaroWinklerDistance(string1: String, string2: String) -> Double {
var st1 = Array(string1)
var st2 = Array(string2)
var len1 = st1.count
var len2 = st2.count
if len1 < len2 {
swap(&st1, &st2)
swap(&len1, &len2)
}
if len2 == 0 {
return len1 == 0 ? 0.0 : 1.0
}
let delta = max(1, len1 / 2) - 1
var flag = Array(repeating: false, count: len2)
var ch1Match: [Character] = []
ch1Match.reserveCapacity(len1)
for idx1 in 0..<len1 {
let ch1 = st1[idx1]
for idx2 in 0..<len2 {
let ch2 = st2[idx2]
if idx2 <= idx1 + delta && idx2 + delta >= idx1 && ch1 == ch2 && !flag[idx2] {
flag[idx2] = true
ch1Match.append(ch1)
break
}
}
}
let matches = ch1Match.count
if matches == 0 {
return 1.0
}
var transpositions = 0
var idx1 = 0
for idx2 in 0..<len2 {
if flag[idx2] {
if st2[idx2] != ch1Match[idx1] {
transpositions += 1
}
idx1 += 1
}
}
let m = Double(matches)
let jaro =
(m / Double(len1) + m / Double(len2) + (m - Double(transpositions) / 2.0) / m) / 3.0
var commonPrefix = 0
for i in 0..<min(4, len2) {
if st1[i] == st2[i] {
commonPrefix += 1
}
}
return 1.0 - (jaro + Double(commonPrefix) * 0.1 * (1.0 - jaro))
}
extension String {
func rosettaJaroWinkler(to: String) -> Double {
return 1 - jaroWinklerDistance(string1: self, string2: to)
}
}
print("illabo")
print("transpositions".similarity(to: "sun transitions"))
print("transpositions".similarity(to: "trance positions"))
print("transpositions".similarity(to: "post transitions"))
print("transpositions".similarity(to: "disposition"))
print("transpositions".similarity(to: "transpositiosn"))
print("transpositions".similarity(to: "tranpsositions"))
print("kitten".similarity(to: "sitting"))
print("君子和而不同".similarity(to: "小人同而不和"))
print("accomodate".similarity(to: "accommodate"))
print("accomodate".similarity(to: "accommodated"))
print("accomodate".similarity(to: "accommodates"))
print("accomodate".similarity(to: "accommodating"))
print("accomodate".similarity(to: "accommodation"))
print("CRATE".similarity(to: "TRACE"))
print("TRATE".similarity(to: "TRACE"))
print("DwAyNE".similarity(to: "DuANE"))
print("autozimu")
print("transpositions".distanceJaroWinkler(between: "sun transitions"))
print("transpositions".distanceJaroWinkler(between: "trance positions"))
print("transpositions".distanceJaroWinkler(between: "post transitions"))
print("transpositions".distanceJaroWinkler(between: "disposition"))
print("transpositions".distanceJaroWinkler(between: "transpositiosn"))
print("transpositions".distanceJaroWinkler(between: "tranpsositions"))
print("kitten".distanceJaroWinkler(between: "sitting"))
print("君子和而不同".distanceJaroWinkler(between: "小人同而不和"))
print("accomodate".distanceJaroWinkler(between: "accommodate"))
print("accomodate".distanceJaroWinkler(between: "accommodated"))
print("accomodate".distanceJaroWinkler(between: "accommodates"))
print("accomodate".distanceJaroWinkler(between: "accommodating"))
print("accomodate".distanceJaroWinkler(between: "accommodation"))
print("CRATE".distanceJaroWinkler(between: "TRACE"))
print("TRATE".distanceJaroWinkler(between: "TRACE"))
print("DwAyNE".distanceJaroWinkler(between: "DuANE"))
print("rosettacode.org")
print("transpositions".rosettaJaroWinkler(to: "sun transitions"))
print("transpositions".rosettaJaroWinkler(to: "trance positions"))
print("transpositions".rosettaJaroWinkler(to: "post transitions"))
print("transpositions".rosettaJaroWinkler(to: "disposition"))
print("transpositions".rosettaJaroWinkler(to: "transpositiosn"))
print("transpositions".rosettaJaroWinkler(to: "tranpsositions"))
print("kitten".rosettaJaroWinkler(to: "sitting"))
print("君子和而不同".rosettaJaroWinkler(to: "小人同而不和"))
print("accomodate".rosettaJaroWinkler(to: "accommodate"))
print("accomodate".rosettaJaroWinkler(to: "accommodated"))
print("accomodate".rosettaJaroWinkler(to: "accommodates"))
print("accomodate".rosettaJaroWinkler(to: "accommodating"))
print("accomodate".rosettaJaroWinkler(to: "accommodation"))
print("CRATE".rosettaJaroWinkler(to: "TRACE"))
print("TRATE".rosettaJaroWinkler(to: "TRACE"))
print("DwAyNE".rosettaJaroWinkler(to: "DuANE"))