|
3 | 3 | /// Functions to compute the edit distance between two strings |
4 | 4 | module internal Internal.Utilities.EditDistance |
5 | 5 |
|
6 | | -/// Given an offset and a radius from that offset, |
7 | | -/// does mChar exist in that part of str? |
8 | | -let inline existsInWin (mChar: char) (str: string) offset rad = |
9 | | - let startAt = max 0 (offset - rad) |
10 | | - let endAt = min (offset + rad) (String.length str - 1) |
| 6 | +open System |
| 7 | +open System.Collections.Generic |
| 8 | + |
| 9 | +/// Given an offset and a radius from that offset, does mChar exist in that part of str? |
| 10 | +let inline existsInWin (mChar: char) (str: string) (offset: int) (rad: int) = |
| 11 | + let startAt = Math.Max(0, offset - rad) |
| 12 | + let endAt = Math.Min(offset + rad, str.Length - 1) |
11 | 13 | if endAt - startAt < 0 then false |
12 | 14 | else |
13 | 15 | let rec exists index = |
14 | 16 | if str.[index] = mChar then true |
15 | 17 | elif index = endAt then false |
16 | 18 | else exists (index + 1) |
17 | 19 | exists startAt |
18 | | - |
19 | | -/// The jaro distance between s1 and s2 |
20 | | -let jaro s1 s2 = |
21 | | - // The radius is half of the lesser |
22 | | - // of the two string lengths rounded up. |
| 20 | + |
| 21 | +let jaro (s1: string) (s2: string) = |
| 22 | + // The radius is half of the lesser of the two string lengths rounded up. |
23 | 23 | let matchRadius = |
24 | | - let minLen = |
25 | | - min (String.length s1) (String.length s2) in |
26 | | - minLen / 2 + minLen % 2 |
27 | | - |
| 24 | + let minLen = Math.Min(s1.Length, s2.Length) |
| 25 | + minLen / 2 + minLen % 2 |
| 26 | + |
28 | 27 | // An inner function which collects the characters of chars1
29 | 28 | // that also occur in chars2 within the match radius of their index.
30 | 29 | let commonChars (chars1: string) (chars2: string) = |
31 | | - let rec inner i result = |
32 | | - match i with |
33 | | - | -1 -> result |
34 | | - | _ -> if existsInWin chars1.[i] chars2 i matchRadius |
35 | | - then inner (i - 1) (chars1.[i] :: result) |
36 | | - else inner (i - 1) result |
37 | | - inner (chars1.Length - 1) [] |
38 | | - |
| 30 | + let result = ResizeArray(chars1.Length) |
| 31 | + for i = 0 to chars1.Length - 1 do |
| 32 | + let c = chars1.[i] |
| 33 | + if existsInWin c chars2 i matchRadius then |
| 34 | + result.Add c |
| 35 | + result |
| 36 | + |
39 | 37 | // The sets of common characters and their lengths as floats |
40 | 38 | let c1 = commonChars s1 s2 |
41 | 39 | let c2 = commonChars s2 s1 |
42 | | - let c1length = float (List.length c1) |
43 | | - let c2length = float (List.length c2) |
44 | | - |
45 | | - // The number of transpositions within |
46 | | - // the sets of common characters. |
47 | | - let transpositions = |
48 | | - let rec inner cl1 cl2 result = |
49 | | - match cl1, cl2 with |
50 | | - | [], _ | _, [] -> result |
51 | | - | c1h :: c1t, c2h :: c2t -> |
52 | | - if c1h <> c2h |
53 | | - then inner c1t c2t (result + 1.0) |
54 | | - else inner c1t c2t result |
55 | | - let mismatches = inner c1 c2 0.0 |
| 40 | + let c1length = float c1.Count |
| 41 | + let c2length = float c2.Count |
| 42 | + |
| 43 | + // The number of transpositions within the sets of common characters. |
| 44 | + let transpositions = |
| 45 | + let mutable mismatches = 0.0 |
| 46 | + for i = 0 to (Math.Min(c1.Count, c2.Count)) - 1 do |
| 47 | + if c1.[i] <> c2.[i] then |
| 48 | + mismatches <- mismatches + 1.0 |
| 49 | + |
56 | 50 | // If one common string is longer than the other |
57 | 51 | // each additional char counts as half a transposition |
58 | 52 | (mismatches + abs (c1length - c2length)) / 2.0 |
59 | | - |
60 | | - let s1length = float (String.length s1) |
61 | | - let s2length = float (String.length s2) |
62 | | - let tLength = max c1length c2length |
63 | | - |
64 | | - // The jaro distance as given by |
65 | | - // 1/3 ( m2/|s1| + m1/|s2| + (mc-t)/mc ) |
66 | | - let result = (c1length / s1length + |
67 | | - c2length / s2length + |
68 | | - (tLength - transpositions) / tLength) |
69 | | - / 3.0 |
70 | | - |
| 53 | + |
| 54 | + let tLength = Math.Max(c1length, c2length) |
| 55 | + |
| 56 | + // The jaro distance as given by 1/3 ( m2/|s1| + m1/|s2| + (mc-t)/mc ) |
| 57 | + let result = (c1length / float s1.Length + c2length / float s2.Length + (tLength - transpositions) / tLength) / 3.0 |
| 58 | + |
71 | 59 | // This is for cases where |s1|, |s2| or m are zero |
72 | | - if System.Double.IsNaN result then 0.0 else result |
| 60 | + if Double.IsNaN result then 0.0 else result |
73 | 61 |
|
74 | 62 | /// Calculates the Jaro-Winkler edit distance between two strings. |
75 | 63 | /// The edit distance is a metric that measures the amount of similarity between two strings.
|
0 commit comments