|
1 | 1 | module Text.Fuzzy.Levenshtein where
|
2 | 2 |
|
3 |
| -import Data.Function (fix) |
4 | 3 | import Data.List (sortOn)
|
5 |
| -import Data.MemoTrie |
| 4 | +import Data.Text (Text) |
6 | 5 | import qualified Data.Text as T
|
7 |
| -import qualified Data.Text.Array as T |
8 |
| -import Data.Text.Internal (Text (..)) |
| 6 | +import Text.EditDistance |
9 | 7 | import Text.Fuzzy.Parallel
|
10 | 8 |
|
11 |
| --- | Same caveats apply w.r.t. ASCII as in 'Text.Fuzzy.Parallel'. |
12 |
| --- Might be worth optimizing this at some point, but it's good enoughᵗᵐ for now |
13 |
| -levenshtein :: Text -> Text -> Int |
14 |
| -levenshtein a b | T.null a = T.length b |
15 |
| -levenshtein a b | T.null b = T.length a |
16 |
| -levenshtein (Text aBuf aOff aLen) (Text bBuf bOff bLen) = do |
17 |
| - let aTot = aOff + aLen |
18 |
| - bTot = bOff + bLen |
19 |
| - go' _ (!aIx, !bIx) | aIx >= aTot || bIx >= bTot = max (aTot - aIx) (bTot - bIx) |
20 |
| - go' f (!aIx, !bIx) | T.unsafeIndex aBuf aIx == T.unsafeIndex bBuf bIx = f (aIx + 1, bIx + 1) |
21 |
| - go' f (!aIx, !bIx) = |
22 |
| - minimum |
23 |
| - [ 2 + f (aIx + 1, bIx + 1), -- Give substitutions a heavier cost, so multiple typos cost more |
24 |
| - 1 + f (aIx + 1, bIx), |
25 |
| - 1 + f (aIx, bIx + 1) |
26 |
| - ] |
27 |
| - go = fix (memo . go') |
28 |
| - go (aOff, bOff) |
29 |
| - |
30 | 9 | -- | Sort the given list according to it's levenshtein distance relative to the
|
31 | 10 | -- given string.
|
32 | 11 | levenshteinScored :: Int -> Text -> [Text] -> [Scored Text]
|
33 |
| -levenshteinScored chunkSize needle haystack = |
| 12 | +levenshteinScored chunkSize needle haystack = do |
| 13 | + let levenshtein = levenshteinDistance $ defaultEditCosts {substitutionCosts=ConstantCost 2} |
34 | 14 | sortOn score $
|
35 | 15 | matchPar chunkSize needle haystack id $
|
36 |
| - \a b -> Just $ levenshtein a b |
| 16 | + \a b -> Just $ levenshtein (T.unpack a) (T.unpack b) |
0 commit comments