NickCrews · jstammers · Oct 8, 2024 · Oct 16, 2024 · Oct 16, 2024 · Oct 16, 2024
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
@@ -1,6 +1,7 @@
 * [Home](index.md)
 * Examples
     * [Patent Deduplication](examples/patent_deduplication.ipynb)
+    * [String Comparison Methods](examples/string_comparisons.ipynb)
 * Concepts
     * [Goals and Alternatives](concepts/goals_and_alternatives.md)
     * [Fellegi-Sunter Model](concepts/fs.md)

diff --git a/docs/examples/string_comparisons.ipynb b/docs/examples/string_comparisons.ipynb
diff --git a/mismo/eda/__init__.py b/mismo/eda/__init__.py
@@ -4,3 +4,7 @@
 
 from mismo.eda._plot import distribution_chart as distribution_chart
 from mismo.eda._plot import distribution_dashboard as distribution_dashboard
+from mismo.eda._string import (
+    string_comparator_score_chart as string_comparator_score_chart,
+)
+from mismo.eda._string import string_comparator_scores as string_comparator_scores
diff --git a/mismo/eda/_string.py b/mismo/eda/_string.py
@@ -0,0 +1,168 @@
+from __future__ import annotations
+
+import altair as alt
+from ibis import _
+from ibis import selectors as s
+from ibis.expr import types as ir
+
+from mismo.text import (
+    damerau_levenshtein,
+    damerau_levenshtein_ratio,
+    jaccard,
+    jaro_similarity,
+    jaro_winkler_similarity,
+    levenshtein,
+    levenshtein_ratio,
+)
+
+
+def string_comparator_scores(table: ir.Table, col1: str, col2: str) -> ir.Table:
+    """Create a table of string comparison measures between two columns.
+
+    This calculates the following similarity measures which range between 0 and 1:
+    - The Jaro similarity
+    - The Jaro-Winkler similarity
+    - The Levenshtein ratio
+    - The Damerau-Levenshtein ratio
+
+    as well as the following edit distances:
+    - The Levenshtein distance
+    - The Damerau-Levenshtein distance
+
+
+    Parameters
+    ----------
+
+    table : ir.Table
+        An ibis table containing string columns.
+    col1: str
+        The name of the first column.
+    col2: str
+        The name of the second column.
+
+    Returns
+    -------
+    A table of string comparison measures between two columns.
+
+    Examples
+    --------
+
+    >>> import ibis
+    >>> from mismo.eda import string_comparator_scores
+    >>> ibis.options.interactive = True
+    >>> table = ibis.memtable({"string1": ["foo", "bar", "fizz"],
+    ... "string2": ["foo", "bam", "fizz buzz"]})
+    >>> string_comparator_scores(table, col1="string1", col2="string2")
+    ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━┓
+    ┃ string1 ┃ string2   ┃ jaro_similarity ┃ jaro_winkler_similarity ┃ … ┃
+    ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━┩
+    │ string  │ string    │ float64         │ float64                 │ … │
+    ├─────────┼───────────┼─────────────────┼─────────────────────────┼───┤
+    │ foo     │ foo       │        1.000000 │                1.000000 │ … │
+    │ bar     │ bam       │        0.777778 │                0.822222 │ … │
+    │ fizz    │ fizz buzz │        0.814815 │                0.888889 │ … │
+    └─────────┴───────────┴─────────────────┴─────────────────────────┴───┘
+    """
+    comp_table = (
+        table.select(_[col1].name("string1"), _[col2].name("string2"))
+        .mutate(
+            jaro_similarity=jaro_similarity(_.string1, _.string2),
+            jaro_winkler_similarity=jaro_winkler_similarity(_.string1, _.string2),
+            jaccard_similarity=jaccard(_.string1, _.string2),
+            levenshtein_ratio=levenshtein_ratio(_.string1, _.string2),
+            damerau_levenshtein_ratio=damerau_levenshtein_ratio(_.string1, _.string2),
+            levenshtein_distance=levenshtein(_.string1, _.string2),
+            damerau_levenshtein_distance=damerau_levenshtein(_.string1, _.string2),
+        )
+        .cache()
+    )
+
+    return comp_table
+
+
+def string_comparator_score_chart(table: ir.Table, col1: str, col2: str) -> alt.Chart:
+    """Create a heatmap of string comparison measures between two columns.
+
+    Examples
+    --------
+
+    >>> import ibis
+    >>> from mismo.eda import string_comparator_score_chart
+    >>> table = ibis.memtable({"string1": ["foo", "bar", "fizz"],
+    ... "string2": ["foo", "bam", "fizz buzz"]})
+    >>> string_comparator_score_chart(table, col1="string1", col2="string2")
+    alt.Chart(...)
+    """
+
+    comp_table = string_comparator_scores(table, col1, col2).mutate(
+        strings_to_compare=_.string1.concat(", ", _.string2)
+    )
+    similarity_records = (
+        comp_table.select(
+            "strings_to_compare", s.contains("similarity") | s.contains("ratio")
+        )
+        .pivot_longer(
+            ~s.cols("strings_to_compare"), names_to="comparator", values_to="value"
+        )
+        .mutate(
+            comparator=_.comparator.re_replace("(_similarity)|(_ratio)", ""),
+        )
+    )
+    distance_records = (
+        comp_table.select("strings_to_compare", s.contains("distance"))
+        .pivot_longer(
+            ~s.cols("strings_to_compare"), names_to="comparator", values_to="value"
+        )
+        .mutate(
+            comparator=_.comparator.re_replace("_distance", ""),
+        )
+    )
+    base = (
+        alt.Chart(similarity_records, title="Similarity")
+        .mark_rect()
+        .encode(
+            y=alt.Text(
+                "strings_to_compare:O",
+                title="String comparison",
+            ),
+            x=alt.Text("comparator:O", title=None),
+            color=alt.Color("value:Q", legend=None, scale=alt.Scale(domain=(0, 1))),
+        )
+    )
+
+    text = base.mark_text().encode(
+        alt.Text("value:Q", format=".2f"),
+        color=alt.value("black"),
+    )
+
+    distance_base = (
+        alt.Chart(distance_records, title="Distance")
+        .mark_rect()
+        .encode(
+            y=alt.Text("strings_to_compare:O", axis=None),
+            x=alt.Text("comparator:O", title=None),
+            color=alt.Color("value:Q", legend=None).scale(
+                scheme="yelloworangered", reverse=True
+            ),
+        )
+    )
+
+    distance_text = distance_base.mark_text().encode(
+        alt.Text("value:Q", format=".2f"),
+        color=alt.value("black"),
+    )
+    chart = alt.hconcat(
+        base + text,
+        distance_base + distance_text,
+        title=alt.Title(text="Heatmaps of string comparison metrics", anchor="middle"),
+        config=alt.Config(
+            view=alt.ViewConfig(discreteHeight={"step": 30}, discreteWidth={"step": 40})
+        ),
+    ).resolve_scale(color="independent", size="independent")
+    return chart
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
diff --git a/mismo/text/__init__.py b/mismo/text/__init__.py
@@ -3,10 +3,15 @@
 from __future__ import annotations
 
 from mismo.text._features import ngrams as ngrams
+from mismo.text._features import tokenize as tokenize
 from mismo.text._similarity import damerau_levenshtein as damerau_levenshtein
 from mismo.text._similarity import (
     damerau_levenshtein_ratio as damerau_levenshtein_ratio,
 )
 from mismo.text._similarity import double_metaphone as double_metaphone
+from mismo.text._similarity import jaccard as jaccard
+from mismo.text._similarity import jaro_similarity as jaro_similarity
+from mismo.text._similarity import jaro_winkler_similarity as jaro_winkler_similarity
+from mismo.text._similarity import levenshtein as levenshtein
 from mismo.text._similarity import levenshtein_ratio as levenshtein_ratio
 from mismo.text._strings import norm_whitespace as norm_whitespace
diff --git a/mismo/text/_similarity.py b/mismo/text/_similarity.py
@@ -107,3 +107,50 @@ def _dist_ratio(s1, s2, dist):
     s2 = _util.ensure_ibis(s2, "string")
     lenmax = ibis.greatest(s1.length(), s2.length())
     return (lenmax - dist(s1, s2)) / lenmax
+
+
+# Declared as a backend builtin: there is no Python body, so evaluation relies
+# on the executing backend providing a function with this name.
+@ibis.udf.scalar.builtin
+def jaro_similarity(s1: str, s2: str) -> float:
+    """The Jaro similarity between `s1` and `s2`.
+
+    This is defined as
+    `sj = 1/3 * (m/l_1 + m/l_2 + (m-t)/m)`
+
+    where `m` is the number of matching characters between `s1` and `s2`, `t` is
+    the number of transpositions between `s1` and `s2`, and `l_1` and `l_2` are
+    the lengths of `s1` and `s2` respectively.
+    """
+
+
+# TODO: This isn't portable between backends
+# (it is declared as a builtin, so the backend must supply the implementation).
+@ibis.udf.scalar.builtin
+def jaro_winkler_similarity(s1: str, s2: str) -> float:
+    """The Jaro-Winkler similarity between `s1` and `s2`.
+
+    The Jaro-Winkler similarity is a variant of the Jaro similarity that
+    measures the number of edits between two strings
+    and places a higher importance on the prefix.
+
+    It is defined as `sjw = sj + l * p * (1-sj)`
+    where `sj` is the Jaro similarity, `l` is the length of the common prefix (up to a
+    maximum of 4) and `p` is a constant scaling factor (up to a maximum of 0.25, but
+    typically set to 0.1)
+    """
+
+
+# NOTE(review): the equivalence stated in the docstring is not confirmed.
+# `test_jaccard_string_similarity` flags its ("foo bar", "foo") case as currently
+# failing, so verify how the backend builtin forms its sets (e.g. characters vs.
+# tokens) before relying on it.
+@ibis.udf.scalar.builtin
+def jaccard(s1: str, s2: str) -> float:
+    """The Jaccard similarity between `s1` and `s2`.
+
+    This is equivalent to
+
+    ```python
+    from mismo.sets import jaccard as jaccard_set
+    from mismo.text import tokenize
+
+    t1 = tokenize(s1)
+    t2 = tokenize(s2)
+    jaccard_set(t1, t2)
+    ```
+
+    but is added here for convenience.
+    """
diff --git a/mismo/text/tests/test_similarity.py b/mismo/text/tests/test_similarity.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 
-from mismo import text
+from mismo import sets, text
 
 
 @pytest.mark.parametrize(
@@ -38,3 +38,39 @@ def test_levenshtein_ratio(string1, string2, expected):
         assert np.isnan(result)
     else:
         assert expected == result
+
+
+@pytest.mark.parametrize(
+    "string1,string2,expected",
+    [
+        ("foo", "foo", 1),
+        ("foo", "food", 0.942),
+        ("bar", "bim", 0.5555),
+        ("", "", 0),
+        (None, None, np.nan),
+    ],
+)
+def test_jaro_winkler_similarity(string1, string2, expected):
+    result = text.jaro_winkler_similarity(string1, string2).execute()
+    if np.isnan(expected):
+        assert np.isnan(result)
+    else:
+        assert result == pytest.approx(expected, 0.001)
+
+
+@pytest.mark.parametrize(
+    "string1,string2,expected",
+    [
+        ("foo", "foo", 1),
+        ("foo bar", "foo", 0.3333),  # this is currently failing
+        ("foo bar", "bar foo", 1),
+    ],
+)
+def test_jaccard_string_similarity(string1, string2, expected):
+    """Test that the string and set jaccard methods are equivalent."""
+    result = text.jaccard(string1, string2).execute()
+    tokens1 = text.tokenize(string1)
+    tokens2 = text.tokenize(string2)
+    set_result = sets.jaccard(tokens1, tokens2).execute()
+    assert result == pytest.approx(set_result, 0.001)
+    assert result == pytest.approx(expected, 0.001)