NickCrews · jstammers · Aug 1, 2024 · Aug 1, 2024 · Aug 1, 2024 · Aug 1, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 .benchmarks
 .coverage
+.hypothesis
 .pdm-python
 .venv
 .vscode

diff --git a/mismo/_util.py b/mismo/_util.py
@@ -320,3 +320,18 @@ def struct_tokens(struct: ir.StructValue, *, unique: bool = True) -> ir.ArrayVal
     if unique:
         tokens = tokens.unique()
     return tokens
+
+
+def tokenize(s: ir.StringValue, *, unique: bool = False, remove_punctuation: bool = False) -> ir.ArrayValue:
+    """Tokenize a string.
+
+    If `unique` is True, only unique tokens are returned.
+    If `remove_punctuation` is True, punctuation is removed from the tokens."""
+    if remove_punctuation:
+        tokens = s.re_split(r"\W*\s+\W*")
+    else:
+        tokens = s.re_split(r"\s+")
+    tokens = tokens.filter(lambda x: x != "")
+    if unique:
+        tokens = tokens.unique()
+    return tokens
diff --git a/mismo/lib/geo/tests/test_postal_benchmark.py b/mismo/lib/geo/tests/test_postal_benchmark.py
@@ -8,8 +8,7 @@
 import pytest
 
 from mismo.lib.geo import postal_parse_address
-from mismo.lib.geo._address import ADDRESS_SCHEMA
-
+from mismo.lib.geo._postal import _ADDRESS_SCHEMA as ADDRESS_SCHEMA
 try:
     from postal.parser import parse_address as _parse_address
 except ImportError:

diff --git a/mismo/text/__init__.py b/mismo/text/__init__.py
@@ -5,5 +5,8 @@
 from mismo.text._similarity import damerau_levenshtein as damerau_levenshtein
 from mismo.text._similarity import double_metaphone as double_metaphone
 from mismo.text._similarity import levenshtein_ratio as levenshtein_ratio
+from mismo.text._similarity import token_set_ratio as token_set_ratio
+from mismo.text._similarity import token_sort_ratio as token_sort_ratio
+from mismo.text._similarity import partial_token_sort_ratio as partial_token_sort_ratio
 from mismo.text._strings import ngrams as ngrams
 from mismo.text._strings import norm_whitespace as norm_whitespace
diff --git a/mismo/text/_similarity.py b/mismo/text/_similarity.py
@@ -91,3 +91,131 @@ def levenshtein_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValu
     lenmax = ibis.greatest(s1.length(), s2.length())
     ldist = s1.levenshtein(s2)
     return (lenmax - ldist) / lenmax
+
+def token_set_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue:
+    """The ratio of the intersection of the token sets of two strings to the union of the token sets.
+
+    This is a measure of how similar two strings are, based on the set of tokens they contain.
+    It is a variation of the Jaccard index, where the intersection and union are based on the
+    set of tokens in the strings.
+
+    Parameters
+    ----------
+    s1:
+        The first string
+
+    s2:
+        The second string
+
+    Returns
+    -------
+    token_set_ratio:
+        The ratio of the intersection of the token sets to the union of the token sets
+
+    Examples
+    --------
+    >>> from mismo.text import token_set_ratio
+    >>> token_set_ratio("mile mile", "mile mike").execute()
+    0.75
+    >>> token_set_ratio("mile mile", "mile").execute()
+    1.0
+    >>> token_set_ratio("mile mile", "").execute()
+    0.0
+    >>> token_set_ratio("", "").execute()
+    nan
+    """
+    s1 = _util.ensure_ibis(s1, "string")
+    s2 = _util.ensure_ibis(s2, "string")
+
+    # Extract unique tokens from the strings
+    tokens1 = _util.tokenize(s1, unique=True, remove_punctuation=True)
+    tokens2 = _util.tokenize(s2, unique=True, remove_punctuation=True)
+
+
+    # Find the intersection and differences
+    intersection = tokens1.intersect(tokens2)
+    difference1 = tokens1.filter(lambda x: ~tokens2.contains(x))
+    difference2 = tokens2.filter(lambda x: ~tokens1.contains(x))
+
+    # Calculate lengths
+    len_intersection = intersection.length()
+    len_diff1 = difference1.length()
+    len_diff2 = difference2.length()
+
+    # Calculate scores
+    score1 = len_intersection / (len_intersection + len_diff1)
+    score2 = len_intersection / (len_intersection + len_diff2)
+    score3 = (len_intersection + len_diff1) / (len_intersection + len_diff2)
+
+    # Calculate final ratio
+    ratio = ibis.greatest(score1, score2, score3) * 100
+    return ratio
+
+
+def token_sort_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue:
+    """The levenshtein ratio of two strings after tokenizing and sorting the tokens.
+
+    This is a useful measure of similarity when the order of the tokens is not important, 
+    for example with addresses.
+
+    Parameters
+    ----------
+    s1:
+        The first string
+
+    s2:
+        The second string
+
+    Returns
+    -------
+    token_sort_ratio:
+        The levenstein ratio of the sorted tokens
+
+    Examples
+    --------
+    >>> from mismo.text import token_sort_ratio
+    >>> token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear").execute()
+    100
+    >>> token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear").execute()
+    84
+    """
+    s1 = _util.ensure_ibis(s1, "string")
+    s2 = _util.ensure_ibis(s2, "string")
+
+    tokens1 = _util.tokenize(s1, remove_punctuation=True)
+    tokens2 = _util.tokenize(s2, remove_punctuation=True)
+
+    sorted_tokens1 = tokens1.sort()
+    sorted_tokens2 = tokens2.sort()
+
+    sorted_str1 = sorted_tokens1.join(' ')
+    sorted_str2 = sorted_tokens2.join(' ')
+
+    ratio = levenshtein_ratio(sorted_str1, sorted_str2)
+    return ratio * 100
+
+
+def partial_token_sort_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue:
+    """Similar to token_sort_ratio, but only uses the minimum length string
+
+    This is useful when one of the strings may contain additional noise
+
+    """
+    s1 = _util.ensure_ibis(s1, "string")
+    s2 = _util.ensure_ibis(s2, "string")
+
+    tokens1 = _util.tokenize(s1, remove_punctuation=True)
+    tokens2 = _util.tokenize(s2, remove_punctuation=True)
+
+    sorted_tokens1 = tokens1.sort()
+    sorted_tokens2 = tokens2.sort()
+
+    sorted_str1 = sorted_tokens1.join(' ')
+    sorted_str2 = sorted_tokens2.join(' ')
+
+    min_len = ibis.least(sorted_str1.length(), sorted_str2.length())
+    sorted_str1 = sorted_str1.left(min_len)
+    sorted_str2 = sorted_str2.left(min_len)
+
+    ratio = levenshtein_ratio(sorted_str1, sorted_str2)
+    return ratio * 100
diff --git a/mismo/text/tests/test_fuzz_hypothesis.py b/mismo/text/tests/test_fuzz_hypothesis.py
@@ -0,0 +1,30 @@
+""" A test module that verifies the string simiilarity functions return the same values as those in rapidfuzz"""
+
+from rapidfuzz import fuzz
+from mismo import text
+from hypothesis import given, strategies as st
+
+
+@given(x=st.text(), y=st.text())
+def test_levenshtein_ratio(x,y):
+    expected = fuzz.ratio(x,y)
+    result = text.levenshtein_ratio(x,y).execute() * 100
+    assert expected == result
+
+@given(x=st.text(), y=st.text())
+def test_token_set_ratio(x,y):
+    expected = fuzz.token_set_ratio(x,y)
+    result = text.token_set_ratio(x,y).execute()
+    assert expected == result
+
+@given(x=st.text(), y=st.text())
+def test_token_sort_ratio(x,y):
+    expected = fuzz.token_sort_ratio(x,y)
+    result = text.token_sort_ratio(x,y).execute()
+    assert expected == result
+
+@given(x=st.text(), y=st.text())
+def test_partial_token_sort_ratio(x,y):
+    expected = fuzz.partial_token_sort_ratio(x,y)
+    result = text.partial_token_sort_ratio(x,y).execute()
+    assert expected == result
diff --git a/mismo/text/tests/test_similarity.py b/mismo/text/tests/test_similarity.py
@@ -38,3 +38,39 @@ def test_levenshtein_ratio(string1, string2, expected):
         assert np.isnan(result)
     else:
         assert expected == result
+
+@pytest.mark.parametrize(
+    "string1,string2,expected",
+    [
+        ("fuzzy was a bear", "fuzzy fuzzy was a bear", 100)
+    ]
+)
+def test_token_set_ratio(string1, string2, expected):
+    result = text.token_set_ratio(string1, string2).execute()
+    assert expected == result
+
+@pytest.mark.parametrize(
+    "string1, string2, expected",
+    [
+        ("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear", 100),
+        ("fuzzy was a bear", "fuzzy fuzzy was a bear", 84),
+
+    ]
+)
+def test_token_sort_ratio(string1, string2, expected):
+    result = text.token_sort_ratio(string1, string2).execute()
+    assert expected == result
+
+
+@pytest.mark.parametrize(
+    "string1, string2, expected",
+    [
+        ("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear", 100),
+        ("fuzzy was a bear", "fuzzy fuzzy was a bear", 100),
+        ('great is scala', 'java is great', 81),
+        ('C++ and Java', 'Java and Python', 64),
+    ]
+)
+def test_partial_token_sort_ratio(string1, string2, expected):
+    result = text.partial_token_sort_ratio(string1, string2).execute()
+    assert expected == result
diff --git a/mismo/text/tests/test_similarity_benchmark.py b/mismo/text/tests/test_similarity_benchmark.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+import string
+import random
+
+import ibis
+from ibis import _
+import ibis.expr.types as ir
+import pytest
+
+from mismo import text
+from rapidfuzz.process import cpdist
+from rapidfuzz import fuzz
+import pyarrow as pa
+
+@ibis.udf.scalar.pyarrow
+def levenshtein_ratio(s1: str, s2: str) -> float:
+    return cpdist(s1.to_numpy(), s2.to_numpy())
+
+@ibis.udf.scalar.python
+def ratio(s1: str, s2: str) -> float:
+    return fuzz.ratio(s1, s2)
+
+def create_test_data() -> ir.Table:
+    random.seed(0)
+    words = [
+        "".join(random.choice(string.ascii_letters + string.digits) for _ in range(10))
+        for _ in range(10_000)
+    ]
+    arr1 = random.choices(words, k=10_000_000)
+    arr2 = random.choices(words, k=10_000_000)
+    return ibis.memtable({"s1": arr1, "s2": arr2})
+
+@pytest.fixture
+def data(backend: ibis.BaseBackend) -> ir.Table:
+     t = backend.create_table("data",create_test_data())
+     t = t.cache()
+     return t
+
+
+@pytest.mark.parametrize(
+    "fn",
+    [
+        pytest.param(ratio, id="rapidfuzz"),
+        pytest.param(levenshtein_ratio, id="rapidfuzz-process"),
+        pytest.param(text.levenshtein_ratio, id="mismo"),
+    ],
+)
+@pytest.mark.parametrize(
+    "nrows",
+    [
+        pytest.param(1_000, id="1k"),
+        pytest.param(10_000, id="10k"),
+        pytest.param(100_000, id="100k"),
+        pytest.param(1_000_000, id="1m"),
+        pytest.param(10_000_000, id="10m"),
+    ],
+)
+def test_benchmark_similarity(backend: ibis.BaseBackend, data, nrows, fn, benchmark):
+    inp = data.head(nrows).cache()
+
+    def run():
+        t = inp.mutate(result=fn(inp.s1, inp.s2))
+        return backend.create_table("temp", t, overwrite=True)
+
+    result = benchmark(run)
+    assert len(result.execute()) == nrows