Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
.benchmarks
.coverage
.hypothesis
.pdm-python
.venv
.vscode
Expand Down
15 changes: 15 additions & 0 deletions mismo/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,18 @@ def struct_tokens(struct: ir.StructValue, *, unique: bool = True) -> ir.ArrayVal
if unique:
tokens = tokens.unique()
return tokens


def tokenize(s: ir.StringValue, *, unique: bool = False, remove_punctuation: bool = False) -> ir.ArrayValue:
"""Tokenize a string.

If `unique` is True, only unique tokens are returned.
If `remove_punctuation` is True, punctuation is removed from the tokens."""
if remove_punctuation:
tokens = s.re_split(r"\W*\s+\W*")
else:
tokens = s.re_split(r"\s+")
tokens = tokens.filter(lambda x: x != "")
if unique:
tokens = tokens.unique()
return tokens
3 changes: 1 addition & 2 deletions mismo/lib/geo/tests/test_postal_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
import pytest

from mismo.lib.geo import postal_parse_address
from mismo.lib.geo._address import ADDRESS_SCHEMA

from mismo.lib.geo._postal import _ADDRESS_SCHEMA as ADDRESS_SCHEMA
try:
from postal.parser import parse_address as _parse_address
except ImportError:
Expand Down
3 changes: 3 additions & 0 deletions mismo/text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,8 @@
from mismo.text._similarity import damerau_levenshtein as damerau_levenshtein
from mismo.text._similarity import double_metaphone as double_metaphone
from mismo.text._similarity import levenshtein_ratio as levenshtein_ratio
from mismo.text._similarity import token_set_ratio as token_set_ratio
from mismo.text._similarity import token_sort_ratio as token_sort_ratio
from mismo.text._similarity import partial_token_sort_ratio as partial_token_sort_ratio
from mismo.text._strings import ngrams as ngrams
from mismo.text._strings import norm_whitespace as norm_whitespace
128 changes: 128 additions & 0 deletions mismo/text/_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,131 @@ def levenshtein_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValu
lenmax = ibis.greatest(s1.length(), s2.length())
ldist = s1.levenshtein(s2)
return (lenmax - ldist) / lenmax

def token_set_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue:
"""The ratio of the intersection of the token sets of two strings to the union of the token sets.

This is a measure of how similar two strings are, based on the set of tokens they contain.
It is a variation of the Jaccard index, where the intersection and union are based on the
set of tokens in the strings.

Parameters
----------
s1:
The first string

s2:
The second string

Returns
-------
token_set_ratio:
The ratio of the intersection of the token sets to the union of the token sets

Examples
--------
>>> from mismo.text import token_set_ratio
>>> token_set_ratio("mile mile", "mile mike").execute()
0.75
>>> token_set_ratio("mile mile", "mile").execute()
1.0
>>> token_set_ratio("mile mile", "").execute()
0.0
>>> token_set_ratio("", "").execute()
nan
"""
s1 = _util.ensure_ibis(s1, "string")
s2 = _util.ensure_ibis(s2, "string")

# Extract unique tokens from the strings
tokens1 = _util.tokenize(s1, unique=True, remove_punctuation=True)
tokens2 = _util.tokenize(s2, unique=True, remove_punctuation=True)


# Find the intersection and differences
intersection = tokens1.intersect(tokens2)
difference1 = tokens1.filter(lambda x: ~tokens2.contains(x))
difference2 = tokens2.filter(lambda x: ~tokens1.contains(x))

# Calculate lengths
len_intersection = intersection.length()
len_diff1 = difference1.length()
len_diff2 = difference2.length()

# Calculate scores
score1 = len_intersection / (len_intersection + len_diff1)
score2 = len_intersection / (len_intersection + len_diff2)
score3 = (len_intersection + len_diff1) / (len_intersection + len_diff2)

# Calculate final ratio
ratio = ibis.greatest(score1, score2, score3) * 100
return ratio


def token_sort_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue:
"""The levenshtein ratio of two strings after tokenizing and sorting the tokens.

This is a useful measure of similarity when the order of the tokens is not important,
for example with addresses.

Parameters
----------
s1:
The first string

s2:
The second string

Returns
-------
token_sort_ratio:
The levenstein ratio of the sorted tokens

Examples
--------
>>> from mismo.text import token_sort_ratio
>>> token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear").execute()
100
>>> token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear").execute()
84
"""
s1 = _util.ensure_ibis(s1, "string")
s2 = _util.ensure_ibis(s2, "string")

tokens1 = _util.tokenize(s1, remove_punctuation=True)
tokens2 = _util.tokenize(s2, remove_punctuation=True)

sorted_tokens1 = tokens1.sort()
sorted_tokens2 = tokens2.sort()

sorted_str1 = sorted_tokens1.join(' ')
sorted_str2 = sorted_tokens2.join(' ')

ratio = levenshtein_ratio(sorted_str1, sorted_str2)
return ratio * 100


def partial_token_sort_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue:
"""Similar to token_sort_ratio, but only uses the minimum length string

This is useful when one of the strings may contain additional noise

"""
s1 = _util.ensure_ibis(s1, "string")
s2 = _util.ensure_ibis(s2, "string")

tokens1 = _util.tokenize(s1, remove_punctuation=True)
tokens2 = _util.tokenize(s2, remove_punctuation=True)

sorted_tokens1 = tokens1.sort()
sorted_tokens2 = tokens2.sort()

sorted_str1 = sorted_tokens1.join(' ')
sorted_str2 = sorted_tokens2.join(' ')

min_len = ibis.least(sorted_str1.length(), sorted_str2.length())
sorted_str1 = sorted_str1.left(min_len)
sorted_str2 = sorted_str2.left(min_len)

ratio = levenshtein_ratio(sorted_str1, sorted_str2)
return ratio * 100
30 changes: 30 additions & 0 deletions mismo/text/tests/test_fuzz_hypothesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
""" A test module that verifies the string simiilarity functions return the same values as those in rapidfuzz"""

from rapidfuzz import fuzz
from mismo import text
from hypothesis import given, strategies as st


@given(x=st.text(), y=st.text())
def test_levenshtein_ratio(x,y):
expected = fuzz.ratio(x,y)
result = text.levenshtein_ratio(x,y).execute() * 100
assert expected == result

@given(x=st.text(), y=st.text())
def test_token_set_ratio(x,y):
expected = fuzz.token_set_ratio(x,y)
result = text.token_set_ratio(x,y).execute()
assert expected == result

@given(x=st.text(), y=st.text())
def test_token_sort_ratio(x,y):
expected = fuzz.token_sort_ratio(x,y)
result = text.token_sort_ratio(x,y).execute()
assert expected == result

@given(x=st.text(), y=st.text())
def test_partial_token_sort_ratio(x,y):
expected = fuzz.partial_token_sort_ratio(x,y)
result = text.partial_token_sort_ratio(x,y).execute()
assert expected == result
36 changes: 36 additions & 0 deletions mismo/text/tests/test_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,39 @@ def test_levenshtein_ratio(string1, string2, expected):
assert np.isnan(result)
else:
assert expected == result

@pytest.mark.parametrize(
"string1,string2,expected",
[
("fuzzy was a bear", "fuzzy fuzzy was a bear", 100)
]
)
def test_token_set_ratio(string1, string2, expected):
result = text.token_set_ratio(string1, string2).execute()
assert expected == result

@pytest.mark.parametrize(
"string1, string2, expected",
[
("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear", 100),
("fuzzy was a bear", "fuzzy fuzzy was a bear", 84),

]
)
def test_token_sort_ratio(string1, string2, expected):
result = text.token_sort_ratio(string1, string2).execute()
assert expected == result


@pytest.mark.parametrize(
"string1, string2, expected",
[
("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear", 100),
("fuzzy was a bear", "fuzzy fuzzy was a bear", 100),
('great is scala', 'java is great', 81),
('C++ and Java', 'Java and Python', 64),
]
)
def test_partial_token_sort_ratio(string1, string2, expected):
result = text.partial_token_sort_ratio(string1, string2).execute()
assert expected == result
67 changes: 67 additions & 0 deletions mismo/text/tests/test_similarity_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from __future__ import annotations

import string
import random

import ibis
from ibis import _
import ibis.expr.types as ir
import pytest

from mismo import text
from rapidfuzz.process import cpdist
from rapidfuzz import fuzz
import pyarrow as pa

@ibis.udf.scalar.pyarrow
def levenshtein_ratio(s1: str, s2: str) -> float:
return cpdist(s1.to_numpy(), s2.to_numpy())

@ibis.udf.scalar.python
def ratio(s1: str, s2: str) -> float:
return fuzz.ratio(s1, s2)

def create_test_data() -> ir.Table:
random.seed(0)
words = [
"".join(random.choice(string.ascii_letters + string.digits) for _ in range(10))
for _ in range(10_000)
]
arr1 = random.choices(words, k=10_000_000)
arr2 = random.choices(words, k=10_000_000)
return ibis.memtable({"s1": arr1, "s2": arr2})

@pytest.fixture
def data(backend: ibis.BaseBackend) -> ir.Table:
t = backend.create_table("data",create_test_data())
t = t.cache()
return t


@pytest.mark.parametrize(
"fn",
[
pytest.param(ratio, id="rapidfuzz"),
pytest.param(levenshtein_ratio, id="rapidfuzz-process"),
pytest.param(text.levenshtein_ratio, id="mismo"),
],
)
@pytest.mark.parametrize(
"nrows",
[
pytest.param(1_000, id="1k"),
pytest.param(10_000, id="10k"),
pytest.param(100_000, id="100k"),
pytest.param(1_000_000, id="1m"),
pytest.param(10_000_000, id="10m"),
],
)
def test_benchmark_similarity(backend: ibis.BaseBackend, data, nrows, fn, benchmark):
inp = data.head(nrows).cache()

def run():
t = inp.mutate(result=fn(inp.s1, inp.s2))
return backend.create_table("temp", t, overwrite=True)

result = benchmark(run)
assert len(result.execute()) == nrows
Loading