Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/SUMMARY.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
* [Home](index.md)
* Examples
* [Patent Deduplication](examples/patent_deduplication.ipynb)
* [String Comparison Methods](examples/string_comparisons.ipynb)
* Concepts
* [Goals and Alternatives](concepts/goals_and_alternatives.md)
* [Fellegi-Sunter Model](concepts/fs.md)
Expand Down
278 changes: 278 additions & 0 deletions docs/examples/string_comparisons.ipynb

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions mismo/eda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@

from mismo.eda._plot import distribution_chart as distribution_chart
from mismo.eda._plot import distribution_dashboard as distribution_dashboard
from mismo.eda._string import (
string_comparator_score_chart as string_comparator_score_chart,
)
from mismo.eda._string import string_comparator_scores as string_comparator_scores
168 changes: 168 additions & 0 deletions mismo/eda/_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
from __future__ import annotations

import altair as alt
from ibis import _
from ibis import selectors as s
from ibis.expr import types as ir

from mismo.text import (
damerau_levenshtein,
damerau_levenshtein_ratio,
jaccard,
jaro_similarity,
jaro_winkler_similarity,
levenshtein,
levenshtein_ratio,
)


def string_comparator_scores(table: ir.Table, col1: str, col2: str) -> ir.Table:
"""Create a table of string comparison measures between two columns.

This calculates the following similarity measures which range between 0 and 1:
- The Jaro similarity
- The Jaro-Winkler similarity
- The Levenshtein ratio
- The Damerau-Levenshtein ratio

as well as the following edit distances:
- The Levenshtein distance
- The Damerau-Levenshtein distance


Parameters
----------

table : ir.Table
An ibis table containing string columns.
col1: str
The name of the first column.
col2: str
The name of the second column.

Returns
-------
A table of string comparison measures between two columns.

Examples
--------

>>> import ibis
>>> from mismo.eda import string_comparator_scores
>>> ibis.options.interactive = True
>>> table = ibis.memtable({"string1": ["foo", "bar", "fizz"],
... "string2": ["foo", "bam", "fizz buzz"]})
>>> string_comparator_scores(table, col1="string1", col2="string2")
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━┓
┃ string1 ┃ string2 ┃ jaro_similarity ┃ jaro_winkler_similarity ┃ … ┃
┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━┩
│ string │ string │ float64 │ float64 │ … │
├─────────┼───────────┼─────────────────┼─────────────────────────┼───┤
│ foo │ foo │ 1.000000 │ 1.000000 │ … │
│ bar │ bam │ 0.777778 │ 0.822222 │ … │
│ fizz │ fizz buzz │ 0.814815 │ 0.888889 │ … │
└─────────┴───────────┴─────────────────┴─────────────────────────┴───┘
"""
comp_table = (
table.select(_[col1].name("string1"), _[col2].name("string2"))
.mutate(
jaro_similarity=jaro_similarity(_.string1, _.string2),
jaro_winkler_similarity=jaro_winkler_similarity(_.string1, _.string2),
jaccard_similarity=jaccard(_.string1, _.string2),
levenshtein_ratio=levenshtein_ratio(_.string1, _.string2),
damerau_levenshtein_ratio=damerau_levenshtein_ratio(_.string1, _.string2),
levenshtein_distance=levenshtein(_.string1, _.string2),
damerau_levenshtein_distance=damerau_levenshtein(_.string1, _.string2),
)
.cache()
)

return comp_table


def string_comparator_score_chart(table: ir.Table, col1: str, col2: str) -> alt.Chart:
"""Create a heatmap of string comparison measures between two columns.

Examples
--------

>>> import ibis
>>> from mismo.eda import string_comparator_score_chart
>>> table = ibis.memtable({"string1": ["foo", "bar", "fizz"],
... "string2": ["foo", "bam", "fizz buzz"]})
>>> string_comparator_score_chart(table, col1="string1", col2="string2")
alt.Chart(...)
"""

comp_table = string_comparator_scores(table, col1, col2).mutate(
strings_to_compare=_.string1.concat(", ", _.string2)
)
similarity_records = (
comp_table.select(
"strings_to_compare", s.contains("similarity") | s.contains("ratio")
)
.pivot_longer(
~s.cols("strings_to_compare"), names_to="comparator", values_to="value"
)
.mutate(
comparator=_.comparator.re_replace("(_similarity)|(_ratio)", ""),
)
)
distance_records = (
comp_table.select("strings_to_compare", s.contains("distance"))
.pivot_longer(
~s.cols("strings_to_compare"), names_to="comparator", values_to="value"
)
.mutate(
comparator=_.comparator.re_replace("_distance", ""),
)
)
base = (
alt.Chart(similarity_records, title="Similarity")
.mark_rect()
.encode(
y=alt.Text(
"strings_to_compare:O",
title="String comparison",
),
x=alt.Text("comparator:O", title=None),
color=alt.Color("value:Q", legend=None, scale=alt.Scale(domain=(0, 1))),
)
)

text = base.mark_text().encode(
alt.Text("value:Q", format=".2f"),
color=alt.value("black"),
)

distance_base = (
alt.Chart(distance_records, title="Distance")
.mark_rect()
.encode(
y=alt.Text("strings_to_compare:O", axis=None),
x=alt.Text("comparator:O", title=None),
color=alt.Color("value:Q", legend=None).scale(
scheme="yelloworangered", reverse=True
),
)
)

distance_text = distance_base.mark_text().encode(
alt.Text("value:Q", format=".2f"),
color=alt.value("black"),
)
chart = alt.hconcat(
base + text,
distance_base + distance_text,
title=alt.Title(text="Heatmaps of string comparison metrics", anchor="middle"),
config=alt.Config(
view=alt.ViewConfig(discreteHeight={"step": 30}, discreteWidth={"step": 40})
),
).resolve_scale(color="independent", size="independent")
return chart


if __name__ == "__main__":
import doctest

doctest.testmod()
5 changes: 5 additions & 0 deletions mismo/text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@
from __future__ import annotations

from mismo.text._features import ngrams as ngrams
from mismo.text._features import tokenize as tokenize
from mismo.text._similarity import damerau_levenshtein as damerau_levenshtein
from mismo.text._similarity import (
damerau_levenshtein_ratio as damerau_levenshtein_ratio,
)
from mismo.text._similarity import double_metaphone as double_metaphone
from mismo.text._similarity import jaccard as jaccard
from mismo.text._similarity import jaro_similarity as jaro_similarity
from mismo.text._similarity import jaro_winkler_similarity as jaro_winkler_similarity
from mismo.text._similarity import levenshtein as levenshtein
from mismo.text._similarity import levenshtein_ratio as levenshtein_ratio
from mismo.text._strings import norm_whitespace as norm_whitespace
47 changes: 47 additions & 0 deletions mismo/text/_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,50 @@ def _dist_ratio(s1, s2, dist):
s2 = _util.ensure_ibis(s2, "string")
lenmax = ibis.greatest(s1.length(), s2.length())
return (lenmax - dist(s1, s2)) / lenmax


@ibis.udf.scalar.builtin
def jaro_similarity(s1: str, s2: str) -> float:
"""The jaro similarity between `s1` and `s2`.

This is defined as
`sj = 1/3 * (m/l_1 + m/l_2 + (m-t)/m)`

where `m` is the number of matching characters between s1 and s2 and `t` is the
number of transpositions between `s1` and `s2`.
"""


# TODO: This isn't portable between backends
@ibis.udf.scalar.builtin
def jaro_winkler_similarity(s1: str, s2: str) -> float:
"""The Jaro-Winkler similarity between `s1` and `s2`.

The Jaro-Winkler similarity is a variant of the Jaro similarity that
measures the number of edits between two strings
and places a higher importance on the prefix.

It is defined as `(sjw = sj + l * p * (1-sj)`
where `sj` is the Jaro similarity, `l` is the length of the common prefix (up to a
maximum of 4) and `p` is a constant scaling factor (up to a maximum of 0.25, but
typically set to 0.1)
"""


@ibis.udf.scalar.builtin
def jaccard(s1: str, s2: str) -> float:
"""The Jaccard similarity between `s1` and `s2

This is equivalent to

```python
from mismo.sets import jaccard as jaccard_set
from mismo.text import tokenize

t1 = tokenize(s1)
t2 = tokenize(s2)
jaccard_set(t1, t2)
```

but is added here for convenience.
"""
38 changes: 37 additions & 1 deletion mismo/text/tests/test_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
import pytest

from mismo import text
from mismo import sets, text


@pytest.mark.parametrize(
Expand Down Expand Up @@ -38,3 +38,39 @@ def test_levenshtein_ratio(string1, string2, expected):
assert np.isnan(result)
else:
assert expected == result


@pytest.mark.parametrize(
"string1,string2,expected",
[
("foo", "foo", 1),
("foo", "food", 0.942),
("bar", "bim", 0.5555),
("", "", 0),
(None, None, np.nan),
],
)
def test_jaro_winkler_similarity(string1, string2, expected):
result = text.jaro_winkler_similarity(string1, string2).execute()
if np.isnan(expected):
assert np.isnan(result)
else:
assert result == pytest.approx(expected, 0.001)


@pytest.mark.parametrize(
"string1,string2,expected",
[
("foo", "foo", 1),
("foo bar", "foo", 0.3333), # this is currently failing
("foo bar", "bar foo", 1),
],
)
def test_jaccard_string_similarity(string1, string2, expected):
"""Test that the string and set jaccard methods are equivalent."""
result = text.jaccard(string1, string2).execute()
tokens1 = text.tokenize(string1)
tokens2 = text.tokenize(string2)
set_result = sets.jaccard(tokens1, tokens2).execute()
assert result == pytest.approx(set_result, 0.001)
assert result == pytest.approx(expected, 0.001)