From 97d094ed59737d2ef4714777373dee89c2841e20 Mon Sep 17 00:00:00 2001 From: jstammers Date: Tue, 8 Oct 2024 12:50:38 +0100 Subject: [PATCH 1/4] implement Splink string comparison chart --- docs/examples/string_comparisons.ipynb | 242 +++++++++++++++++++++++++ mismo/eda/__init__.py | 4 + mismo/eda/_string.py | 168 +++++++++++++++++ mismo/text/__init__.py | 5 + mismo/text/_similarity.py | 57 +++++- mismo/text/tests/test_similarity.py | 38 +++- 6 files changed, 512 insertions(+), 2 deletions(-) create mode 100644 docs/examples/string_comparisons.ipynb create mode 100644 mismo/eda/_string.py diff --git a/docs/examples/string_comparisons.ipynb b/docs/examples/string_comparisons.ipynb new file mode 100644 index 00000000..c7c6d460 --- /dev/null +++ b/docs/examples/string_comparisons.ipynb @@ -0,0 +1,242 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b429d6cb7344eb9b", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-08T11:49:41.837527Z", + "start_time": "2024-10-08T11:49:40.220699Z" + } + }, + "outputs": [], + "source": [ + "from __future__ import annotations\n", + "\n", + "import ibis\n", + "\n", + "from mismo.block import KeyBlocker\n", + "from mismo.playdata import load_febrl1\n", + "\n", + "ibis.options.interactive = True\n", + "records, links = load_febrl1()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dc4c76bbe1ed06cd", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-08T11:49:41.885620Z", + "start_time": "2024-10-08T11:49:41.883245Z" + } + }, + "outputs": [], + "source": [ + "from mismo.cluster import connected_components\n", + "from mismo.eda import string_comparator_score_chart, string_comparator_scores" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e550096046e0e505", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-08T11:49:42.275798Z", + "start_time": "2024-10-08T11:49:42.098545Z" + } + }, + "outputs": [], + "source": [ + "connected = 
connected_components(links=links, records=records)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4501165142736034", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-08T11:49:42.368218Z", + "start_time": "2024-10-08T11:49:42.343046Z" + } + }, + "outputs": [], + "source": [ + "blocked = KeyBlocker(\"component\")(connected, connected)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3ba075261fce1d8c", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-08T11:49:42.671277Z", + "start_time": "2024-10-08T11:49:42.568748Z" + } + }, + "outputs": [], + "source": [ + "scores = string_comparator_scores(blocked.limit(20), \"surname_l\", \"surname_r\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ecd02caad29c4a7c", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-08T11:49:42.861466Z", + "start_time": "2024-10-08T11:49:42.859012Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e384caf06edc8179", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-08T11:49:43.169968Z", + "start_time": "2024-10-08T11:49:43.065416Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "string_comparator_score_chart(blocked.limit(20), \"surname_l\", \"surname_r\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "929f6463fcb7d6af", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-08T11:49:43.368034Z", + "start_time": "2024-10-08T11:49:43.366663Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f46b91dd4c33eaa0", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-08T11:49:43.548332Z", + "start_time": "2024-10-08T11:49:43.546614Z" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/mismo/eda/__init__.py b/mismo/eda/__init__.py index a1efe18e..87a54d57 100644 --- a/mismo/eda/__init__.py +++ b/mismo/eda/__init__.py @@ -4,3 +4,7 @@ from mismo.eda._plot import distribution_chart as distribution_chart from mismo.eda._plot import distribution_dashboard as distribution_dashboard +from mismo.eda._string import ( + string_comparator_score_chart as string_comparator_score_chart, +) +from mismo.eda._string import string_comparator_scores as string_comparator_scores diff --git a/mismo/eda/_string.py b/mismo/eda/_string.py new file mode 100644 index 00000000..17bdad7b --- /dev/null +++ b/mismo/eda/_string.py @@ -0,0 +1,168 @@ +from __future__ import annotations + +import altair as alt +from ibis import _ +from ibis import selectors as s +from ibis.expr import types as ir + +from 
mismo.text import ( + damerau_levenshtein, + damerau_levenshtein_ratio, + jaccard, + jaro_similarity, + jaro_winkler_similarity, + levenshtein, + levenshtein_ratio, +) + + +def string_comparator_scores(table: ir.Table, col1: str, col2: str) -> ir.Table: + """Create a table of string comparison measures between two columns. + + This calculates the following similarity measures which range between 0 and 1: + - The Jaro similarity + - The Jaro-Winkler similarity + - The Jaccard similarity + - The Levenshtein and Damerau-Levenshtein ratios + + as well as the following edit distances: + - The Levenshtein distance + - The Damerau-Levenshtein distance + + + Parameters + ---------- + + table : ir.Table + An ibis table containing string columns. + col1: str + The name of the first column. + col2: str + The name of the second column. + + Returns + ------- + A table of string comparison measures between two columns. + + Examples + -------- + + >>> import ibis + >>> from mismo.eda import string_comparator_scores + >>> ibis.options.interactive = True + >>> table = ibis.memtable({"string1": ["foo", "bar", "fizz"], + ... 
"string2": ["foo", "bam", "fizz buzz"]}) + >>> string_comparator_scores(table, col1="string1", col2="string2") + ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━┓ + ┃ string1 ┃ string2 ┃ jaro_similarity ┃ jaro_winkler_similarity ┃ … ┃ + ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━┩ + │ string │ string │ float64 │ float64 │ … │ + ├─────────┼───────────┼─────────────────┼─────────────────────────┼───┤ + │ foo │ foo │ 1.000000 │ 1.000000 │ … │ + │ bar │ bam │ 0.777778 │ 0.822222 │ … │ + │ fizz │ fizz buzz │ 0.814815 │ 0.888889 │ … │ + └─────────┴───────────┴─────────────────┴─────────────────────────┴───┘ + """ + comp_table = ( + table.select(_[col1].name("string1"), _[col2].name("string2")) + .mutate( + jaro_similarity=jaro_similarity(_.string1, _.string2), + jaro_winkler_similarity=jaro_winkler_similarity(_.string1, _.string2), + jaccard_similarity=jaccard(_.string1, _.string2), + levenshtein_ratio=levenshtein_ratio(_.string1, _.string2), + damerau_levenshtein_ratio=damerau_levenshtein_ratio(_.string1, _.string2), + levenshtein_distance=levenshtein(_.string1, _.string2), + damerau_levenshtein_distance=damerau_levenshtein(_.string1, _.string2), + ) + .cache() + ) + + return comp_table + + +def string_comparator_score_chart(table: ir.Table, col1: str, col2: str) -> alt.Chart: + """Create a heatmap of string comparison measures between two columns. + + Examples + -------- + + >>> import ibis + >>> from mismo.eda import string_comparator_score_chart + >>> table = ibis.memtable({"string1": ["foo", "bar", "fizz"], + ... "string2": ["foo", "bam", "fizz buzz"]}) + >>> string_comparator_score_chart(table, col1="string1", col2="string2") + alt.HConcatChart(...) 
+ """ + + comp_table = string_comparator_scores(table, col1, col2).mutate( + strings_to_compare=_.string1.concat(", ", _.string2) + ) + similarity_records = ( + comp_table.select( + "strings_to_compare", s.contains("similarity") | s.contains("ratio") + ) + .pivot_longer( + ~s.cols("strings_to_compare"), names_to="comparator", values_to="value" + ) + .mutate( + comparator=_.comparator.re_replace("(_similarity)|(_ratio)", ""), + ) + ) + distance_records = ( + comp_table.select("strings_to_compare", s.contains("distance")) + .pivot_longer( + ~s.cols("strings_to_compare"), names_to="comparator", values_to="value" + ) + .mutate( + comparator=_.comparator.re_replace("_distance", ""), + ) + ) + base = ( + alt.Chart(similarity_records, title="Similarity") + .mark_rect() + .encode( + y=alt.Text( + "strings_to_compare:O", + title="String comparison", + ), + x=alt.Text("comparator:O", title=None), + color=alt.Color("value:Q", legend=None, scale=alt.Scale(domain=(0, 1))), + ) + ) + + text = base.mark_text().encode( + alt.Text("value:Q", format=".2f"), + color=alt.value("black"), + ) + + distance_base = ( + alt.Chart(distance_records, title="Distance") + .mark_rect() + .encode( + y=alt.Text("strings_to_compare:O", axis=None), + x=alt.Text("comparator:O", title=None), + color=alt.Color("value:Q", legend=None).scale( + scheme="yelloworangered", reverse=True + ), + ) + ) + + distance_text = distance_base.mark_text().encode( + alt.Text("value:Q", format=".2f"), + color=alt.value("black"), + ) + chart = alt.hconcat( + base + text, + distance_base + distance_text, + title=alt.Title(text="Heatmaps of string comparison metrics", anchor="middle"), + config=alt.Config( + view=alt.ViewConfig(discreteHeight={"step": 30}, discreteWidth={"step": 40}) + ), + ).resolve_scale(color="independent", size="independent") + return chart + + +if __name__ == "__main__": + import doctest + + doctest.testmod() diff --git a/mismo/text/__init__.py b/mismo/text/__init__.py index 9f1313e5..5cf33a58 100644 
--- a/mismo/text/__init__.py +++ b/mismo/text/__init__.py @@ -3,10 +3,15 @@ from __future__ import annotations from mismo.text._features import ngrams as ngrams +from mismo.text._features import tokenize as tokenize from mismo.text._similarity import damerau_levenshtein as damerau_levenshtein from mismo.text._similarity import ( damerau_levenshtein_ratio as damerau_levenshtein_ratio, ) from mismo.text._similarity import double_metaphone as double_metaphone +from mismo.text._similarity import jaccard as jaccard +from mismo.text._similarity import jaro_similarity as jaro_similarity +from mismo.text._similarity import jaro_winkler_similarity as jaro_winkler_similarity +from mismo.text._similarity import levenshtein as levenshtein from mismo.text._similarity import levenshtein_ratio as levenshtein_ratio from mismo.text._strings import norm_whitespace as norm_whitespace diff --git a/mismo/text/_similarity.py b/mismo/text/_similarity.py index b45e10e4..01bf392b 100644 --- a/mismo/text/_similarity.py +++ b/mismo/text/_similarity.py @@ -47,6 +47,14 @@ def damerau_levenshtein(a: str, b: str) -> int: """ +@ibis.udf.scalar.builtin +def levenshtein(a: str, b: str) -> int: + """ + The number of adds, deletes and substitutions to get from `a` to `b`. + + """ + + def levenshtein_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue: """The levenshtein distance between two strings, normalized to be between 0 and 1. 
@@ -86,7 +94,7 @@ def levenshtein_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValu >>> levenshtein_ratio("", "").execute() np.float64(nan) """ - return _dist_ratio(s1, s2, lambda a, b: a.levenshtein(b)) + return _dist_ratio(s1, s2, levenshtein) def damerau_levenshtein_ratio( @@ -107,3 +115,50 @@ def _dist_ratio(s1, s2, dist): s2 = _util.ensure_ibis(s2, "string") lenmax = ibis.greatest(s1.length(), s2.length()) return (lenmax - dist(s1, s2)) / lenmax + + +@ibis.udf.scalar.builtin +def jaro_similarity(s1: str, s2: str) -> float: + """The jaro similarity between `s1` and `s2`. + + This is defined as + `sj = 1/3 * (m/l_1 + m/l_2 + (m-t)/m)` + + where `m` is the number of matching characters between s1 and s2 and `t` is the + number of transpositions between `s1` and `s2`. + """ + + +# TODO: This isn't portable between backends +@ibis.udf.scalar.builtin +def jaro_winkler_similarity(s1: str, s2: str) -> float: + """The Jaro-Winkler similarity between `s1` and `s2`. + + The Jaro-Winkler similarity is a variant of the Jaro similarity that + measures the number of edits between two strings + and places a higher importance on the prefix. + + It is defined as `sjw = sj + l * p * (1 - sj)` + where `sj` is the Jaro similarity, `l` is the length of the common prefix (up to a + maximum of 4) and `p` is a constant scaling factor (up to a maximum of 0.25, but + typically set to 0.1) + """ + + +@ibis.udf.scalar.builtin +def jaccard(s1: str, s2: str) -> float: + """The Jaccard similarity between `s1` and `s2`. + + This is intended to be equivalent to + + ```python + from mismo.sets import jaccard as jaccard_set + from mismo.text import tokenize + + t1 = tokenize(s1) + t2 = tokenize(s2) + jaccard_set(t1, t2) + ``` + + but is added here for convenience. 
+ """ diff --git a/mismo/text/tests/test_similarity.py b/mismo/text/tests/test_similarity.py index bd37185b..a07fa902 100644 --- a/mismo/text/tests/test_similarity.py +++ b/mismo/text/tests/test_similarity.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from mismo import text +from mismo import sets, text @pytest.mark.parametrize( @@ -38,3 +38,39 @@ def test_levenshtein_ratio(string1, string2, expected): assert np.isnan(result) else: assert expected == result + + +@pytest.mark.parametrize( + "string1,string2,expected", + [ + ("foo", "foo", 1), + ("foo", "food", 0.942), + ("bar", "bim", 0.5555), + ("", "", 0), + (None, None, np.nan), + ], +) +def test_jaro_winkler_similarity(string1, string2, expected): + result = text.jaro_winkler_similarity(string1, string2).execute() + if np.isnan(expected): + assert np.isnan(result) + else: + assert result == pytest.approx(expected, 0.001) + + +@pytest.mark.parametrize( + "string1,string2,expected", + [ + ("foo", "foo", 1), + ("foo bar", "foo", 0.3333), # this is currently failing + ("foo bar", "bar foo", 1), + ], +) +def test_jaccard_string_similarity(string1, string2, expected): + """Test that the string and set jaccard methods are equivalent.""" + result = text.jaccard(string1, string2).execute() + tokens1 = text.tokenize(string1) + tokens2 = text.tokenize(string2) + set_result = sets.jaccard(tokens1, tokens2).execute() + assert result == pytest.approx(set_result, 0.001) + assert result == pytest.approx(expected, 0.001) From ffcbea9586c7503064d99169957529897b61557a Mon Sep 17 00:00:00 2001 From: jstammers Date: Wed, 16 Oct 2024 16:36:55 +0100 Subject: [PATCH 2/4] add to example notebook --- docs/SUMMARY.md | 1 + docs/examples/string_comparisons.ipynb | 281 +++++++++++++++++-------- 2 files changed, 191 insertions(+), 91 deletions(-) diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index dd993d1b..f85a090f 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -1,6 +1,7 @@ * [Home](index.md) * Examples * [Patent 
Deduplication](examples/patent_deduplication.ipynb) + * [String Comparison Methods](examples/string_comparisons.ipynb) * Concepts * [Goals and Alternatives](concepts/goals_and_alternatives.md) * [Fellegi-Sunter Model](concepts/fs.md) diff --git a/docs/examples/string_comparisons.ipynb b/docs/examples/string_comparisons.ipynb index c7c6d460..a80432de 100644 --- a/docs/examples/string_comparisons.ipynb +++ b/docs/examples/string_comparisons.ipynb @@ -1,135 +1,262 @@ { "cells": [ + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "# uncomment and run if mismo is not installed\n", + "# %pip install -q git+https://github.com/NickCrews/mismo@main" + ], + "id": "3005879cf128faeb" + }, { "cell_type": "code", - "execution_count": 1, "id": "b429d6cb7344eb9b", "metadata": { "ExecuteTime": { - "end_time": "2024-10-08T11:49:41.837527Z", - "start_time": "2024-10-08T11:49:40.220699Z" + "end_time": "2024-10-16T15:30:05.585511Z", + "start_time": "2024-10-16T15:30:05.438305Z" } }, - "outputs": [], "source": [ "from __future__ import annotations\n", "\n", "import ibis\n", "\n", - "from mismo.block import KeyBlocker\n", - "from mismo.playdata import load_febrl1\n", "\n", - "ibis.options.interactive = True\n", - "records, links = load_febrl1()" - ] + "ibis.options.interactive = True\n" + ], + "outputs": [], + "execution_count": 1 }, { - "cell_type": "code", - "execution_count": 2, - "id": "dc4c76bbe1ed06cd", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-08T11:49:41.885620Z", - "start_time": "2024-10-08T11:49:41.883245Z" - } - }, - "outputs": [], + "metadata": {}, + "cell_type": "markdown", "source": [ - "from mismo.cluster import connected_components\n", - "from mismo.eda import string_comparator_score_chart, string_comparator_scores" - ] + "Many real-world datasets contain errors due to causes such as manual data entry, incorrect data processing and inconsistent formatting. 
Therefore it's often useful to make use of string similarity measures that can quantify how close two strings are to each other by accounting for common types of string manipulations. These are defined to give a score between 0 and 1, where 0 indicates minimal similarity and 1 indicates maximal similarity. Mismo currently implements the following string similarity measures that are suitable for different use-cases:\n", + "\n", + "- `Jaro` - a measure of the similarity between two strings given the number of matching characters and transpositions and their length.\n", + "- `Jaro-Winkler` - a modification of the Jaro similarity that uses a prefix scale to give more favourable weightings to strings that match at the start.\n", + "- `Jaccard` - a measure of the overlap between the sets of characters in two strings (the size of their intersection divided by the size of their union).\n", + "\n", + "In addition, the following edit distance measures are defined along with equivalent similarities that are normalized using string lengths.\n", + "- `Levenshtein` - a measure of the distance between two strings based on the number of deletions, insertions and substitutions.\n", + "- `Damerau-Levenshtein` - an extension of `Levenshtein` that includes transpositions." + ], + "id": "cd07440b9757cf54" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Let's explore how these work in practice using the patents dataset. 
We will generate pairs by blocking on the `label_true` column", + "id": "76c63dd5bd247ae7" }, { - "cell_type": "code", - "execution_count": 3, - "id": "e550096046e0e505", "metadata": { "ExecuteTime": { - "end_time": "2024-10-08T11:49:42.275798Z", - "start_time": "2024-10-08T11:49:42.098545Z" + "end_time": "2024-10-16T15:30:06.733211Z", + "start_time": "2024-10-16T15:30:05.602814Z" } }, - "outputs": [], + "cell_type": "code", "source": [ - "connected = connected_components(links=links, records=records)" - ] + "from mismo.playdata import load_patents\n", + "patents = load_patents()\n", + "patents" + ], + "id": "7c8477cf22e9d399", + "outputs": [ + { + "data": { + "text/plain": [ + "┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001B[1m \u001B[0m\u001B[1mrecord_id\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlabel_true\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mname_true\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mname\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlatitude\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlongitude\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mcoauthors\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mclasses\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\n", + "┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ \u001B[2mint64\u001B[0m │ \u001B[2mint64\u001B[0m │ \u001B[2mstring\u001B[0m │ \u001B[2mstring\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ 
\u001B[2mstring\u001B[0m │ \u001B[2mstring\u001B[0m │\n", + "├───────────┼────────────┼──────────────────────┼──────────────────────────────┼──────────┼───────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────┤\n", + "│ \u001B[1;36m2909\u001B[0m │ \u001B[1;36m402600\u001B[0m │ \u001B[32mAGILENT TECHNOLOGIES\u001B[0m │ \u001B[32m* AGILENT TECHNOLOGIES, INC.\u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mKONINK PHILIPS ELECTRONICS N V**DAVID E SNYDER**THOMAS D LYSTER \u001B[0m │ \u001B[32mA61N**A61B \u001B[0m │\n", + "│ \u001B[1;36m3574\u001B[0m │ \u001B[1;36m569309\u001B[0m │ \u001B[32mAKZO NOBEL \u001B[0m │ \u001B[32m* AKZO NOBEL N.V. \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mTSJERK HOEKSTRA**ANDRESS K JOHNSON**TERESA MARIE CHERON**ALBERTO SLIKTA**JA…\u001B[0m │ \u001B[32mG01N**B01L**C11D**G02F**F16L \u001B[0m │\n", + "│ \u001B[1;36m3575\u001B[0m │ \u001B[1;36m569309\u001B[0m │ \u001B[32mAKZO NOBEL \u001B[0m │ \u001B[32m* AKZO NOBEL NV \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mWILLIAM JOHN ERNEST PARR**HANS OSKARSSON**MARTIN HELLSTEN**KORNELIS OVERKEM…\u001B[0m │ \u001B[32mC09K**F17D**B01F**C23F \u001B[0m │\n", + "│ \u001B[1;36m3779\u001B[0m │ \u001B[1;36m656303\u001B[0m │ \u001B[32mALCATEL \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m52.35\u001B[0m │ \u001B[1;36m4.916667\u001B[0m │ \u001B[32mGUENTER KOCHSMEIER**ZBIGNIEW WIEGOLASKI**EVAN JOHN STANBURY**PETER GRANT JE…\u001B[0m │ \u001B[32mG02B**G04G**H02G**G06F \u001B[0m │\n", + "│ \u001B[1;36m3780\u001B[0m │ \u001B[1;36m656303\u001B[0m │ \u001B[32mALCATEL \u001B[0m │ \u001B[32m* ALCATEL N.V. 
\u001B[0m │ \u001B[1;36m52.35\u001B[0m │ \u001B[1;36m4.916667\u001B[0m │ \u001B[32mZILAN MANFRED**JOSIANE RAMOS**DUANE LYNN MORTENSEN**CHRISTIAN LE SERGENT \u001B[0m │ \u001B[32mH03G**B05D**H04L**H04B**C03B**C03C**G02B**H01B \u001B[0m │\n", + "│ \u001B[1;36m3782\u001B[0m │ \u001B[1;36m656303\u001B[0m │ \u001B[32mALCATEL \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mOLIVIER AUDOUIN**MICHEL SOTOM**JEAN MICHEL GABRIAGUES \u001B[0m │ \u001B[32mH04B**H01S**H04J \u001B[0m │\n", + "│ \u001B[1;36m15041\u001B[0m │ \u001B[1;36m4333661\u001B[0m │ \u001B[32mCANON EUROPA \u001B[0m │ \u001B[32m* CANON EUROPA N.V \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mLEE RICKLER**SIMON PARKER**CANON RES CENT EURO **RAKEFET SAGMAN**TIMOTHY FRA…\u001B[0m │ \u001B[32mG06F \u001B[0m │\n", + "│ \u001B[1;36m15042\u001B[0m │ \u001B[1;36m4333661\u001B[0m │ \u001B[32mCANON EUROPA \u001B[0m │ \u001B[32m* CANON EUROPA N.V. \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mQI HE HONG**ADAM MICHAEL BAUMBERG**ALEXANDER RALPH LYONS \u001B[0m │ \u001B[32mG06T**G01B \u001B[0m │\n", + "│ \u001B[1;36m15043\u001B[0m │ \u001B[1;36m4333661\u001B[0m │ \u001B[32mCANON EUROPA \u001B[0m │ \u001B[32m* CANON EUROPA NV \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mNILESH PATHAK**MASAMICHI MASUDA** CANON TECHNOLOGY EURO **PATRICK WILLIAM MO…\u001B[0m │ \u001B[32mH04B**G06T**G06F**H04M**H04N**H04Q**G03B**B41J**G01B**G06Q \u001B[0m │\n", + "│ \u001B[1;36m25387\u001B[0m │ \u001B[1;36m7650783\u001B[0m │ \u001B[32mDSM \u001B[0m │ \u001B[32m* DSM N.V. 
\u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mGABRIEL MARINUS MEESTERS**RUDOLF CAROLUS BARENDSE**ARIE KARST KIES**ALEXANDE…\u001B[0m │ \u001B[32mC12N**A61K**A23L**A23J**A23K**A01H**B01J**C12R**C07D**A61P**B01D\u001B[0m │\n", + "│ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │\n", + "└───────────┴────────────┴──────────────────────┴──────────────────────────────┴──────────┴───────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────┘" + ], + "text/html": [ + "
┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃ record_id  label_true  name_true             name                          latitude  longitude  coauthors                                                                         classes                                                          ┃\n",
+       "┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
+       "│ int64int64stringstringfloat64float64stringstring                                                           │\n",
+       "├───────────┼────────────┼──────────────────────┼──────────────────────────────┼──────────┼───────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────┤\n",
+       "│      2909402600AGILENT TECHNOLOGIES* AGILENT TECHNOLOGIES, INC.0.000.000000KONINK PHILIPS ELECTRONICS N V**DAVID E  SNYDER**THOMAS D  LYSTER               A61N**A61B                                                       │\n",
+       "│      3574569309AKZO NOBEL          * AKZO NOBEL N.V.           0.000.000000TSJERK  HOEKSTRA**ANDRESS K  JOHNSON**TERESA MARIE  CHERON**ALBERTO  SLIKTA**JA…G01N**B01L**C11D**G02F**F16L                                     │\n",
+       "│      3575569309AKZO NOBEL          * AKZO NOBEL NV             0.000.000000WILLIAM JOHN ERNEST  PARR**HANS  OSKARSSON**MARTIN  HELLSTEN**KORNELIS  OVERKEM…C09K**F17D**B01F**C23F                                           │\n",
+       "│      3779656303ALCATEL             * ALCATEL N.V.              52.354.916667GUENTER  KOCHSMEIER**ZBIGNIEW  WIEGOLASKI**EVAN JOHN  STANBURY**PETER GRANT  JE…G02B**G04G**H02G**G06F                                           │\n",
+       "│      3780656303ALCATEL             * ALCATEL N.V.              52.354.916667ZILAN  MANFRED**JOSIANE  RAMOS**DUANE LYNN  MORTENSEN**CHRISTIAN  LE SERGENT    H03G**B05D**H04L**H04B**C03B**C03C**G02B**H01B                   │\n",
+       "│      3782656303ALCATEL             * ALCATEL N.V.              0.000.000000OLIVIER  AUDOUIN**MICHEL  SOTOM**JEAN MICHEL  GABRIAGUES                        H04B**H01S**H04J                                                 │\n",
+       "│     150414333661CANON EUROPA        * CANON EUROPA N.V          0.000.000000LEE  RICKLER**SIMON  PARKER**CANON RES CENT EURO **RAKEFET  SAGMAN**TIMOTHY FRA…G06F                                                             │\n",
+       "│     150424333661CANON EUROPA        * CANON EUROPA N.V.         0.000.000000QI HE  HONG**ADAM MICHAEL  BAUMBERG**ALEXANDER RALPH  LYONS                     G06T**G01B                                                       │\n",
+       "│     150434333661CANON EUROPA        * CANON EUROPA NV           0.000.000000NILESH  PATHAK**MASAMICHI  MASUDA** CANON TECHNOLOGY EURO **PATRICK WILLIAM  MO…H04B**G06T**G06F**H04M**H04N**H04Q**G03B**B41J**G01B**G06Q       │\n",
+       "│     253877650783DSM                 * DSM N.V.                  0.000.000000GABRIEL MARINUS  MEESTERS**RUDOLF CAROLUS  BARENDSE**ARIE KARST  KIES**ALEXANDE…C12N**A61K**A23L**A23J**A23K**A01H**B01J**C12R**C07D**A61P**B01D │\n",
+       "│                                                                         │\n",
+       "└───────────┴────────────┴──────────────────────┴──────────────────────────────┴──────────┴───────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────┘\n",
+       "
\n" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 2 }, { - "cell_type": "code", - "execution_count": 4, - "id": "4501165142736034", "metadata": { "ExecuteTime": { - "end_time": "2024-10-08T11:49:42.368218Z", - "start_time": "2024-10-08T11:49:42.343046Z" + "end_time": "2024-10-16T15:30:06.857603Z", + "start_time": "2024-10-16T15:30:06.840256Z" } }, - "outputs": [], + "cell_type": "code", "source": [ - "blocked = KeyBlocker(\"component\")(connected, connected)" - ] + "from mismo.block import KeyBlocker\n", + "blocked = KeyBlocker(\"label_true\")(patents, patents)" + ], + "id": "6fab260ed83951a5", + "outputs": [], + "execution_count": 3 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "A comparison table of these string similarity measures can be generated using `mismo.eda.string_comparator_scores`", + "id": "2e47f7105bcd7fe9" }, { "cell_type": "code", - "execution_count": 5, "id": "3ba075261fce1d8c", "metadata": { "ExecuteTime": { - "end_time": "2024-10-08T11:49:42.671277Z", - "start_time": "2024-10-08T11:49:42.568748Z" + "end_time": "2024-10-16T15:30:07.088938Z", + "start_time": "2024-10-16T15:30:07.025850Z" } }, - "outputs": [], "source": [ - "scores = string_comparator_scores(blocked.limit(20), \"surname_l\", \"surname_r\")" - ] + "from mismo.eda import string_comparator_score_chart, string_comparator_scores\n", + "scores = string_comparator_scores(blocked.limit(20), \"name_l\", \"name_r\")" + ], + "outputs": [], + "execution_count": 4 }, { "cell_type": "code", - "execution_count": 6, "id": "ecd02caad29c4a7c", "metadata": { "ExecuteTime": { - "end_time": "2024-10-08T11:49:42.861466Z", - "start_time": "2024-10-08T11:49:42.859012Z" + "end_time": "2024-10-16T15:30:07.109820Z", + "start_time": "2024-10-16T15:30:07.094108Z" } }, - "outputs": [], - "source": [] + "source": "scores", + "outputs": [ + { + "data": { + "text/plain": [ + 
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001B[1m \u001B[0m\u001B[1mstring1\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mstring2\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaro_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaro_winkler_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaccard_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlevenshtein_ratio\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mdamerau_levenshtein_ratio\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlevenshtein_distance\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mdamerau_levenshtein_distance\u001B[0m\u001B[1m \u001B[0m┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ \u001B[2mstring\u001B[0m │ \u001B[2mstring\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mint64\u001B[0m │ \u001B[2mint64\u001B[0m │\n", + "├─────────────────────────────────┼────────────────────────────────┼─────────────────┼─────────────────────────┼────────────────────┼───────────────────┼───────────────────────────┼──────────────────────┼──────────────────────────────┤\n", + "│ \u001B[32m* AKZO NOBEL N.V. 
\u001B[0m │ \u001B[32m* AKZO NOBEL NV \u001B[0m │ \u001B[1;36m0.960784\u001B[0m │ \u001B[1;36m0.976471\u001B[0m │ \u001B[1;36m0.916667\u001B[0m │ \u001B[1;36m0.882353\u001B[0m │ \u001B[1;36m0.882353\u001B[0m │ \u001B[1;36m2\u001B[0m │ \u001B[1;36m2\u001B[0m │\n", + "│ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[32m* CANON EUROPA N.V \u001B[0m │ \u001B[32m* CANON EUROPA N.V. \u001B[0m │ \u001B[1;36m0.982456\u001B[0m │ \u001B[1;36m0.989474\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.947368\u001B[0m │ \u001B[1;36m0.947368\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* CANON EUROPA N.V \u001B[0m │ \u001B[32m* CANON EUROPA NV \u001B[0m │ \u001B[1;36m0.981481\u001B[0m │ \u001B[1;36m0.988889\u001B[0m │ \u001B[1;36m0.916667\u001B[0m │ \u001B[1;36m0.944444\u001B[0m │ \u001B[1;36m0.944444\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* DSM N.V. \u001B[0m │ \u001B[32mDSM N.V. 
\u001B[0m │ \u001B[1;36m0.850000\u001B[0m │ \u001B[1;36m0.850000\u001B[0m │ \u001B[1;36m0.875000\u001B[0m │ \u001B[1;36m0.800000\u001B[0m │ \u001B[1;36m0.800000\u001B[0m │ \u001B[1;36m2\u001B[0m │ \u001B[1;36m2\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES B V\u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m0.989247\u001B[0m │ \u001B[1;36m0.993548\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES B V\u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m0.989247\u001B[0m │ \u001B[1;36m0.993548\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV \u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV \u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │\n", + "└─────────────────────────────────┴────────────────────────────────┴─────────────────┴─────────────────────────┴────────────────────┴───────────────────┴───────────────────────────┴──────────────────────┴──────────────────────────────┘" + ], + "text/html": [ + "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃ string1                          string2                         jaro_similarity  jaro_winkler_similarity  jaccard_similarity  levenshtein_ratio  damerau_levenshtein_ratio  levenshtein_distance  damerau_levenshtein_distance ┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
+       "│ stringstringfloat64float64float64float64float64int64int64                        │\n",
+       "├─────────────────────────────────┼────────────────────────────────┼─────────────────┼─────────────────────────┼────────────────────┼───────────────────┼───────────────────────────┼──────────────────────┼──────────────────────────────┤\n",
+       "│ * AKZO NOBEL N.V.              * AKZO NOBEL NV               0.9607840.9764710.9166670.8823530.88235322 │\n",
+       "│ * ALCATEL N.V.                 * ALCATEL N.V.                1.0000001.0000001.0000001.0000001.00000000 │\n",
+       "│ * ALCATEL N.V.                 * ALCATEL N.V.                1.0000001.0000001.0000001.0000001.00000000 │\n",
+       "│ * CANON EUROPA N.V             * CANON EUROPA N.V.           0.9824560.9894741.0000000.9473680.94736811 │\n",
+       "│ * CANON EUROPA N.V             * CANON EUROPA NV             0.9814810.9888890.9166670.9444440.94444411 │\n",
+       "│ * DSM N.V.                     DSM N.V.                      0.8500000.8500000.8750000.8000000.80000022 │\n",
+       "│ * HUNTER DOUGLAS INDUSTRIES B V* HUNTER DOUGLAS INDUSTRIES BV0.9892470.9935481.0000000.9677420.96774211 │\n",
+       "│ * HUNTER DOUGLAS INDUSTRIES B V* HUNTER DOUGLAS INDUSTRIES BV0.9892470.9935481.0000000.9677420.96774211 │\n",
+       "│ * HUNTER DOUGLAS INDUSTRIES BV * HUNTER DOUGLAS INDUSTRIES BV1.0000001.0000001.0000001.0000001.00000000 │\n",
+       "│ * HUNTER DOUGLAS INDUSTRIES BV * HUNTER DOUGLAS INDUSTRIES BV1.0000001.0000001.0000001.0000001.00000000 │\n",
+       "│  │\n",
+       "└─────────────────────────────────┴────────────────────────────────┴─────────────────┴─────────────────────────┴────────────────────┴───────────────────┴───────────────────────────┴──────────────────────┴──────────────────────────────┘\n",
+       "
\n" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 5 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "These can be visually represented using `mismo.eda.string_comparator_score_chart` which plots a heatmap of the similarity and distance measures.", + "id": "b7f45d3951818274" }, { - "cell_type": "code", - "execution_count": 7, - "id": "e384caf06edc8179", "metadata": { "ExecuteTime": { - "end_time": "2024-10-08T11:49:43.169968Z", - "start_time": "2024-10-08T11:49:43.065416Z" + "end_time": "2024-10-16T15:30:07.323565Z", + "start_time": "2024-10-16T15:30:07.222961Z" } }, + "cell_type": "code", + "source": [ + "chart = string_comparator_score_chart(blocked.limit(20), \"name_l\", \"name_r\")\n", + "chart" + ], + "id": "248d795b43d8e704", "outputs": [ { "data": { "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.HConcatChart(...)" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], - "source": [ - "string_comparator_score_chart(blocked.limit(20), \"surname_l\", \"surname_r\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "929f6463fcb7d6af", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-08T11:49:43.368034Z", - "start_time": "2024-10-08T11:49:43.366663Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f46b91dd4c33eaa0", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-08T11:49:43.548332Z", - "start_time": "2024-10-08T11:49:43.546614Z" - } - }, - "outputs": [], - "source": [] + "execution_count": 6 } ], "metadata": { From 73401c4e9be1b346d381ac0f12ba680ba0adb4ca Mon Sep 17 00:00:00 2001 From: jstammers Date: Wed, 16 Oct 2024 22:16:22 +0100 Subject: [PATCH 3/4] refactor levenshtein to builtin --- mismo/text/_similarity.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/mismo/text/_similarity.py b/mismo/text/_similarity.py index 01bf392b..abfda27b 100644 --- a/mismo/text/_similarity.py +++ b/mismo/text/_similarity.py @@ -47,14 +47,6 @@ def damerau_levenshtein(a: str, b: str) -> int: """ -@ibis.udf.scalar.builtin -def levenshtein(a: str, b: str) -> int: - """ - The number of adds, deletes and substitutions to get from `a` to `b`. - - """ - - def levenshtein_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValue: """The levenshtein distance between two strings, normalized to be between 0 and 1. 
@@ -94,7 +86,7 @@ def levenshtein_ratio(s1: ir.StringValue, s2: ir.StringValue) -> ir.FloatingValu >>> levenshtein_ratio("", "").execute() np.float64(nan) """ - return _dist_ratio(s1, s2, levenshtein) + return _dist_ratio(s1, s2, lambda a, b: a.levenshtein(b)) def damerau_levenshtein_ratio( From dde80ee52f9a6840847dc19357193bf62ff115a5 Mon Sep 17 00:00:00 2001 From: jstammers Date: Wed, 16 Oct 2024 22:16:33 +0100 Subject: [PATCH 4/4] notebook formatting --- docs/examples/string_comparisons.ipynb | 183 ++++++++----------------- 1 file changed, 60 insertions(+), 123 deletions(-) diff --git a/docs/examples/string_comparisons.ipynb b/docs/examples/string_comparisons.ipynb index a80432de..849709c5 100644 --- a/docs/examples/string_comparisons.ipynb +++ b/docs/examples/string_comparisons.ipynb @@ -1,35 +1,29 @@ { "cells": [ { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "3005879cf128faeb", + "metadata": {}, + "outputs": [], "source": [ "# uncomment and run if mismo is not installed\n", "# %pip install -q git+https://github.com/NickCrews/mismo@main" - ], - "id": "3005879cf128faeb" + ] }, { + "metadata": {}, "cell_type": "code", - "id": "b429d6cb7344eb9b", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-16T15:30:05.585511Z", - "start_time": "2024-10-16T15:30:05.438305Z" - } - }, + "outputs": [], + "execution_count": null, "source": [ "from __future__ import annotations\n", "\n", "import ibis\n", "\n", - "\n", - "ibis.options.interactive = True\n" + "ibis.options.interactive = True" ], - "outputs": [], - "execution_count": 1 + "id": "904960f605570461" }, { "metadata": {}, @@ -45,118 +39,60 @@ "- `Levenshtein` - a measure of the distance between two strings based on the number of deletions, insertions and substitutions.\n", "- `Damerau-Levenshtein` - an extension of `Levenshtein` that includes transpositions." 
], - "id": "cd07440b9757cf54" + "id": "d4d56f4a13a52b45" }, { "metadata": {}, "cell_type": "markdown", "source": "Let's explore how these work in practice using the patents dataset. We will generate pairs by blocking on the `label_true` column", - "id": "76c63dd5bd247ae7" + "id": "bc520900a0823dd4" }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-16T15:30:06.733211Z", - "start_time": "2024-10-16T15:30:05.602814Z" - } - }, + "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ "from mismo.playdata import load_patents\n", + "\n", "patents = load_patents()\n", "patents" ], - "id": "7c8477cf22e9d399", - "outputs": [ - { - "data": { - "text/plain": [ - "┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", - "┃\u001B[1m \u001B[0m\u001B[1mrecord_id\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlabel_true\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mname_true\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mname\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlatitude\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlongitude\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mcoauthors\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mclasses\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\n", - "┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", - "│ \u001B[2mint64\u001B[0m │ \u001B[2mint64\u001B[0m │ \u001B[2mstring\u001B[0m │ \u001B[2mstring\u001B[0m │ \u001B[2mfloat64\u001B[0m │ 
\u001B[2mfloat64\u001B[0m │ \u001B[2mstring\u001B[0m │ \u001B[2mstring\u001B[0m │\n", - "├───────────┼────────────┼──────────────────────┼──────────────────────────────┼──────────┼───────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────┤\n", - "│ \u001B[1;36m2909\u001B[0m │ \u001B[1;36m402600\u001B[0m │ \u001B[32mAGILENT TECHNOLOGIES\u001B[0m │ \u001B[32m* AGILENT TECHNOLOGIES, INC.\u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mKONINK PHILIPS ELECTRONICS N V**DAVID E SNYDER**THOMAS D LYSTER \u001B[0m │ \u001B[32mA61N**A61B \u001B[0m │\n", - "│ \u001B[1;36m3574\u001B[0m │ \u001B[1;36m569309\u001B[0m │ \u001B[32mAKZO NOBEL \u001B[0m │ \u001B[32m* AKZO NOBEL N.V. \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mTSJERK HOEKSTRA**ANDRESS K JOHNSON**TERESA MARIE CHERON**ALBERTO SLIKTA**JA…\u001B[0m │ \u001B[32mG01N**B01L**C11D**G02F**F16L \u001B[0m │\n", - "│ \u001B[1;36m3575\u001B[0m │ \u001B[1;36m569309\u001B[0m │ \u001B[32mAKZO NOBEL \u001B[0m │ \u001B[32m* AKZO NOBEL NV \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mWILLIAM JOHN ERNEST PARR**HANS OSKARSSON**MARTIN HELLSTEN**KORNELIS OVERKEM…\u001B[0m │ \u001B[32mC09K**F17D**B01F**C23F \u001B[0m │\n", - "│ \u001B[1;36m3779\u001B[0m │ \u001B[1;36m656303\u001B[0m │ \u001B[32mALCATEL \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m52.35\u001B[0m │ \u001B[1;36m4.916667\u001B[0m │ \u001B[32mGUENTER KOCHSMEIER**ZBIGNIEW WIEGOLASKI**EVAN JOHN STANBURY**PETER GRANT JE…\u001B[0m │ \u001B[32mG02B**G04G**H02G**G06F \u001B[0m │\n", - "│ \u001B[1;36m3780\u001B[0m │ \u001B[1;36m656303\u001B[0m │ \u001B[32mALCATEL \u001B[0m │ \u001B[32m* ALCATEL N.V. 
\u001B[0m │ \u001B[1;36m52.35\u001B[0m │ \u001B[1;36m4.916667\u001B[0m │ \u001B[32mZILAN MANFRED**JOSIANE RAMOS**DUANE LYNN MORTENSEN**CHRISTIAN LE SERGENT \u001B[0m │ \u001B[32mH03G**B05D**H04L**H04B**C03B**C03C**G02B**H01B \u001B[0m │\n", - "│ \u001B[1;36m3782\u001B[0m │ \u001B[1;36m656303\u001B[0m │ \u001B[32mALCATEL \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mOLIVIER AUDOUIN**MICHEL SOTOM**JEAN MICHEL GABRIAGUES \u001B[0m │ \u001B[32mH04B**H01S**H04J \u001B[0m │\n", - "│ \u001B[1;36m15041\u001B[0m │ \u001B[1;36m4333661\u001B[0m │ \u001B[32mCANON EUROPA \u001B[0m │ \u001B[32m* CANON EUROPA N.V \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mLEE RICKLER**SIMON PARKER**CANON RES CENT EURO **RAKEFET SAGMAN**TIMOTHY FRA…\u001B[0m │ \u001B[32mG06F \u001B[0m │\n", - "│ \u001B[1;36m15042\u001B[0m │ \u001B[1;36m4333661\u001B[0m │ \u001B[32mCANON EUROPA \u001B[0m │ \u001B[32m* CANON EUROPA N.V. \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mQI HE HONG**ADAM MICHAEL BAUMBERG**ALEXANDER RALPH LYONS \u001B[0m │ \u001B[32mG06T**G01B \u001B[0m │\n", - "│ \u001B[1;36m15043\u001B[0m │ \u001B[1;36m4333661\u001B[0m │ \u001B[32mCANON EUROPA \u001B[0m │ \u001B[32m* CANON EUROPA NV \u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mNILESH PATHAK**MASAMICHI MASUDA** CANON TECHNOLOGY EURO **PATRICK WILLIAM MO…\u001B[0m │ \u001B[32mH04B**G06T**G06F**H04M**H04N**H04Q**G03B**B41J**G01B**G06Q \u001B[0m │\n", - "│ \u001B[1;36m25387\u001B[0m │ \u001B[1;36m7650783\u001B[0m │ \u001B[32mDSM \u001B[0m │ \u001B[32m* DSM N.V. 
\u001B[0m │ \u001B[1;36m0.00\u001B[0m │ \u001B[1;36m0.000000\u001B[0m │ \u001B[32mGABRIEL MARINUS MEESTERS**RUDOLF CAROLUS BARENDSE**ARIE KARST KIES**ALEXANDE…\u001B[0m │ \u001B[32mC12N**A61K**A23L**A23J**A23K**A01H**B01J**C12R**C07D**A61P**B01D\u001B[0m │\n", - "│ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │\n", - "└───────────┴────────────┴──────────────────────┴──────────────────────────────┴──────────┴───────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────┘" - ], - "text/html": [ - "
┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
-       "┃ record_id  label_true  name_true             name                          latitude  longitude  coauthors                                                                         classes                                                          ┃\n",
-       "┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
-       "│ int64int64stringstringfloat64float64stringstring                                                           │\n",
-       "├───────────┼────────────┼──────────────────────┼──────────────────────────────┼──────────┼───────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────┤\n",
-       "│      2909402600AGILENT TECHNOLOGIES* AGILENT TECHNOLOGIES, INC.0.000.000000KONINK PHILIPS ELECTRONICS N V**DAVID E  SNYDER**THOMAS D  LYSTER               A61N**A61B                                                       │\n",
-       "│      3574569309AKZO NOBEL          * AKZO NOBEL N.V.           0.000.000000TSJERK  HOEKSTRA**ANDRESS K  JOHNSON**TERESA MARIE  CHERON**ALBERTO  SLIKTA**JA…G01N**B01L**C11D**G02F**F16L                                     │\n",
-       "│      3575569309AKZO NOBEL          * AKZO NOBEL NV             0.000.000000WILLIAM JOHN ERNEST  PARR**HANS  OSKARSSON**MARTIN  HELLSTEN**KORNELIS  OVERKEM…C09K**F17D**B01F**C23F                                           │\n",
-       "│      3779656303ALCATEL             * ALCATEL N.V.              52.354.916667GUENTER  KOCHSMEIER**ZBIGNIEW  WIEGOLASKI**EVAN JOHN  STANBURY**PETER GRANT  JE…G02B**G04G**H02G**G06F                                           │\n",
-       "│      3780656303ALCATEL             * ALCATEL N.V.              52.354.916667ZILAN  MANFRED**JOSIANE  RAMOS**DUANE LYNN  MORTENSEN**CHRISTIAN  LE SERGENT    H03G**B05D**H04L**H04B**C03B**C03C**G02B**H01B                   │\n",
-       "│      3782656303ALCATEL             * ALCATEL N.V.              0.000.000000OLIVIER  AUDOUIN**MICHEL  SOTOM**JEAN MICHEL  GABRIAGUES                        H04B**H01S**H04J                                                 │\n",
-       "│     150414333661CANON EUROPA        * CANON EUROPA N.V          0.000.000000LEE  RICKLER**SIMON  PARKER**CANON RES CENT EURO **RAKEFET  SAGMAN**TIMOTHY FRA…G06F                                                             │\n",
-       "│     150424333661CANON EUROPA        * CANON EUROPA N.V.         0.000.000000QI HE  HONG**ADAM MICHAEL  BAUMBERG**ALEXANDER RALPH  LYONS                     G06T**G01B                                                       │\n",
-       "│     150434333661CANON EUROPA        * CANON EUROPA NV           0.000.000000NILESH  PATHAK**MASAMICHI  MASUDA** CANON TECHNOLOGY EURO **PATRICK WILLIAM  MO…H04B**G06T**G06F**H04M**H04N**H04Q**G03B**B41J**G01B**G06Q       │\n",
-       "│     253877650783DSM                 * DSM N.V.                  0.000.000000GABRIEL MARINUS  MEESTERS**RUDOLF CAROLUS  BARENDSE**ARIE KARST  KIES**ALEXANDE…C12N**A61K**A23L**A23J**A23K**A01H**B01J**C12R**C07D**A61P**B01D │\n",
-       "│                                                                         │\n",
-       "└───────────┴────────────┴──────────────────────┴──────────────────────────────┴──────────┴───────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────┘\n",
-       "
\n" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 2 + "id": "a8aa4080d43d9467" }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-16T15:30:06.857603Z", - "start_time": "2024-10-16T15:30:06.840256Z" - } - }, + "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ "from mismo.block import KeyBlocker\n", + "\n", "blocked = KeyBlocker(\"label_true\")(patents, patents)" ], - "id": "6fab260ed83951a5", - "outputs": [], - "execution_count": 3 + "id": "5088ecf3547b9af6" }, { "metadata": {}, "cell_type": "markdown", "source": "A comparison table of these string similarity measures can be generated using `mismo.eda.string_comparator_scores`", - "id": "2e47f7105bcd7fe9" + "id": "e5073d32a97b1106" }, { + "metadata": {}, "cell_type": "code", - "id": "3ba075261fce1d8c", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-16T15:30:07.088938Z", - "start_time": "2024-10-16T15:30:07.025850Z" - } - }, + "outputs": [], + "execution_count": null, "source": [ "from mismo.eda import string_comparator_score_chart, string_comparator_scores\n", + "\n", "scores = string_comparator_scores(blocked.limit(20), \"name_l\", \"name_r\")" ], - "outputs": [], - "execution_count": 4 + "id": "868123695edc81b8" }, { "cell_type": "code", + "execution_count": 5, "id": "ecd02caad29c4a7c", "metadata": { "ExecuteTime": { @@ -164,29 +100,9 @@ "start_time": "2024-10-16T15:30:07.094108Z" } }, - "source": "scores", "outputs": [ { "data": { - "text/plain": [ - "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", - "┃\u001B[1m \u001B[0m\u001B[1mstring1\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mstring2\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m 
\u001B[0m\u001B[1mjaro_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaro_winkler_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaccard_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlevenshtein_ratio\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mdamerau_levenshtein_ratio\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlevenshtein_distance\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mdamerau_levenshtein_distance\u001B[0m\u001B[1m \u001B[0m┃\n", - "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", - "│ \u001B[2mstring\u001B[0m │ \u001B[2mstring\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mint64\u001B[0m │ \u001B[2mint64\u001B[0m │\n", - "├─────────────────────────────────┼────────────────────────────────┼─────────────────┼─────────────────────────┼────────────────────┼───────────────────┼───────────────────────────┼──────────────────────┼──────────────────────────────┤\n", - "│ \u001B[32m* AKZO NOBEL N.V. \u001B[0m │ \u001B[32m* AKZO NOBEL NV \u001B[0m │ \u001B[1;36m0.960784\u001B[0m │ \u001B[1;36m0.976471\u001B[0m │ \u001B[1;36m0.916667\u001B[0m │ \u001B[1;36m0.882353\u001B[0m │ \u001B[1;36m0.882353\u001B[0m │ \u001B[1;36m2\u001B[0m │ \u001B[1;36m2\u001B[0m │\n", - "│ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", - "│ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[32m* ALCATEL N.V. 
\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", - "│ \u001B[32m* CANON EUROPA N.V \u001B[0m │ \u001B[32m* CANON EUROPA N.V. \u001B[0m │ \u001B[1;36m0.982456\u001B[0m │ \u001B[1;36m0.989474\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.947368\u001B[0m │ \u001B[1;36m0.947368\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", - "│ \u001B[32m* CANON EUROPA N.V \u001B[0m │ \u001B[32m* CANON EUROPA NV \u001B[0m │ \u001B[1;36m0.981481\u001B[0m │ \u001B[1;36m0.988889\u001B[0m │ \u001B[1;36m0.916667\u001B[0m │ \u001B[1;36m0.944444\u001B[0m │ \u001B[1;36m0.944444\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", - "│ \u001B[32m* DSM N.V. \u001B[0m │ \u001B[32mDSM N.V. \u001B[0m │ \u001B[1;36m0.850000\u001B[0m │ \u001B[1;36m0.850000\u001B[0m │ \u001B[1;36m0.875000\u001B[0m │ \u001B[1;36m0.800000\u001B[0m │ \u001B[1;36m0.800000\u001B[0m │ \u001B[1;36m2\u001B[0m │ \u001B[1;36m2\u001B[0m │\n", - "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES B V\u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m0.989247\u001B[0m │ \u001B[1;36m0.993548\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", - "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES B V\u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m0.989247\u001B[0m │ \u001B[1;36m0.993548\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", - "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV \u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ 
\u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", - "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV \u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", - "│ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │\n", - "└─────────────────────────────────┴────────────────────────────────┴─────────────────┴─────────────────────────┴────────────────────┴───────────────────┴───────────────────────────┴──────────────────────┴──────────────────────────────┘" - ], "text/html": [ "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
        "┃ string1                          string2                         jaro_similarity  jaro_winkler_similarity  jaccard_similarity  levenshtein_ratio  damerau_levenshtein_ratio  levenshtein_distance  damerau_levenshtein_distance ┃\n",
@@ -206,6 +122,25 @@
        "│  │\n",
        "└─────────────────────────────────┴────────────────────────────────┴─────────────────┴─────────────────────────┴────────────────────┴───────────────────┴───────────────────────────┴──────────────────────┴──────────────────────────────┘\n",
        "
\n" + ], + "text/plain": [ + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001B[1m \u001B[0m\u001B[1mstring1\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mstring2\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaro_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaro_winkler_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaccard_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlevenshtein_ratio\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mdamerau_levenshtein_ratio\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlevenshtein_distance\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mdamerau_levenshtein_distance\u001B[0m\u001B[1m \u001B[0m┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ \u001B[2mstring\u001B[0m │ \u001B[2mstring\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mint64\u001B[0m │ \u001B[2mint64\u001B[0m │\n", + "├─────────────────────────────────┼────────────────────────────────┼─────────────────┼─────────────────────────┼────────────────────┼───────────────────┼───────────────────────────┼──────────────────────┼──────────────────────────────┤\n", + "│ \u001B[32m* AKZO NOBEL N.V. 
\u001B[0m │ \u001B[32m* AKZO NOBEL NV \u001B[0m │ \u001B[1;36m0.960784\u001B[0m │ \u001B[1;36m0.976471\u001B[0m │ \u001B[1;36m0.916667\u001B[0m │ \u001B[1;36m0.882353\u001B[0m │ \u001B[1;36m0.882353\u001B[0m │ \u001B[1;36m2\u001B[0m │ \u001B[1;36m2\u001B[0m │\n", + "│ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[32m* CANON EUROPA N.V \u001B[0m │ \u001B[32m* CANON EUROPA N.V. \u001B[0m │ \u001B[1;36m0.982456\u001B[0m │ \u001B[1;36m0.989474\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.947368\u001B[0m │ \u001B[1;36m0.947368\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* CANON EUROPA N.V \u001B[0m │ \u001B[32m* CANON EUROPA NV \u001B[0m │ \u001B[1;36m0.981481\u001B[0m │ \u001B[1;36m0.988889\u001B[0m │ \u001B[1;36m0.916667\u001B[0m │ \u001B[1;36m0.944444\u001B[0m │ \u001B[1;36m0.944444\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* DSM N.V. \u001B[0m │ \u001B[32mDSM N.V. 
\u001B[0m │ \u001B[1;36m0.850000\u001B[0m │ \u001B[1;36m0.850000\u001B[0m │ \u001B[1;36m0.875000\u001B[0m │ \u001B[1;36m0.800000\u001B[0m │ \u001B[1;36m0.800000\u001B[0m │ \u001B[1;36m2\u001B[0m │ \u001B[1;36m2\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES B V\u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m0.989247\u001B[0m │ \u001B[1;36m0.993548\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES B V\u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m0.989247\u001B[0m │ \u001B[1;36m0.993548\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV \u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV \u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │\n", + "└─────────────────────────────────┴────────────────────────────────┴─────────────────┴─────────────────────────┴────────────────────┴───────────────────┴───────────────────────────┴──────────────────────┴──────────────────────────────┘" ] }, "execution_count": 5, @@ 
-213,27 +148,26 @@ "output_type": "execute_result" } ], - "execution_count": 5 + "source": [ + "scores" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "These can be visually represented using `mismo.eda.string_comparator_score_chart` which plots a heatmap of the similarity and distance measures.", - "id": "b7f45d3951818274" + "id": "b7f45d3951818274", + "metadata": {}, + "source": "These can be visually represented using `mismo.eda.string_comparator_score_chart` which plots a heatmap of the similarity and distance measures." }, { + "cell_type": "code", + "execution_count": 6, + "id": "248d795b43d8e704", "metadata": { "ExecuteTime": { "end_time": "2024-10-16T15:30:07.323565Z", "start_time": "2024-10-16T15:30:07.222961Z" } }, - "cell_type": "code", - "source": [ - "chart = string_comparator_score_chart(blocked.limit(20), \"name_l\", \"name_r\")\n", - "chart" - ], - "id": "248d795b43d8e704", "outputs": [ { "data": { @@ -314,7 +248,10 @@ "output_type": "execute_result" } ], - "execution_count": 6 + "source": [ + "chart = string_comparator_score_chart(blocked.limit(20), \"name_l\", \"name_r\")\n", + "chart" + ] } ], "metadata": {