diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index dd993d1b..f85a090f 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -1,6 +1,7 @@ * [Home](index.md) * Examples * [Patent Deduplication](examples/patent_deduplication.ipynb) + * [String Comparison Methods](examples/string_comparisons.ipynb) * Concepts * [Goals and Alternatives](concepts/goals_and_alternatives.md) * [Fellegi-Sunter Model](concepts/fs.md) diff --git a/docs/examples/string_comparisons.ipynb b/docs/examples/string_comparisons.ipynb new file mode 100644 index 00000000..849709c5 --- /dev/null +++ b/docs/examples/string_comparisons.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3005879cf128faeb", + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment and run if mismo is not installed\n", + "# %pip install -q git+https://github.com/NickCrews/mismo@main" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from __future__ import annotations\n", + "\n", + "import ibis\n", + "\n", + "ibis.options.interactive = True" + ], + "id": "904960f605570461" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "Many real-world datasets contain errors due to causes such as manual data entry, incorrect data processing and inconsistent formatting. Therefore it's often useful to make use of string similarity measures that can quantify how close two strings are to each other by accounting for common types of string manipulations. These are defined to give a score between 0 and 1 which indicate minimal and maximal similarity respectively. 
mismo currently implements the following string similarity measures that are suitable for different use-cases:\n", + "\n", + "- `Jaro` - a measure of the similarity between two strings given the number of matching characters and transpositions and their lengths.\n", + "- `Jaro-Winkler` - a modification of the Jaro similarity that uses a prefix scale to give more favourable weightings to strings that match at the start.\n", + "- `Jaccard` - a measure of the overlap between the sets of characters in two strings (the size of the intersection divided by the size of the union).\n", + "\n", + "In addition, the following edit distance measures are defined along with equivalent similarities that are normalized using string lengths.\n", + "- `Levenshtein` - a measure of the distance between two strings based on the number of deletions, insertions and substitutions.\n", + "- `Damerau-Levenshtein` - an extension of `Levenshtein` that includes transpositions." + ], + "id": "d4d56f4a13a52b45" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Let's explore how these work in practice using the patents dataset. 
We will generate pairs by blocking on the `label_true` column", + "id": "bc520900a0823dd4" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from mismo.playdata import load_patents\n", + "\n", + "patents = load_patents()\n", + "patents" + ], + "id": "a8aa4080d43d9467" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from mismo.block import KeyBlocker\n", + "\n", + "blocked = KeyBlocker(\"label_true\")(patents, patents)" + ], + "id": "5088ecf3547b9af6" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "A comparison table of these string similarity measures can be generated using `mismo.eda.string_comparator_scores`", + "id": "e5073d32a97b1106" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from mismo.eda import string_comparator_score_chart, string_comparator_scores\n", + "\n", + "scores = string_comparator_scores(blocked.limit(20), \"name_l\", \"name_r\")" + ], + "id": "868123695edc81b8" + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ecd02caad29c4a7c", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-16T15:30:07.109820Z", + "start_time": "2024-10-16T15:30:07.094108Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃ string1 ┃ string2 ┃ jaro_similarity ┃ jaro_winkler_similarity ┃ jaccard_similarity ┃ levenshtein_ratio ┃ damerau_levenshtein_ratio ┃ levenshtein_distance ┃ damerau_levenshtein_distance ┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ string │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ int64 │ int64 │\n", + "├─────────────────────────────────┼────────────────────────────────┼─────────────────┼─────────────────────────┼────────────────────┼───────────────────┼───────────────────────────┼──────────────────────┼──────────────────────────────┤\n", + "│ * AKZO NOBEL N.V. │ * AKZO NOBEL NV │ 0.960784 │ 0.976471 │ 0.916667 │ 0.882353 │ 0.882353 │ 2 │ 2 │\n", + "│ * ALCATEL N.V. │ * ALCATEL N.V. │ 1.000000 │ 1.000000 │ 1.000000 │ 1.000000 │ 1.000000 │ 0 │ 0 │\n", + "│ * ALCATEL N.V. │ * ALCATEL N.V. │ 1.000000 │ 1.000000 │ 1.000000 │ 1.000000 │ 1.000000 │ 0 │ 0 │\n", + "│ * CANON EUROPA N.V │ * CANON EUROPA N.V. │ 0.982456 │ 0.989474 │ 1.000000 │ 0.947368 │ 0.947368 │ 1 │ 1 │\n", + "│ * CANON EUROPA N.V │ * CANON EUROPA NV │ 0.981481 │ 0.988889 │ 0.916667 │ 0.944444 │ 0.944444 │ 1 │ 1 │\n", + "│ * DSM N.V. │ DSM N.V. 
│ 0.850000 │ 0.850000 │ 0.875000 │ 0.800000 │ 0.800000 │ 2 │ 2 │\n", + "│ * HUNTER DOUGLAS INDUSTRIES B V │ * HUNTER DOUGLAS INDUSTRIES BV │ 0.989247 │ 0.993548 │ 1.000000 │ 0.967742 │ 0.967742 │ 1 │ 1 │\n", + "│ * HUNTER DOUGLAS INDUSTRIES B V │ * HUNTER DOUGLAS INDUSTRIES BV │ 0.989247 │ 0.993548 │ 1.000000 │ 0.967742 │ 0.967742 │ 1 │ 1 │\n", + "│ * HUNTER DOUGLAS INDUSTRIES BV │ * HUNTER DOUGLAS INDUSTRIES BV │ 1.000000 │ 1.000000 │ 1.000000 │ 1.000000 │ 1.000000 │ 0 │ 0 │\n", + "│ * HUNTER DOUGLAS INDUSTRIES BV │ * HUNTER DOUGLAS INDUSTRIES BV │ 1.000000 │ 1.000000 │ 1.000000 │ 1.000000 │ 1.000000 │ 0 │ 0 │\n", + "│ … │ … │ … │ … │ … │ … │ … │ … │ … │\n", + "└─────────────────────────────────┴────────────────────────────────┴─────────────────┴─────────────────────────┴────────────────────┴───────────────────┴───────────────────────────┴──────────────────────┴──────────────────────────────┘\n", + "\n" + ], + "text/plain": [ + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001B[1m \u001B[0m\u001B[1mstring1\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mstring2\u001B[0m\u001B[1m \u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaro_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaro_winkler_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mjaccard_similarity\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlevenshtein_ratio\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mdamerau_levenshtein_ratio\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mlevenshtein_distance\u001B[0m\u001B[1m \u001B[0m┃\u001B[1m \u001B[0m\u001B[1mdamerau_levenshtein_distance\u001B[0m\u001B[1m \u001B[0m┃\n", + 
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ \u001B[2mstring\u001B[0m │ \u001B[2mstring\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mfloat64\u001B[0m │ \u001B[2mint64\u001B[0m │ \u001B[2mint64\u001B[0m │\n", + "├─────────────────────────────────┼────────────────────────────────┼─────────────────┼─────────────────────────┼────────────────────┼───────────────────┼───────────────────────────┼──────────────────────┼──────────────────────────────┤\n", + "│ \u001B[32m* AKZO NOBEL N.V. \u001B[0m │ \u001B[32m* AKZO NOBEL NV \u001B[0m │ \u001B[1;36m0.960784\u001B[0m │ \u001B[1;36m0.976471\u001B[0m │ \u001B[1;36m0.916667\u001B[0m │ \u001B[1;36m0.882353\u001B[0m │ \u001B[1;36m0.882353\u001B[0m │ \u001B[1;36m2\u001B[0m │ \u001B[1;36m2\u001B[0m │\n", + "│ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[32m* ALCATEL N.V. \u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[32m* CANON EUROPA N.V \u001B[0m │ \u001B[32m* CANON EUROPA N.V. 
\u001B[0m │ \u001B[1;36m0.982456\u001B[0m │ \u001B[1;36m0.989474\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.947368\u001B[0m │ \u001B[1;36m0.947368\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* CANON EUROPA N.V \u001B[0m │ \u001B[32m* CANON EUROPA NV \u001B[0m │ \u001B[1;36m0.981481\u001B[0m │ \u001B[1;36m0.988889\u001B[0m │ \u001B[1;36m0.916667\u001B[0m │ \u001B[1;36m0.944444\u001B[0m │ \u001B[1;36m0.944444\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* DSM N.V. \u001B[0m │ \u001B[32mDSM N.V. \u001B[0m │ \u001B[1;36m0.850000\u001B[0m │ \u001B[1;36m0.850000\u001B[0m │ \u001B[1;36m0.875000\u001B[0m │ \u001B[1;36m0.800000\u001B[0m │ \u001B[1;36m0.800000\u001B[0m │ \u001B[1;36m2\u001B[0m │ \u001B[1;36m2\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES B V\u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m0.989247\u001B[0m │ \u001B[1;36m0.993548\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES B V\u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m0.989247\u001B[0m │ \u001B[1;36m0.993548\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m0.967742\u001B[0m │ \u001B[1;36m1\u001B[0m │ \u001B[1;36m1\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV \u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV \u001B[0m │ \u001B[32m* HUNTER DOUGLAS INDUSTRIES BV\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ 
\u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m1.000000\u001B[0m │ \u001B[1;36m0\u001B[0m │ \u001B[1;36m0\u001B[0m │\n", + "│ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │ \u001B[2m…\u001B[0m │\n", + "└─────────────────────────────────┴────────────────────────────────┴─────────────────┴─────────────────────────┴────────────────────┴───────────────────┴───────────────────────────┴──────────────────────┴──────────────────────────────┘" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores" + ] + }, + { + "cell_type": "markdown", + "id": "b7f45d3951818274", + "metadata": {}, + "source": "These can be visually represented using `mismo.eda.string_comparator_score_chart` which plots a heatmap of the similarity and distance measures." + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "248d795b43d8e704", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-16T15:30:07.323565Z", + "start_time": "2024-10-16T15:30:07.222961Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chart = string_comparator_score_chart(blocked.limit(20), \"name_l\", \"name_r\")\n", + "chart" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/mismo/eda/__init__.py b/mismo/eda/__init__.py index a1efe18e..87a54d57 100644 
--- a/mismo/eda/__init__.py
+++ b/mismo/eda/__init__.py
@@ -4,3 +4,7 @@
 
 from mismo.eda._plot import distribution_chart as distribution_chart
 from mismo.eda._plot import distribution_dashboard as distribution_dashboard
+from mismo.eda._string import (
+    string_comparator_score_chart as string_comparator_score_chart,
+)
+from mismo.eda._string import string_comparator_scores as string_comparator_scores
diff --git a/mismo/eda/_string.py b/mismo/eda/_string.py
new file mode 100644
index 00000000..17bdad7b
--- /dev/null
+++ b/mismo/eda/_string.py
@@ -0,0 +1,189 @@
+from __future__ import annotations
+
+import altair as alt
+from ibis import _
+from ibis import selectors as s
+from ibis.expr import types as ir
+
+from mismo.text import (
+    damerau_levenshtein,
+    damerau_levenshtein_ratio,
+    jaccard,
+    jaro_similarity,
+    jaro_winkler_similarity,
+    levenshtein,
+    levenshtein_ratio,
+)
+
+
+def string_comparator_scores(table: ir.Table, col1: str, col2: str) -> ir.Table:
+    """Create a table of string comparison measures between two columns.
+
+    This calculates the following similarity measures which range between 0 and 1:
+    - The Jaro similarity
+    - The Jaro-Winkler similarity
+    - The Jaccard similarity
+    - The Levenshtein ratio
+    - The Damerau-Levenshtein ratio
+
+    as well as the following edit distances:
+    - The Levenshtein distance
+    - The Damerau-Levenshtein distance
+
+    Parameters
+    ----------
+    table : ir.Table
+        An ibis table containing string columns.
+    col1 : str
+        The name of the first column.
+    col2 : str
+        The name of the second column.
+
+    Returns
+    -------
+    ir.Table
+        The two input columns, renamed to `string1` and `string2`, followed by
+        one column per comparison measure. The result is cached.
+
+    Examples
+    --------
+    >>> import ibis
+    >>> from mismo.eda import string_comparator_scores
+    >>> ibis.options.interactive = True
+    >>> table = ibis.memtable({"string1": ["foo", "bar", "fizz"],
+    ...                        "string2": ["foo", "bam", "fizz buzz"]})
+    >>> string_comparator_scores(table, col1="string1", col2="string2")
+    ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━┓
+    ┃ string1 ┃ string2   ┃ jaro_similarity ┃ jaro_winkler_similarity ┃ … ┃
+    ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━┩
+    │ string  │ string    │ float64         │ float64                 │ … │
+    ├─────────┼───────────┼─────────────────┼─────────────────────────┼───┤
+    │ foo     │ foo       │        1.000000 │                1.000000 │ … │
+    │ bar     │ bam       │        0.777778 │                0.822222 │ … │
+    │ fizz    │ fizz buzz │        0.814815 │                0.888889 │ … │
+    └─────────┴───────────┴─────────────────┴─────────────────────────┴───┘
+    """
+    # Rename the inputs so that the output schema is independent of the
+    # caller's column names.
+    comp_table = (
+        table.select(_[col1].name("string1"), _[col2].name("string2"))
+        .mutate(
+            jaro_similarity=jaro_similarity(_.string1, _.string2),
+            jaro_winkler_similarity=jaro_winkler_similarity(_.string1, _.string2),
+            jaccard_similarity=jaccard(_.string1, _.string2),
+            levenshtein_ratio=levenshtein_ratio(_.string1, _.string2),
+            damerau_levenshtein_ratio=damerau_levenshtein_ratio(_.string1, _.string2),
+            levenshtein_distance=levenshtein(_.string1, _.string2),
+            damerau_levenshtein_distance=damerau_levenshtein(_.string1, _.string2),
+        )
+        .cache()
+    )
+
+    return comp_table
+
+
+def string_comparator_score_chart(
+    table: ir.Table, col1: str, col2: str
+) -> alt.HConcatChart:
+    """Create a heatmap of string comparison measures between two columns.
+
+    The similarity measures (bounded between 0 and 1) and the edit distances
+    (integer counts) are drawn as two side-by-side
+    heatmaps with independent color scales.
+
+    Parameters
+    ----------
+    table : ir.Table
+        An ibis table containing string columns.
+    col1 : str
+        The name of the first column.
+    col2 : str
+        The name of the second column.
+
+    Returns
+    -------
+    alt.HConcatChart
+        The similarity heatmap and the distance heatmap, concatenated horizontally.
+
+    Examples
+    --------
+    >>> import ibis
+    >>> from mismo.eda import string_comparator_score_chart
+    >>> table = ibis.memtable({"string1": ["foo", "bar", "fizz"],
+    ...                        "string2": ["foo", "bam", "fizz buzz"]})
+    >>> string_comparator_score_chart(table, col1="string1", col2="string2")
+    alt.HConcatChart(...)
+    """
+
+    comp_table = string_comparator_scores(table, col1, col2).mutate(
+        strings_to_compare=_.string1.concat(", ", _.string2)
+    )
+    similarity_records = (
+        comp_table.select(
+            "strings_to_compare", s.contains("similarity") | s.contains("ratio")
+        )
+        .pivot_longer(
+            ~s.cols("strings_to_compare"), names_to="comparator", values_to="value"
+        )
+        .mutate(
+            comparator=_.comparator.re_replace("(_similarity)|(_ratio)", ""),
+        )
+    )
+    distance_records = (
+        comp_table.select("strings_to_compare", s.contains("distance"))
+        .pivot_longer(
+            ~s.cols("strings_to_compare"), names_to="comparator", values_to="value"
+        )
+        .mutate(
+            comparator=_.comparator.re_replace("_distance", ""),
+        )
+    )
+    base = (
+        alt.Chart(similarity_records, title="Similarity")
+        .mark_rect()
+        .encode(
+            y=alt.Y(
+                "strings_to_compare:O",
+                title="String comparison",
+            ),
+            x=alt.X("comparator:O", title=None),
+            color=alt.Color("value:Q", legend=None, scale=alt.Scale(domain=(0, 1))),
+        )
+    )
+
+    text = base.mark_text().encode(
+        alt.Text("value:Q", format=".2f"),
+        color=alt.value("black"),
+    )
+
+    distance_base = (
+        alt.Chart(distance_records, title="Distance")
+        .mark_rect()
+        .encode(
+            y=alt.Y("strings_to_compare:O", axis=None),
+            x=alt.X("comparator:O", title=None),
+            color=alt.Color("value:Q", legend=None).scale(
+                scheme="yelloworangered", reverse=True
+            ),
+        )
+    )
+
+    distance_text = distance_base.mark_text().encode(
+        alt.Text("value:Q", format=".2f"),
+        color=alt.value("black"),
+    )
+    chart = alt.hconcat(
+        base + text,
+        distance_base + distance_text,
+        title=alt.Title(text="Heatmaps of string comparison metrics", anchor="middle"),
+        config=alt.Config(
+            view=alt.ViewConfig(discreteHeight={"step": 30}, discreteWidth={"step": 40})
+        ),
+    ).resolve_scale(color="independent", size="independent")
+    return chart
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
diff --git a/mismo/text/__init__.py b/mismo/text/__init__.py
index 9f1313e5..5cf33a58 100644
--- a/mismo/text/__init__.py
+++ b/mismo/text/__init__.py
@@ -3,10 +3,15 @@
 from __future__ import annotations
 
 from mismo.text._features import ngrams as ngrams
+from mismo.text._features import tokenize as tokenize
 from mismo.text._similarity import damerau_levenshtein as damerau_levenshtein
 from mismo.text._similarity import (
     damerau_levenshtein_ratio as damerau_levenshtein_ratio,
 )
 from mismo.text._similarity import double_metaphone as double_metaphone
+from mismo.text._similarity import jaccard as jaccard
+from mismo.text._similarity import jaro_similarity as jaro_similarity
+from mismo.text._similarity import jaro_winkler_similarity as jaro_winkler_similarity
+from mismo.text._similarity import levenshtein as levenshtein
 from mismo.text._similarity import levenshtein_ratio as levenshtein_ratio
 from mismo.text._strings import norm_whitespace as norm_whitespace
diff --git a/mismo/text/_similarity.py b/mismo/text/_similarity.py
index b45e10e4..abfda27b 100644
--- a/mismo/text/_similarity.py
+++ b/mismo/text/_similarity.py
@@ -107,3 +107,56 @@ def _dist_ratio(s1, s2, dist):
     s2 = _util.ensure_ibis(s2, "string")
     lenmax = ibis.greatest(s1.length(), s2.length())
     return (lenmax - dist(s1, s2)) / lenmax
+
+
+# TODO: The builtin UDFs below are resolved to functions of the same name in
+# the backend, so they aren't portable between backends
+@ibis.udf.scalar.builtin
+def jaro_similarity(s1: str, s2: str) -> float:
+    """The Jaro similarity between `s1` and `s2`.
+
+    This is defined as
+    `sj = 1/3 * (m/l_1 + m/l_2 + (m-t)/m)`
+
+    where `m` is the number of matching characters between `s1` and `s2`, `t` is
+    the number of transpositions between `s1` and `s2`, and `l_1` and `l_2` are
+    the lengths of `s1` and `s2` respectively.
+    """
+
+
+@ibis.udf.scalar.builtin
+def jaro_winkler_similarity(s1: str, s2: str) -> float:
+    """The Jaro-Winkler similarity between `s1` and `s2`.
+
+    The Jaro-Winkler similarity is a variant of the Jaro similarity that
+    boosts the score of strings that share a common prefix, i.e. it
+    places a higher importance on the prefix.
+
+    It is defined as `sjw = sj + l * p * (1-sj)`
+    where `sj` is the Jaro similarity, `l` is the length of the common prefix (up to a
+    maximum of 4) and `p` is a constant scaling factor (up to a maximum of 0.25, but
+    typically set to 0.1).
+    """
+
+
+@ibis.udf.scalar.builtin
+def jaccard(s1: str, s2: str) -> float:
+    """The Jaccard similarity between `s1` and `s2`.
+
+    This is the backend's builtin `jaccard` function. On DuckDB it compares the
+    sets of characters in the two strings (size of the intersection divided by
+    size of the union), e.g. `jaccard("foo bar", "foo")` is 2/6.
+
+    NOTE: this differs from the token-based similarity
+
+    ```python
+    from mismo.sets import jaccard as jaccard_set
+    from mismo.text import tokenize
+
+    t1 = tokenize(s1)
+    t2 = tokenize(s2)
+    jaccard_set(t1, t2)
+    ```
+
+    whenever the characters and the tokens of the two strings overlap differently.
+    """
diff --git a/mismo/text/tests/test_similarity.py b/mismo/text/tests/test_similarity.py
index bd37185b..a07fa902 100644
--- a/mismo/text/tests/test_similarity.py
+++ b/mismo/text/tests/test_similarity.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 
-from mismo import text
+from mismo import sets, text
 
 
 @pytest.mark.parametrize(
@@ -38,3 +38,44 @@ def test_levenshtein_ratio(string1, string2, expected):
         assert np.isnan(result)
     else:
         assert expected == result
+
+
+@pytest.mark.parametrize(
+    "string1,string2,expected",
+    [
+        ("foo", "foo", 1),
+        ("foo", "food", 0.942),
+        ("bar", "bim", 0.5555),
+        ("", "", 0),
+        (None, None, np.nan),
+    ],
+)
+def test_jaro_winkler_similarity(string1, string2, expected):
+    result = text.jaro_winkler_similarity(string1, string2).execute()
+    if np.isnan(expected):
+        assert np.isnan(result)
+    else:
+        assert result == pytest.approx(expected, rel=0.001)
+
+
+@pytest.mark.parametrize(
+    "string1,string2,expected",
+    [
+        ("foo", "foo", 1),
+        pytest.param(
+            "foo bar",
+            "foo",
+            0.3333,
+            marks=pytest.mark.xfail(reason="text.jaccard is character-based"),
+        ),
+        ("foo bar", "bar foo", 1),
+    ],
+)
+def test_jaccard_string_similarity(string1, string2, expected):
+    """Check `text.jaccard` against the token-set jaccard and a known value."""
+    result = text.jaccard(string1, string2).execute()
+    tokens1 = text.tokenize(string1)
+    tokens2 = text.tokenize(string2)
+    set_result = sets.jaccard(tokens1, tokens2).execute()
+    assert result == pytest.approx(set_result, rel=0.001)
+    assert result == pytest.approx(expected, rel=0.001)