Skip to content

Commit 74945f8

Browse files
author
Maarten Grootendorst
authored
Improved String Grouping (#6)
* Improved clustering of strings * Update workflow * Additional tests * Update Flair dependency
1 parent bbcfe3d commit 74945f8

File tree

6 files changed

+52
-19
lines changed

6 files changed

+52
-19
lines changed

.github/workflows/testing.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
runs-on: ubuntu-latest
1515
strategy:
1616
matrix:
17-
python-version: [3.6, 3.7, 3.8]
17+
python-version: [3.7, 3.8]
1818

1919
steps:
2020
- uses: actions/checkout@v2

docs/releases.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
v0.2.2
2+
- Update grouping so that all strings are grouped only when a list of strings is compared with itself (`group_all_strings=True`)
3+
14
v0.2.0
25
- Update naming convention matcher --> model
36
- Update documentation

polyfuzz/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
from .polyfuzz import PolyFuzz
2-
__version__ = "0.2.1"
2+
__version__ = "0.2.2"

polyfuzz/polyfuzz.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -189,13 +189,17 @@ def visualize_precision_recall(self,
189189

190190
def group(self,
191191
model: Union[str, BaseMatcher] = None,
192-
link_min_similarity: float = 0.75):
192+
link_min_similarity: float = 0.75,
193+
group_all_strings: bool = False):
193194
""" From the matches, group the `To` matches together using single linkage
194195
195196
Arguments:
196197
model: you can choose one of the models in `polyfuzz.models` to be used as a grouper
197198
link_min_similarity: the minimum similarity between strings before they are grouped
198199
in a single linkage fashion
200+
group_all_strings: if you want to compare a list of strings with itself and then cluster
201+
those strings, set this to True. Otherwise, only the strings that
202+
were mapped To are clustered.
199203
200204
Updates:
201205
self.matches: Adds a column `Group` that is the grouped version of the `To` column
@@ -223,13 +227,9 @@ def group(self,
223227
elif not model:
224228
model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
225229

230+
# Group per model
226231
for name, match in self.matches.items():
227-
strings = list(self.matches[name].To.dropna().unique())
228-
matches = model.match(strings, strings)
229-
clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity)
230-
self._map_groups(name, cluster_name_map)
231-
self.clusters[name] = clusters
232-
self.cluster_mappings[name] = cluster_id_map
232+
self._create_groups(name, model, link_min_similarity, group_all_strings)
233233

234234
def get_ids(self) -> Union[str, List[str], None]:
235235
""" Get all model ids for easier access """
@@ -285,17 +285,33 @@ def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]:
285285

286286
return self.cluster_mappings
287287

288-
def _map_groups(self, name: str, cluster_name_map: Mapping[str, str]):
289-
""" Map the 'to' list to groups """
288+
def _create_groups(self,
289+
name: str,
290+
model: BaseMatcher,
291+
link_min_similarity: float,
292+
group_all_strings: bool):
293+
""" Create groups based on either the To mappings if you compare two different lists of strings, or
294+
the From mappings if you compare a list of strings with itself (set group_all_strings to True)
295+
"""
296+
297+
if group_all_strings:
298+
strings = list(self.matches[name].From.dropna().unique())
299+
else:
300+
strings = list(self.matches[name].To.dropna().unique())
301+
302+
# Create clusters
303+
matches = model.match(strings, strings)
304+
clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity)
305+
306+
# Map the `to` list to groups
290307
df = self.matches[name]
291308
df["Group"] = df['To'].map(cluster_name_map).fillna(df['To'])
292-
293-
# Fix that some mappings from "From" end up in "Group"
294-
df.loc[(df.From != df.To) &
295-
(df.From == df.Group), "Group"] = df.loc[(df.From != df.To) &
296-
(df.From == df.Group), "To"]
297309
self.matches[name] = df
298310

311+
# Track clusters and their ids
312+
self.clusters[name] = clusters
313+
self.cluster_mappings[name] = cluster_id_map
314+
299315
def _update_model_ids(self):
300316
""" Update model ids such that there is no overlap between ids """
301317
# Give models a model_id if it didn't already exist

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
]
1414

1515
base_packages = [
16-
"numpy>= 1.18.5",
16+
"numpy>= 1.18.5,<=1.19.4",
1717
"scipy>= 1.3.1",
1818
"pandas>= 0.25.3",
1919
"tqdm>=4.41.1",
@@ -25,7 +25,7 @@
2525
]
2626

2727
fast_cosine = ["sparse_dot_topn>=0.2.9"]
28-
embeddings_packages = ["flair>= 0.6.1.post1"]
28+
embeddings_packages = ["torch>=1.2.0", "flair>= 0.7"]
2929

3030
extra_packages = embeddings_packages + fast_cosine
3131

@@ -37,7 +37,7 @@
3737
setup(
3838
name="polyfuzz",
3939
packages=find_packages(exclude=["notebooks", "docs"]),
40-
version="0.2.1",
40+
version="0.2.2",
4141
author="Maarten Grootendorst",
4242
author_email="maartengrootendorst@gmail.com",
4343
description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",

tests/test_polyfuzz.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,20 @@ def test_grouper(method):
5151
assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1}
5252

5353

54+
def test_grouper_same_list():
55+
model = PolyFuzz("TF-IDF").match(from_list, from_list)
56+
model.group(link_min_similarity=0.75, group_all_strings=True)
57+
matches = model.get_matches()
58+
59+
assert isinstance(matches, pd.DataFrame)
60+
assert matches.Similarity.mean() > 0.3
61+
assert len(matches) == 6
62+
assert list(matches.columns) == ['From', 'To', 'Similarity', 'Group']
63+
64+
assert model.get_clusters() == {1: ['apples', 'apple', 'appl']}
65+
assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1, 'appl': 1}
66+
67+
5468
@pytest.mark.parametrize("method", ["Unknown Model"])
5569
def test_wrongbase_model(method):
5670
with pytest.raises(ValueError):

0 commit comments

Comments
 (0)