Improved String Grouping (#6)

Maarten Grootendorst · web-flow · commit 74945f8326b9 · 2020-12-07T16:39:44.000+01:00
* Improved clustering of strings 
* Update workflow
* Additional tests
* Update Flair dependency
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.7, 3.8]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/docs/releases.md b/docs/releases.md
@@ -1,3 +1,6 @@
+v0.2.2  
+- Update grouping to include all strings only if identical lists of strings are compared  
+
 v0.2.0  
 - Update naming convention matcher --> model  
 - Update documentation  
diff --git a/polyfuzz/__init__.py b/polyfuzz/__init__.py
@@ -1,2 +1,2 @@
 from .polyfuzz import PolyFuzz
-__version__ = "0.2.1"
+__version__ = "0.2.2"
diff --git a/polyfuzz/polyfuzz.py b/polyfuzz/polyfuzz.py
@@ -189,13 +189,17 @@ def visualize_precision_recall(self,
 
     def group(self,
               model: Union[str, BaseMatcher] = None,
-              link_min_similarity: float = 0.75):
+              link_min_similarity: float = 0.75,
+              group_all_strings: bool = False):
         """ From the matches, group the `To` matches together using single linkage
 
          Arguments:
              model: you can choose one of the models in `polyfuzz.models` to be used as a grouper
              link_min_similarity: the minimum similarity between strings before they are grouped
                                   in a single linkage fashion
+             group_all_strings: if you want to compare a list of strings with itself and then cluster
+                                those strings, set this to True. Otherwise, only the strings that
+                                were mapped To are clustered.
 
          Updates:
             self.matches: Adds a column `Group` that is the grouped version of the `To` column
@@ -223,13 +227,9 @@ def group(self,
         elif not model:
             model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
 
+        # Group per model
         for name, match in self.matches.items():
-            strings = list(self.matches[name].To.dropna().unique())
-            matches = model.match(strings, strings)
-            clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity)
-            self._map_groups(name, cluster_name_map)
-            self.clusters[name] = clusters
-            self.cluster_mappings[name] = cluster_id_map
+            self._create_groups(name, model, link_min_similarity, group_all_strings)
 
     def get_ids(self) -> Union[str, List[str], None]:
         """ Get all model ids for easier access """
@@ -285,17 +285,33 @@ def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]:
 
         return self.cluster_mappings
 
-    def _map_groups(self, name: str, cluster_name_map: Mapping[str, str]):
-        """ Map the 'to' list to groups """
+    def _create_groups(self,
+                       name: str,
+                       model: BaseMatcher,
+                       link_min_similarity: float,
+                       group_all_strings: bool):
+        """ Create groups based on either the To mappings if you compare two different lists of strings, or
+        the From mappings if you compare lists of strings that are equal (set group_all_strings to True)
+        """
+
+        if group_all_strings:
+            strings = list(self.matches[name].From.dropna().unique())
+        else:
+            strings = list(self.matches[name].To.dropna().unique())
+
+        # Create clusters
+        matches = model.match(strings, strings)
+        clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity)
+
+        # Map the `to` list to groups
         df = self.matches[name]
         df["Group"] = df['To'].map(cluster_name_map).fillna(df['To'])
-
-        # Fix that some mappings from "From" end up in "Group"
-        df.loc[(df.From != df.To) &
-               (df.From == df.Group), "Group"] = df.loc[(df.From != df.To) &
-                                                        (df.From == df.Group), "To"]
         self.matches[name] = df
 
+        # Track clusters and their ids
+        self.clusters[name] = clusters
+        self.cluster_mappings[name] = cluster_id_map
+
     def _update_model_ids(self):
         """ Update model ids such that there is no overlap between ids """
         # Give models a model_id if it didn't already exist
diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@
 ]
 
 base_packages = [
-    "numpy>= 1.18.5",
+    "numpy>= 1.18.5,<=1.19.4",
     "scipy>= 1.3.1",
     "pandas>= 0.25.3",
     "tqdm>=4.41.1",
@@ -25,7 +25,7 @@
 ]
 
 fast_cosine = ["sparse_dot_topn>=0.2.9"]
-embeddings_packages = ["flair>= 0.6.1.post1"]
+embeddings_packages = ["torch>=1.2.0", "flair>= 0.7"]
 
 extra_packages = embeddings_packages + fast_cosine
 
@@ -37,7 +37,7 @@
 setup(
     name="polyfuzz",
     packages=find_packages(exclude=["notebooks", "docs"]),
-    version="0.2.1",
+    version="0.2.2",
     author="Maarten Grootendorst",
     author_email="maartengrootendorst@gmail.com",
     description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",
diff --git a/tests/test_polyfuzz.py b/tests/test_polyfuzz.py
@@ -51,6 +51,20 @@ def test_grouper(method):
     assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1}
 
 
+def test_grouper_same_list():
+    model = PolyFuzz("TF-IDF").match(from_list, from_list)
+    model.group(link_min_similarity=0.75, group_all_strings=True)
+    matches = model.get_matches()
+
+    assert isinstance(matches, pd.DataFrame)
+    assert matches.Similarity.mean() > 0.3
+    assert len(matches) == 6
+    assert list(matches.columns) == ['From', 'To', 'Similarity', 'Group']
+
+    assert model.get_clusters() == {1: ['apples', 'apple', 'appl']}
+    assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1, 'appl': 1}
+
+
 @pytest.mark.parametrize("method", ["Unknown Model"])
 def test_wrongbase_model(method):
     with pytest.raises(ValueError):

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,2 +1,2 @@` |
| `1` | `1` | `from .polyfuzz import PolyFuzz` |
| `2` |  | `-__version__ = "0.2.1"` |
|  | `2` | `+__version__ = "0.2.2"` |