@@ -189,13 +189,17 @@ def visualize_precision_recall(self,
189189
190190 def group (self ,
191191 model : Union [str , BaseMatcher ] = None ,
192- link_min_similarity : float = 0.75 ):
192+ link_min_similarity : float = 0.75 ,
193+ group_all_strings : bool = False ):
193194 """ From the matches, group the `To` matches together using single linkage
194195
195196 Arguments:
196197 model: you can choose one of the models in `polyfuzz.models` to be used as a grouper
197198 link_min_similarity: the minimum similarity between strings before they are grouped
198199 in a single linkage fashion
200+ group_all_strings: if you want to compare a list of strings with itself and then cluster
201+ those strings, set this to True. Otherwise, only the strings that
202+ were mapped To are clustered.
199203
200204 Updates:
201205 self.matches: Adds a column `Group` that is the grouped version of the `To` column
@@ -223,13 +227,9 @@ def group(self,
223227 elif not model :
224228 model = TFIDF (n_gram_range = (3 , 3 ), min_similarity = link_min_similarity )
225229
230+ # Group per model
226231 for name , match in self .matches .items ():
227- strings = list (self .matches [name ].To .dropna ().unique ())
228- matches = model .match (strings , strings )
229- clusters , cluster_id_map , cluster_name_map = single_linkage (matches , link_min_similarity )
230- self ._map_groups (name , cluster_name_map )
231- self .clusters [name ] = clusters
232- self .cluster_mappings [name ] = cluster_id_map
232+ self ._create_groups (name , model , link_min_similarity , group_all_strings )
233233
234234 def get_ids (self ) -> Union [str , List [str ], None ]:
235235 """ Get all model ids for easier access """
@@ -285,17 +285,33 @@ def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]:
285285
286286 return self .cluster_mappings
287287
288- def _map_groups (self , name : str , cluster_name_map : Mapping [str , str ]):
289- """ Map the 'to' list to groups """
288+ def _create_groups (self ,
289+ name : str ,
290+ model : BaseMatcher ,
291+ link_min_similarity : float ,
292+ group_all_strings : bool ):
293+ """ Create groups based on either the To mappings if you compare two different lists of strings, or
294+ the From mappings if you compare lists of strings that are equal (set group_all_strings to True)
295+ """
296+
297+ if group_all_strings :
298+ strings = list (self .matches [name ].From .dropna ().unique ())
299+ else :
300+ strings = list (self .matches [name ].To .dropna ().unique ())
301+
302+ # Create clusters
303+ matches = model .match (strings , strings )
304+ clusters , cluster_id_map , cluster_name_map = single_linkage (matches , link_min_similarity )
305+
306+ # Map the `to` list to groups
290307 df = self .matches [name ]
291308 df ["Group" ] = df ['To' ].map (cluster_name_map ).fillna (df ['To' ])
292-
293- # Fix that some mappings from "From" end up in "Group"
294- df .loc [(df .From != df .To ) &
295- (df .From == df .Group ), "Group" ] = df .loc [(df .From != df .To ) &
296- (df .From == df .Group ), "To" ]
297309 self .matches [name ] = df
298310
311+ # Track clusters and their ids
312+ self .clusters [name ] = clusters
313+ self .cluster_mappings [name ] = cluster_id_map
314+
299315 def _update_model_ids (self ):
300316 """ Update model ids such that there is no overlap between ids """
301317 # Give models a model_id if it didn't already exist
0 commit comments