@@ -17,6 +17,7 @@ def cosine_similarity(from_vector: np.ndarray,
1717 from_list : List [str ],
1818 to_list : List [str ],
1919 min_similarity : float = 0.75 ,
20+ top_n : int = 1 ,
2021 method : str = "sparse" ) -> pd .DataFrame :
2122 """ Calculate similarity between two matrices/vectors and return best matches
2223
@@ -26,6 +27,7 @@ def cosine_similarity(from_vector: np.ndarray,
2627 from_list: The list from which you want mappings
2728 to_list: The list where you want to map to
2829 min_similarity: The minimum similarity between strings, otherwise return 0 similarity
30+ top_n: The number of best matches you want returned
2931 method: The method/package for calculating the cosine similarity.
3032 Options: "sparse", "sklearn", "knn".
3133 Sparse is the fastest and most memory efficient but requires a
@@ -49,20 +51,22 @@ def cosine_similarity(from_vector: np.ndarray,
4951 indices, similarity = extract_best_matches(from_vector, to_vector, method="sparse")
5052 ```
5153 """
54+ if top_n > len (set (to_list )):
55+ top_n = len (set (to_list ))
56+
5257 # Slower but uses less memory
5358 if method == "knn" :
5459
5560 if from_list == to_list :
56- knn = NearestNeighbors (n_neighbors = 2 , n_jobs = - 1 , metric = 'cosine' ).fit (to_vector )
61+ knn = NearestNeighbors (n_neighbors = top_n + 1 , n_jobs = - 1 , metric = 'cosine' ).fit (to_vector )
5762 distances , indices = knn .kneighbors (from_vector )
58- distances = distances [:, 1 ]
59- indices = indices [:, 1 ]
60-
63+ distances = distances [:, 1 :]
64+ indices = indices [:, 1 :]
6165 else :
62- knn = NearestNeighbors (n_neighbors = 1 , n_jobs = - 1 , metric = 'cosine' ).fit (to_vector )
66+ knn = NearestNeighbors (n_neighbors = top_n , n_jobs = - 1 , metric = 'cosine' ).fit (to_vector )
6367 distances , indices = knn .kneighbors (from_vector )
6468
65- similarity = [round (1 - distance , 3 ) for distance in distances .flatten ( )]
69+ similarities = [np . round (1 - distances [:, i ], 3 ) for i in range ( distances .shape [ 1 ] )]
6670
6771 # Fast, but does have some installation issues
6872 elif _HAVE_SPARSE_DOT and method == "sparse" :
@@ -74,15 +78,16 @@ def cosine_similarity(from_vector: np.ndarray,
7478 # There is a bug with awesome_cossim_topn that when to_vector and from_vector
7579 # have the same shape, setting topn to 1 does not work. Apparently, you need
7680 # to set it to at least 2 for it to work
77- similarity_matrix = awesome_cossim_topn (from_vector , to_vector .T , 2 , min_similarity )
81+ similarity_matrix = awesome_cossim_topn (from_vector , to_vector .T , top_n + 1 , min_similarity )
7882
7983 if from_list == to_list :
8084 similarity_matrix = similarity_matrix .tolil ()
8185 similarity_matrix .setdiag (0. )
8286 similarity_matrix = similarity_matrix .tocsr ()
8387
84- indices = np .array (similarity_matrix .argmax (axis = 1 ).T ).flatten ()
85- similarity = similarity_matrix .max (axis = 1 ).toarray ().T .flatten ()
88+ indices = np .flip (np .argsort (similarity_matrix .toarray (), axis = - 1 ), axis = 1 )[:, :top_n ]
89+ similarities = np .flip (np .sort (similarity_matrix .toarray (), axis = - 1 ), axis = 1 )[:, :top_n ]
90+ similarities = [np .round (similarities [:, i ], 3 ) for i in range (similarities .shape [1 ])]
8691
8792 # Faster than knn and slower than sparse but uses more memory
8893 else :
@@ -91,13 +96,26 @@ def cosine_similarity(from_vector: np.ndarray,
9196 if from_list == to_list :
9297 np .fill_diagonal (similarity_matrix , 0 )
9398
94- indices = similarity_matrix .argmax (axis = 1 )
95- similarity = similarity_matrix .max (axis = 1 )
99+ indices = np .flip (np .argsort (similarity_matrix , axis = - 1 ), axis = 1 )[:, :top_n ]
100+ similarities = np .flip (np .sort (similarity_matrix , axis = - 1 ), axis = 1 )[:, :top_n ]
101+ similarities = [np .round (similarities [:, i ], 3 ) for i in range (similarities .shape [1 ])]
96102
97103 # Convert results to df
98- matches = [to_list [idx ] for idx in indices .flatten ()]
99- matches = pd .DataFrame (np .vstack ((from_list , matches , similarity )).T , columns = ["From" , "To" , "Similarity" ])
100- matches .Similarity = matches .Similarity .astype (float )
101- matches .loc [matches .Similarity < 0.001 , "To" ] = None
104+ columns = (["From" ] +
105+ ["To" if i == 0 else f"To_{ i + 1 } " for i in range (top_n )] +
106+ ["Similarity" if i == 0 else f"Similarity_{ i + 1 } " for i in range (top_n )]); columns
107+ matches = [[to_list [idx ] for idx in indices [:, i ]] for i in range (indices .shape [1 ])]
108+ matches = pd .DataFrame (np .vstack (([from_list ], matches , similarities )).T , columns = columns )
109+
110+ # Update column order
111+ columns = [["From" , "To" , "Similarity" ]] + [[f"To_{ i + 2 } " , f"Similarity_{ i + 2 } " ] for i in range ((top_n - 1 ))]
112+ matches = matches .loc [:, [title for column in columns for title in column ]]
113+
114+ # Update types
115+ for column in matches .columns :
116+ if "Similarity" in column :
117+ matches [column ] = matches [column ].astype (float )
118+ matches .loc [matches [column ] < 0.001 , column ] = float (0 )
119+ matches .loc [matches [column ] < 0.001 , column .replace ("Similarity" , "To" )] = None
102120
103121 return matches
0 commit comments