@@ -85,9 +85,9 @@ def cosine_similarity(from_vector: np.ndarray,
8585 similarity_matrix .setdiag (0. )
8686 similarity_matrix = similarity_matrix .tocsr ()
8787
88- indices = np . flip ( np . argsort ( similarity_matrix . toarray (), axis = - 1 ), axis = 1 )[:, : top_n ]
89- similarities = np . flip ( np . sort ( similarity_matrix . toarray (), axis = - 1 ), axis = 1 )[:, : top_n ]
90- similarities = [ np .round ( similarities [:, i ], 3 ) for i in range ( similarities . shape [ 1 ])]
88+ indices = _top_n_idx_sparse ( similarity_matrix , top_n )
89+ similarities = _top_n_similarities_sparse ( similarity_matrix , top_n , indices )
90+ indices = np .array ( np . nan_to_num ( np . array ( indices , dtype = np . float ), nan = 0 ), dtype = np . int )
9191
9292 # Faster than knn and slower than sparse but uses more memory
9393 else :
@@ -101,9 +101,9 @@ def cosine_similarity(from_vector: np.ndarray,
101101 similarities = [np .round (similarities [:, i ], 3 ) for i in range (similarities .shape [1 ])]
102102
103103 # Convert results to df
104- columns = (["From" ] +
105- ["To" if i == 0 else f"To_{ i + 1 } " for i in range (top_n )] +
106- ["Similarity" if i == 0 else f"Similarity_{ i + 1 } " for i in range (top_n )]); columns
104+ columns = (["From" ] +
105+ ["To" if i == 0 else f"To_{ i + 1 } " for i in range (top_n )] +
106+ ["Similarity" if i == 0 else f"Similarity_{ i + 1 } " for i in range (top_n )])
107107 matches = [[to_list [idx ] for idx in indices [:, i ]] for i in range (indices .shape [1 ])]
108108 matches = pd .DataFrame (np .vstack (([from_list ], matches , similarities )).T , columns = columns )
109109
@@ -119,3 +119,24 @@ def cosine_similarity(from_vector: np.ndarray,
119119 matches .loc [matches [column ] < 0.001 , column .replace ("Similarity" , "To" )] = None
120120
121121 return matches
122+
123+
def _top_n_idx_sparse(matrix, n):
    """ Return index of top n values in each row of a sparse matrix

    Arguments:
        matrix: A CSR sparse matrix; it is read through its `indptr`,
                `indices` and `data` attributes
        n: The number of column indices to return for each row

    Returns:
        np.ndarray with one entry per matrix row, each holding `n` values:
        the column indices of the largest stored values of that row, ordered
        from the highest to the lowest value. Rows that store fewer than `n`
        values are padded with None.
    """
    top_n_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        if n_row_pick > 0:
            row_data = matrix.data[le:ri]
            # argpartition only guarantees that the n largest values end up in
            # the last n slots, not that they are ordered among themselves
            picked = np.argpartition(row_data, -n_row_pick)[-n_row_pick:]
            # Order the picked entries from highest to lowest value so that
            # the first index of a row is always its best match
            picked = picked[np.argsort(-row_data[picked], kind="stable")]
            values = list(matrix.indices[le + picked])
        else:
            # Nothing to pick: slicing with [-0:] would select the whole row
            values = []
        # Pad with None so that every row has exactly n entries
        values = values + [None] * (n - len(values))
        top_n_idx.append(values)
    return np.array(top_n_idx)
133+
134+
def _top_n_similarities_sparse(matrix, n, indices):
    """ Return similarity scores of top n values in each row of a sparse matrix

    Arguments:
        matrix: A sparse matrix that supports `matrix[row, column]` lookups
        n: The number of top values per row. Kept for backward compatibility
           of the signature; it does not influence the returned scores
        indices: For each row of `matrix`, the column indices to look up,
                 where None marks a missing match

    Returns:
        np.ndarray of shape (entries per row, number of rows) with the
        similarity scores rounded to 3 decimals; missing matches score 0
    """
    similarity_scores = []
    for row, values in enumerate(indices):
        # Always round to 3 decimals: using `n` as the number of decimals
        # would tie the precision of the scores to the number of matches
        scores = [round(matrix[row, value], 3) if value is not None else 0 for value in values]
        similarity_scores.append(scores)
    # Transpose so that each entry holds the scores of one match rank
    return np.array(similarity_scores).T
0 commit comments