import os
import numpy as np
from scipy.spatial.distance import euclidean, cityblock, minkowski, chebyshev, canberra, braycurtis, jensenshannon, hamming  # Add other distance functions as needed
from sentence_transformers import SentenceTransformer, util
from scipy.stats import pearsonr, spearmanr
from scipy.special import kl_div


MODEL_NAME = 'all-MiniLM-L6-v2'
MODEL_FOLDER = 'model'

def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]

def load_or_download_model():
    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
    if os.path.exists(model_path):
        print(f"Loading model from {model_path}")
        return SentenceTransformer(model_path)
    else:
        print(f"Downloading model {MODEL_NAME}")
        model = SentenceTransformer(MODEL_NAME)
        os.makedirs(MODEL_FOLDER, exist_ok=True)
        model.save(model_path)
        print(f"Model saved to {model_path}")
        return model

def cosine_similarity(query_embedding, sentence_embeddings):
    # Convert the torch tensor to numpy so every measure returns the same
    # type; this also keeps the f-string score formatting in main() working.
    return util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0].cpu().numpy()

def euclidean_distance(query_embedding, sentence_embeddings):
    # Distances are negated throughout so that a larger score always means
    # "more similar", matching the descending sort in find_similar_sentences().
    return -np.array([euclidean(query_embedding, sent_emb) for sent_emb in sentence_embeddings])

def manhattan_distance(query_embedding, sentence_embeddings):
    return -np.array([cityblock(query_embedding, sent_emb) for sent_emb in sentence_embeddings])

def dot_product(query_embedding, sentence_embeddings):
    return np.dot(sentence_embeddings, query_embedding.T).flatten()

def pearson_correlation(query_embedding, sentence_embeddings):
    return np.array([pearsonr(query_embedding.flatten(), sent_emb.flatten())[0] for sent_emb in sentence_embeddings])

def jaccard_similarity(query_embedding, sentence_embeddings):
    # Simplified Jaccard similarity for continuous values
    return np.array([np.sum(np.minimum(query_embedding, sent_emb)) / np.sum(np.maximum(query_embedding, sent_emb)) for sent_emb in sentence_embeddings])

def hamming_distance(query_embedding, sentence_embeddings):
    # Simplified Hamming distance for continuous values (counts unequal components)
    return -np.array([np.sum(query_embedding != sent_emb) for sent_emb in sentence_embeddings])
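
# For float embeddings, exact component equality almost never holds, so the
# simplified Hamming above is nearly constant. A minimal sketch of one
# workaround (an assumption, not part of the original design): binarize each
# vector by its sign and apply scipy's hamming to the boolean arrays.
def hamming_distance_binarized(query_embedding, sentence_embeddings):
    query_bits = query_embedding > 0
    return -np.array([hamming(query_bits, sent_emb > 0) for sent_emb in sentence_embeddings])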

def minkowski_distance(query_embedding, sentence_embeddings, p=3):
    return -np.array([minkowski(query_embedding, sent_emb, p) for sent_emb in sentence_embeddings])

def chebyshev_distance(query_embedding, sentence_embeddings):
    return -np.array([chebyshev(query_embedding, sent_emb) for sent_emb in sentence_embeddings])

def canberra_distance(query_embedding, sentence_embeddings):
    return -np.array([canberra(query_embedding, sent_emb) for sent_emb in sentence_embeddings])

def bray_curtis_distance(query_embedding, sentence_embeddings):
    return -np.array([braycurtis(query_embedding, sent_emb) for sent_emb in sentence_embeddings])

def mahalanobis_distance(query_embedding, sentence_embeddings):
    # Placeholder: a true Mahalanobis distance needs the covariance matrix of
    # the embedding set; this falls back to Euclidean distance.
    return -np.array([euclidean(query_embedding, sent_emb) for sent_emb in sentence_embeddings])
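
# A minimal sketch of the real thing, assuming there are enough sentence
# embeddings to estimate a covariance matrix; np.linalg.pinv guards against
# a singular covariance (common when there are fewer rows than dimensions).
def mahalanobis_distance_full(query_embedding, sentence_embeddings):
    from scipy.spatial.distance import mahalanobis
    inv_cov = np.linalg.pinv(np.cov(np.asarray(sentence_embeddings).T))
    return -np.array([mahalanobis(query_embedding, sent_emb, inv_cov) for sent_emb in sentence_embeddings])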

def dice_similarity(query_embedding, sentence_embeddings):
    return np.array([2 * np.sum(np.minimum(query_embedding, sent_emb)) / (np.sum(query_embedding) + np.sum(sent_emb)) for sent_emb in sentence_embeddings])

def tanimoto_similarity(query_embedding, sentence_embeddings):
    return np.array([np.sum(np.minimum(query_embedding, sent_emb)) / np.sum(np.maximum(query_embedding, sent_emb)) for sent_emb in sentence_embeddings])

def spearman_correlation(query_embedding, sentence_embeddings):
    return np.array([spearmanr(query_embedding.flatten(), sent_emb.flatten())[0] for sent_emb in sentence_embeddings])

def wasserstein_distance(query_embedding, sentence_embeddings):
    # Approximates the 1-D Wasserstein distance between the empirical
    # distributions of the vector components (difference of sorted values).
    return -np.array([np.sum(np.abs(np.sort(query_embedding) - np.sort(sent_emb))) for sent_emb in sentence_embeddings])
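
# An equivalent sketch using SciPy directly. Treating each embedding's
# components as samples from a 1-D distribution is an assumption about how
# Wasserstein should apply to embeddings, not an established convention.
def wasserstein_distance_scipy(query_embedding, sentence_embeddings):
    from scipy.stats import wasserstein_distance as wd
    return -np.array([wd(query_embedding, sent_emb) for sent_emb in sentence_embeddings])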

def kl_divergence(query_embedding, sentence_embeddings):
    # Note: kl_div expects non-negative inputs, but raw embeddings contain
    # negative components, so this can degenerate to -inf; see the softmax
    # variant sketched below.
    return -np.array([np.sum(kl_div(query_embedding + 1e-10, sent_emb + 1e-10)) for sent_emb in sentence_embeddings])
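
# A sketch of one workaround (an assumption, not part of the original design):
# map each vector through a softmax so both arguments are proper probability
# distributions before computing the KL divergence.
def kl_divergence_softmax(query_embedding, sentence_embeddings):
    from scipy.special import softmax
    query_probs = softmax(query_embedding)
    return -np.array([np.sum(kl_div(query_probs, softmax(sent_emb))) for sent_emb in sentence_embeddings])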


def haversine_distance(query_embedding, sentence_embeddings):
    # Placeholder: haversine applies to geographic coordinates, not
    # high-dimensional embeddings; fall back to the Euclidean score.
    # euclidean_distance is already negated, so no extra minus sign.
    return euclidean_distance(query_embedding, sentence_embeddings)

def cosine_distance(query_embedding, sentence_embeddings):
    # Negated so that, like the other distances here, larger means more similar.
    return -(1 - cosine_similarity(query_embedding, sentence_embeddings))

def sorensen_dice_coefficient(query_embedding, sentence_embeddings):
    return dice_similarity(query_embedding, sentence_embeddings)

def levenshtein_distance(query_embedding, sentence_embeddings):
    # Placeholder: Levenshtein operates on strings, not embeddings; fall back
    # to the already negated Euclidean score.
    return euclidean_distance(query_embedding, sentence_embeddings)

def jaro_winkler_distance(query_embedding, sentence_embeddings):
    # Placeholder: not directly applicable to embeddings.
    return euclidean_distance(query_embedding, sentence_embeddings)

def rogers_tanimoto_similarity(query_embedding, sentence_embeddings):
    # Simplified for continuous values
    return np.array([np.sum(np.minimum(query_embedding, sent_emb)) / np.sum(np.maximum(query_embedding, sent_emb)) for sent_emb in sentence_embeddings])

def yule_similarity(query_embedding, sentence_embeddings):
    # Placeholder: not directly applicable to embeddings.
    return cosine_similarity(query_embedding, sentence_embeddings)

def kulczynski_similarity(query_embedding, sentence_embeddings):
    return np.array([np.sum(np.minimum(query_embedding, sent_emb)) / np.minimum(np.sum(query_embedding), np.sum(sent_emb)) for sent_emb in sentence_embeddings])

def gower_distance(query_embedding, sentence_embeddings):
    # Simplified Gower distance (mean absolute difference), negated
    return -np.array([np.mean(np.abs(query_embedding - sent_emb)) for sent_emb in sentence_embeddings])

def russell_rao_similarity(query_embedding, sentence_embeddings):
    # Simplified for continuous values
    return np.array([np.sum(np.minimum(query_embedding, sent_emb)) / len(query_embedding) for sent_emb in sentence_embeddings])

def ochiai_similarity(query_embedding, sentence_embeddings):
    return np.array([np.sum(np.minimum(query_embedding, sent_emb)) / np.sqrt(np.sum(query_embedding) * np.sum(sent_emb)) for sent_emb in sentence_embeddings])

def matching_coefficient(query_embedding, sentence_embeddings):
    # Simplified for continuous values; exact equality rarely holds for
    # floats, so this is near zero unless the vectors are identical.
    return np.array([np.sum(query_embedding == sent_emb) / len(query_embedding) for sent_emb in sentence_embeddings])

def tversky_index(query_embedding, sentence_embeddings, alpha=0.5, beta=0.5):
    return np.array([np.sum(np.minimum(query_embedding, sent_emb)) / (np.sum(np.minimum(query_embedding, sent_emb)) + alpha * np.sum(np.maximum(0, query_embedding - sent_emb)) + beta * np.sum(np.maximum(0, sent_emb - query_embedding))) for sent_emb in sentence_embeddings])

def sorensen_similarity(query_embedding, sentence_embeddings):
    return dice_similarity(query_embedding, sentence_embeddings)

def overlap_coefficient(query_embedding, sentence_embeddings):
    return np.array([np.sum(np.minimum(query_embedding, sent_emb)) / np.minimum(np.sum(query_embedding), np.sum(sent_emb)) for sent_emb in sentence_embeddings])

def edit_distance(query_embedding, sentence_embeddings):
    # Placeholder: edit distance applies to strings, not embeddings; fall
    # back to the already negated Euclidean score.
    return euclidean_distance(query_embedding, sentence_embeddings)

def sokal_michener_distance(query_embedding, sentence_embeddings):
    # Simplified for continuous values; negated so larger means more similar.
    return -np.array([np.sum(np.abs(query_embedding - sent_emb)) / len(query_embedding) for sent_emb in sentence_embeddings])

def tschebyshev_distance(query_embedding, sentence_embeddings):
    return chebyshev_distance(query_embedding, sentence_embeddings)

def dice_hamming_distance(query_embedding, sentence_embeddings):
    # Rough blend: dice is a [0, 1] similarity while the Hamming score is a
    # negated count, so the two components live on different scales.
    dice = dice_similarity(query_embedding, sentence_embeddings)
    hamming_score = hamming_distance(query_embedding, sentence_embeddings)
    return (dice + hamming_score) / 2

def improved_jensen_distance(query_embedding, sentence_embeddings, epsilon=1e-10):
    # Add a small epsilon to avoid division by zero
    query_embedding = query_embedding + epsilon
    sentence_embeddings = sentence_embeddings + epsilon

    # Normalize the query embedding
    query_embedding = query_embedding / np.sum(query_embedding)

    # Normalize each sentence embedding
    sentence_embeddings_normalized = sentence_embeddings / np.sum(sentence_embeddings, axis=1, keepdims=True)

    # Compute the Jensen-Shannon distance for each sentence embedding
    distances = np.array([jensenshannon(query_embedding, sent_emb) for sent_emb in sentence_embeddings_normalized])

    # Replace any NaN or inf values with a large finite number
    distances = np.nan_to_num(distances, nan=np.finfo(float).max, posinf=np.finfo(float).max)

    # Negate so that, consistent with the other distances, larger means more similar
    return -distances

def log_likelihood(query_embedding, sentence_embeddings):
    # Placeholder: a true log-likelihood needs probability distributions;
    # fall back to cosine similarity.
    return cosine_similarity(query_embedding, sentence_embeddings)

similarity_functions = {
    '1': ('Cosine Similarity', cosine_similarity),
    '2': ('Euclidean Distance', euclidean_distance),
    '3': ('Manhattan Distance', manhattan_distance),
    '4': ('Dot Product', dot_product),
    '5': ('Pearson Correlation', pearson_correlation),
    '6': ('Jaccard Similarity', jaccard_similarity),
    '7': ('Hamming Distance', hamming_distance),
    '8': ('Minkowski Distance', minkowski_distance),
    '9': ('Chebyshev Distance', chebyshev_distance),
    '10': ('Canberra Distance', canberra_distance),
    '11': ('Bray-Curtis Distance', bray_curtis_distance),
    '12': ('Dice Similarity', dice_similarity),
    '13': ('Tanimoto Similarity', tanimoto_similarity),
    '14': ('Spearman Correlation', spearman_correlation),
    '15': ('Wasserstein Distance', wasserstein_distance),
    '16': ('KL Divergence', kl_divergence),
    '17': ('Cosine Distance', cosine_distance),
    '18': ('Sorensen-Dice Coefficient', sorensen_dice_coefficient),
    '19': ('Levenshtein Distance', levenshtein_distance),
    '20': ('Jaro-Winkler Distance', jaro_winkler_distance),
    '21': ('Rogers-Tanimoto Similarity', rogers_tanimoto_similarity),
    '22': ('Yule Similarity', yule_similarity),
    '23': ('Kulczynski Similarity', kulczynski_similarity),
    '24': ('Gower Distance', gower_distance),
    '25': ('Russell-Rao Similarity', russell_rao_similarity),
    '26': ('Matching Coefficient', matching_coefficient),
    '27': ('Tversky Index', tversky_index),
    '28': ('Sørensen Similarity', sorensen_similarity),
    '29': ('Overlap Coefficient', overlap_coefficient),
    '30': ('Edit Distance', edit_distance),
    '31': ('Sokal-Michener Distance', sokal_michener_distance),
    '32': ('Tschebyshev Distance', tschebyshev_distance),
    '33': ('Dice-Hamming Distance', dice_hamming_distance),
    '34': ('Jensen-Shannon Distance', improved_jensen_distance),
    '35': ('Log Likelihood', log_likelihood),
}


def find_similar_sentences(query, file_path, similarity_func, top_n=5):
    model = load_or_download_model()
    sentences = load_file(file_path)
    sentence_embeddings = model.encode(sentences)
    query_embedding = model.encode([query])[0]  # Take the single query's embedding as a 1-D vector

    similarity_scores = similarity_func(query_embedding, sentence_embeddings)
    top_results = sorted(zip(sentences, similarity_scores), key=lambda x: x[1], reverse=True)[:top_n]

    return top_results
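
# Minimal programmatic usage sketch (the file name and query below are
# illustrative assumptions, not part of the original design):
#
#   results = find_similar_sentences('how do transformers work', 'sentences.txt', cosine_similarity, top_n=3)
#   for sentence, score in results:
#       print(f'{score:.4f}  {sentence}')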

def validate_file_path(file_path):
    if not file_path.endswith('.txt'):
        file_path += '.txt'
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file '{file_path}' does not exist.")
    return file_path

def main():
    print("Welcome to the Comprehensive Sentence Similarity Search Tool!")

    query = input("Enter your query: ")

    while True:
        file_path = input("Enter the path to your text file without extension: ")
        try:
            file_path = validate_file_path(file_path)
            break
        except FileNotFoundError as e:
            print(f"Error: {str(e)} Please try again.")

    print("\nChoose a similarity measurement method:")
    for key, (name, _) in similarity_functions.items():
        print(f"{key}. {name}")

    while True:
        choice = input("Enter the number of your choice: ")
        if choice in similarity_functions:
            similarity_name, similarity_func = similarity_functions[choice]
            break
        print("Invalid choice. Please try again.")

    try:
        results = find_similar_sentences(query, file_path, similarity_func)
        print(f"\nTop 5 similar sentences for query: '{query}' using {similarity_name}\n")
        for sentence, score in results:
            print(f"Similarity Score: {score:.4f}")
            print(f"Sentence: {sentence}\n")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()