1+ import os
2+ import numpy as np
3+ from scipy .spatial .distance import euclidean , cityblock , minkowski , chebyshev , canberra , braycurtis , jensenshannon , hamming # Add other distance functions as needed
4+ from sentence_transformers import SentenceTransformer , util # Import SentenceTransformer
5+ from scipy .stats import pearsonr , spearmanr
6+ from scipy .special import kl_div
7+ from scipy .spatial .distance import jensenshannon
8+
9+
# Name of the pretrained sentence-embedding model to load/download.
MODEL_NAME = 'all-MiniLM-L6-v2'
# Local directory where the downloaded model is cached.
MODEL_FOLDER = 'model'
12+
def load_file(file_path):
    """Read *file_path* and return its non-blank lines, stripped of whitespace."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        lines = (raw.strip() for raw in handle)
        return [line for line in lines if line]
16+
def load_or_download_model():
    """Return the SentenceTransformer model.

    Loads the cached copy under MODEL_FOLDER when present; otherwise
    downloads it by name, saves it to the cache folder, and returns it.
    """
    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
    if not os.path.exists(model_path):
        print(f"Downloading model {MODEL_NAME}")
        model = SentenceTransformer(MODEL_NAME)
        # Cache the download so later runs load from disk.
        os.makedirs(MODEL_FOLDER, exist_ok=True)
        model.save(model_path)
        print(f"Model saved to {model_path}")
        return model
    print(f"Loading model from {model_path}")
    return SentenceTransformer(model_path)
29+
def cosine_similarity(query_embedding, sentence_embeddings):
    """Cosine similarity of the query against every sentence embedding.

    Returns a 1-D tensor of scores (higher = more similar).
    """
    scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)
    return scores[0]
32+
def euclidean_distance(query_embedding, sentence_embeddings):
    """Negated Euclidean distances, so larger values mean more similar."""
    distances = []
    for emb in sentence_embeddings:
        distances.append(euclidean(query_embedding, emb))
    return -np.asarray(distances)
35+
def manhattan_distance(query_embedding, sentence_embeddings):
    """Negated Manhattan (city-block) distances; higher = more similar."""
    distances = []
    for emb in sentence_embeddings:
        distances.append(cityblock(query_embedding, emb))
    return -np.asarray(distances)
38+
def dot_product(query_embedding, sentence_embeddings):
    """Dot-product similarity of the query against each sentence embedding."""
    products = np.dot(sentence_embeddings, query_embedding.T)
    return products.flatten()
41+
def pearson_correlation(query_embedding, sentence_embeddings):
    """Pearson correlation coefficient between the query and each embedding."""
    flat_query = query_embedding.flatten()
    return np.array([pearsonr(flat_query, emb.flatten())[0] for emb in sentence_embeddings])
44+
def jaccard_similarity(query_embedding, sentence_embeddings):
    """Weighted (Ruzicka-style) Jaccard similarity for continuous vectors."""
    scores = []
    for emb in sentence_embeddings:
        intersection = np.minimum(query_embedding, emb).sum()
        union = np.maximum(query_embedding, emb).sum()
        scores.append(intersection / union)
    return np.array(scores)
48+
def hamming_distance(query_embedding, sentence_embeddings):
    """Negated count of positions where the vectors differ exactly.

    NOTE(review): exact float equality rarely holds for real embeddings; this
    simplified form is mainly meaningful for discrete/binarized vectors.
    """
    mismatches = [np.count_nonzero(query_embedding != emb) for emb in sentence_embeddings]
    return -np.array(mismatches)
52+
def minkowski_distance(query_embedding, sentence_embeddings, p=3):
    """Negated Minkowski distance of order ``p`` (default 3); higher = more similar."""
    dists = []
    for emb in sentence_embeddings:
        dists.append(minkowski(query_embedding, emb, p))
    return -np.asarray(dists)
55+
def chebyshev_distance(query_embedding, sentence_embeddings):
    """Negated Chebyshev (max-coordinate) distances; higher = more similar."""
    dists = [chebyshev(query_embedding, emb) for emb in sentence_embeddings]
    return -np.asarray(dists)
58+
def canberra_distance(query_embedding, sentence_embeddings):
    """Negated Canberra distances; higher = more similar."""
    dists = [canberra(query_embedding, emb) for emb in sentence_embeddings]
    return -np.asarray(dists)
61+
def bray_curtis_distance(query_embedding, sentence_embeddings):
    """Negated Bray-Curtis dissimilarities; higher = more similar."""
    dists = [braycurtis(query_embedding, emb) for emb in sentence_embeddings]
    return -np.asarray(dists)
64+
def mahalanobis_distance(query_embedding, sentence_embeddings):
    """Placeholder for Mahalanobis distance.

    NOTE(review): a true Mahalanobis distance requires the inverse covariance
    matrix of the embedding set; this falls back to negated Euclidean distance.
    """
    dists = []
    for emb in sentence_embeddings:
        dists.append(euclidean(query_embedding, emb))
    return -np.asarray(dists)
68+
def dice_similarity(query_embedding, sentence_embeddings):
    """Continuous Dice coefficient: 2 * overlap / (sum(q) + sum(s))."""
    query_total = np.sum(query_embedding)
    scores = []
    for emb in sentence_embeddings:
        overlap = np.sum(np.minimum(query_embedding, emb))
        scores.append(2 * overlap / (query_total + np.sum(emb)))
    return np.array(scores)
71+
def tanimoto_similarity(query_embedding, sentence_embeddings):
    """Continuous Tanimoto similarity (same form as weighted Jaccard here)."""
    return np.array([
        np.minimum(query_embedding, emb).sum() / np.maximum(query_embedding, emb).sum()
        for emb in sentence_embeddings
    ])
74+
def spearman_correlation(query_embedding, sentence_embeddings):
    """Spearman rank correlation between the query and each embedding."""
    flat_query = query_embedding.flatten()
    scores = []
    for emb in sentence_embeddings:
        scores.append(spearmanr(flat_query, emb.flatten())[0])
    return np.array(scores)
77+
def wasserstein_distance(query_embedding, sentence_embeddings):
    """Crude 1-D Wasserstein proxy: negated L1 distance between sorted vectors.

    NOTE(review): this is an un-normalized approximation, kept as in the
    original placeholder; scipy.stats.wasserstein_distance would differ by
    a normalization factor.
    """
    sorted_query = np.sort(query_embedding)
    dists = [np.abs(sorted_query - np.sort(emb)).sum() for emb in sentence_embeddings]
    return -np.array(dists)
81+
def kl_divergence(query_embedding, sentence_embeddings):
    """Negated elementwise KL-divergence sum (epsilon added for stability).

    NOTE(review): KL divergence assumes non-negative, distribution-like
    inputs; raw sentence embeddings can contain negatives — verify this
    measure is meaningful for the embeddings used.
    """
    eps = 1e-10
    shifted_query = query_embedding + eps
    totals = [np.sum(kl_div(shifted_query, emb + eps)) for emb in sentence_embeddings]
    return -np.array(totals)
84+
85+
def haversine_distance(query_embedding, sentence_embeddings):
    """Placeholder: haversine is not applicable to high-dimensional embeddings.

    Falls back to the module's negated Euclidean distance (higher = more
    similar).

    Bug fix: the original returned ``-euclidean_distance(...)``, but
    ``euclidean_distance`` already negates its distances, so the double
    negation produced positive distances and ``find_similar_sentences``
    (which keeps the LARGEST scores) would select the LEAST similar
    sentences.
    """
    return euclidean_distance(query_embedding, sentence_embeddings)
89+
def cosine_distance(query_embedding, sentence_embeddings):
    """Negated cosine distance, so higher = more similar.

    Bug fix: the original returned the raw distance ``1 - similarity``, but
    ``find_similar_sentences`` ranks by the LARGEST score, so the raw
    distance selected the least similar sentences. Every other *_distance
    helper in this module negates its distances; do the same here.
    """
    return -(1 - cosine_similarity(query_embedding, sentence_embeddings))
92+
def sorensen_dice_coefficient(query_embedding, sentence_embeddings):
    """Alias: the Sorensen-Dice coefficient is the same measure as dice_similarity."""
    return dice_similarity(query_embedding, sentence_embeddings)
95+
def levenshtein_distance(query_embedding, sentence_embeddings):
    """Placeholder: Levenshtein edit distance is defined on strings, not
    embeddings; falls back to the module's negated Euclidean distance.

    Bug fix: the original returned ``-euclidean_distance(...)``, double
    negating (``euclidean_distance`` already negates) and thus inverting
    the similarity ranking in ``find_similar_sentences``.
    """
    return euclidean_distance(query_embedding, sentence_embeddings)
99+
def jaro_winkler_distance(query_embedding, sentence_embeddings):
    """Placeholder: Jaro-Winkler is a string metric, not applicable to
    embeddings; falls back to the module's negated Euclidean distance.

    Bug fix: the original returned ``-euclidean_distance(...)``, double
    negating (``euclidean_distance`` already negates) and thus inverting
    the similarity ranking in ``find_similar_sentences``.
    """
    return euclidean_distance(query_embedding, sentence_embeddings)
103+
def rogers_tanimoto_similarity(query_embedding, sentence_embeddings):
    """Simplified Rogers-Tanimoto for continuous values (weighted Jaccard form)."""
    sims = []
    for emb in sentence_embeddings:
        sims.append(np.minimum(query_embedding, emb).sum() / np.maximum(query_embedding, emb).sum())
    return np.array(sims)
107+
def yule_similarity(query_embedding, sentence_embeddings):
    """Placeholder: Yule's measure is defined for binary data, so it is not
    directly applicable to continuous embeddings; falls back to cosine
    similarity."""
    # Placeholder: Not directly applicable to embeddings
    return cosine_similarity(query_embedding, sentence_embeddings)
111+
def kulczynski_similarity(query_embedding, sentence_embeddings):
    """Continuous Kulczynski-style similarity: overlap / min(sum(q), sum(s))."""
    query_total = np.sum(query_embedding)
    sims = []
    for emb in sentence_embeddings:
        overlap = np.sum(np.minimum(query_embedding, emb))
        sims.append(overlap / np.minimum(query_total, np.sum(emb)))
    return np.array(sims)
114+
def gower_distance(query_embedding, sentence_embeddings):
    """Simplified Gower distance: negated mean absolute difference."""
    return -np.array([np.mean(np.abs(emb - query_embedding)) for emb in sentence_embeddings])
118+
def russell_rao_similarity(query_embedding, sentence_embeddings):
    """Simplified Russell-Rao: elementwise-min overlap averaged over dimensions."""
    dim = len(query_embedding)
    sims = []
    for emb in sentence_embeddings:
        sims.append(np.minimum(query_embedding, emb).sum() / dim)
    return np.array(sims)
122+
def ochiai_similarity(query_embedding, sentence_embeddings):
    """Continuous Ochiai similarity: overlap / sqrt(sum(q) * sum(s))."""
    query_total = np.sum(query_embedding)
    sims = []
    for emb in sentence_embeddings:
        overlap = np.sum(np.minimum(query_embedding, emb))
        sims.append(overlap / np.sqrt(query_total * np.sum(emb)))
    return np.array(sims)
125+
def matching_coefficient(query_embedding, sentence_embeddings):
    """Fraction of positions that match exactly.

    NOTE(review): exact float equality rarely holds for real embeddings, so
    this simplified form is mostly useful for discrete/binarized vectors.
    """
    dim = len(query_embedding)
    matches = [np.count_nonzero(query_embedding == emb) / dim for emb in sentence_embeddings]
    return np.array(matches)
129+
def tversky_index(query_embedding, sentence_embeddings, alpha=0.5, beta=0.5):
    """Tversky index (generalizes Dice/Jaccard).

    ``alpha`` and ``beta`` weight the one-sided differences of the query
    and the sentence embedding respectively.
    """
    scores = []
    for emb in sentence_embeddings:
        common = np.sum(np.minimum(query_embedding, emb))
        only_query = np.sum(np.maximum(0, query_embedding - emb))
        only_sentence = np.sum(np.maximum(0, emb - query_embedding))
        scores.append(common / (common + alpha * only_query + beta * only_sentence))
    return np.array(scores)
132+
def sorensen_similarity(query_embedding, sentence_embeddings):
    """Alias: Sorensen similarity is the same measure as dice_similarity."""
    return dice_similarity(query_embedding, sentence_embeddings)
135+
def overlap_coefficient(query_embedding, sentence_embeddings):
    """Overlap coefficient: elementwise-min overlap / min(sum(q), sum(s))."""
    return np.array([
        np.sum(np.minimum(query_embedding, emb))
        / np.minimum(np.sum(query_embedding), np.sum(emb))
        for emb in sentence_embeddings
    ])
138+
def edit_distance(query_embedding, sentence_embeddings):
    """Placeholder: edit distance is a string metric, not applicable to
    embeddings; falls back to the module's negated Euclidean distance.

    Bug fix: the original returned ``-euclidean_distance(...)``, double
    negating (``euclidean_distance`` already negates) and thus inverting
    the similarity ranking in ``find_similar_sentences``.
    """
    return euclidean_distance(query_embedding, sentence_embeddings)
142+
def sokal_michener_distance(query_embedding, sentence_embeddings):
    """Simplified Sokal-Michener distance: negated mean absolute difference.

    Bug fix: the original returned the positive distance, but
    ``find_similar_sentences`` ranks by the LARGEST score and every other
    *_distance helper in this module negates its distances; without the
    negation this measure selected the LEAST similar sentences.
    """
    dim = len(query_embedding)
    return -np.array([np.sum(np.abs(query_embedding - emb)) / dim for emb in sentence_embeddings])
146+
def tschebyshev_distance(query_embedding, sentence_embeddings):
    """Alias: "Tschebyshev" is an alternate transliteration of Chebyshev."""
    return chebyshev_distance(query_embedding, sentence_embeddings)
149+
def dice_hamming_distance(query_embedding, sentence_embeddings):
    """Average of Dice similarity and (negated) Hamming distance.

    NOTE(review): the two components live on very different scales — Dice in
    [0, 1], Hamming down to -dim — so the Hamming term typically dominates
    the average.
    """
    dice_scores = dice_similarity(query_embedding, sentence_embeddings)
    hamming_scores = hamming_distance(query_embedding, sentence_embeddings)
    return (dice_scores + hamming_scores) / 2
154+
def improved_jensen_distance(query_embedding, sentence_embeddings, epsilon=1e-10):
    """Jensen-Shannon distance between the normalized query and each sentence
    embedding, negated so higher = more similar.

    Bug fix: the original returned positive distances and mapped NaN/inf to
    the float maximum, yet ``find_similar_sentences`` ranks by the LARGEST
    score — so it selected the least similar sentences and ranked degenerate
    (NaN) rows first. Scores are now negated, matching the other *_distance
    helpers, and degenerate values map to the most negative finite float so
    they rank last.

    NOTE(review): Jensen-Shannon assumes non-negative, distribution-like
    inputs; raw embeddings can contain negatives, so the normalization below
    may not produce valid distributions — verify upstream.
    """
    # Small epsilon avoids division by zero during normalization.
    query = np.asarray(query_embedding) + epsilon
    sentences = np.asarray(sentence_embeddings) + epsilon

    # Normalize to unit sum so the vectors behave like distributions.
    query = query / np.sum(query)
    sentences = sentences / np.sum(sentences, axis=1, keepdims=True)

    distances = np.array([jensenshannon(query, emb) for emb in sentences])

    # Negate: higher score = more similar; degenerate values rank last.
    scores = -distances
    return np.nan_to_num(scores, nan=-np.finfo(float).max, neginf=-np.finfo(float).max)
174+
def log_likelihood(query_embedding, sentence_embeddings):
    """Placeholder: a true log-likelihood requires probability distributions,
    which raw embeddings are not; falls back to cosine similarity."""
    # Placeholder: Requires probability distributions
    return cosine_similarity(query_embedding, sentence_embeddings)
178+
# Menu of similarity measures: menu key -> (display name, scoring function).
similarity_functions = {
    '1': ('Cosine Similarity', cosine_similarity),
    '2': ('Euclidean Distance', euclidean_distance),
    '3': ('Manhattan Distance', manhattan_distance),
    '4': ('Dot Product', dot_product),
    '5': ('Pearson Correlation', pearson_correlation),
    '6': ('Jaccard Similarity', jaccard_similarity),
    '7': ('Hamming Distance', hamming_distance),
    '8': ('Minkowski Distance', minkowski_distance),
    '9': ('Chebyshev Distance', chebyshev_distance),
    '10': ('Canberra Distance', canberra_distance),
    '11': ('Bray-Curtis Distance', bray_curtis_distance),
    '12': ('Dice Similarity', dice_similarity),
    '13': ('Tanimoto Similarity', tanimoto_similarity),
    '14': ('Spearman Correlation', spearman_correlation),
    '15': ('Wasserstein Distance', wasserstein_distance),
    '16': ('KL Divergence', kl_divergence),
    '17': ('Cosine Distance', cosine_distance),
    '18': ('Sorensen-Dice Coefficient', sorensen_dice_coefficient),
    '19': ('Levenshtein Distance', levenshtein_distance),
    '20': ('Jaro-Winkler Distance', jaro_winkler_distance),
    '21': ('Rogers-Tanimoto Similarity', rogers_tanimoto_similarity),
    '22': ('Yule Similarity', yule_similarity),
    '23': ('Kulczynski Similarity', kulczynski_similarity),
    '24': ('Gower Distance', gower_distance),
    '25': ('Russell-Rao Similarity', russell_rao_similarity),
    '26': ('Matching Coefficient', matching_coefficient),
    '27': ('Tversky Index', tversky_index),
    '28': ('Sørensen Similarity', sorensen_similarity),
    '29': ('Overlap Coefficient', overlap_coefficient),
    '30': ('Edit Distance', edit_distance),
    '31': ('Sokal-Michener Distance', sokal_michener_distance),
    '32': ('Tschebyshev Distance', tschebyshev_distance),
    '33': ('Dice-Hamming Distance', dice_hamming_distance),
    '34': ('Jensen Distance', improved_jensen_distance),
    '35': ('Log Likelihood', log_likelihood),
}
216+
217+
def find_similar_sentences(query, file_path, similarity_func, top_n=5):
    """Rank the sentences in *file_path* against *query*.

    Returns the ``top_n`` (sentence, score) pairs with the highest scores
    first; ``similarity_func`` must return higher values for more similar
    pairs.
    """
    model = load_or_download_model()
    sentences = load_file(file_path)
    sentence_embeddings = model.encode(sentences)
    # encode() on a single-item list returns a batch; take the lone row.
    query_embedding = model.encode([query])[0]

    scores = similarity_func(query_embedding, sentence_embeddings)
    ranked = sorted(zip(sentences, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
228+
def validate_file_path(file_path):
    """Append a ``.txt`` suffix when missing and verify the file exists.

    Raises:
        FileNotFoundError: if the resulting path does not exist on disk.
    """
    path = file_path if file_path.endswith('.txt') else file_path + '.txt'
    if not os.path.exists(path):
        raise FileNotFoundError(f"The file '{path}' does not exist.")
    return path
235+
def main():
    """Interactive CLI: prompt for a query, a sentence file, and a similarity
    measure, then print the top matching sentences."""
    print("Welcome to the Comprehensive Sentence Similarity Search Tool!")

    query = input("Enter your query: ")

    # Keep prompting until the user supplies a path that resolves to a file.
    while True:
        try:
            raw_path = input("Enter the path to your text file without extension: ")
            file_path = validate_file_path(raw_path)
            break
        except FileNotFoundError as e:
            print(f"Error: {str(e)} Please try again.")

    print("\nChoose a similarity measurement method:")
    for key, (name, _) in similarity_functions.items():
        print(f"{key}. {name}")

    # Keep prompting until the user picks a valid menu key.
    while True:
        choice = input("Enter the number of your choice: ")
        if choice in similarity_functions:
            break
        print("Invalid choice. Please try again.")
    similarity_name, similarity_func = similarity_functions[choice]

    try:
        results = find_similar_sentences(query, file_path, similarity_func)
        print(f"\nTop 5 similar sentences for query: '{query}' using {similarity_name}\n")
        for sentence, score in results:
            print(f"Similarity Score: {score:.4f}")
            print(f"Sentence: {sentence}\n")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()