Skip to content

Commit 2dd3790

Browse files
committed
added many searching algorithms #1333
1 parent 5974826 commit 2dd3790

File tree

1 file changed

+270
-0
lines changed

1 file changed

+270
-0
lines changed

NLP/multi_similarity_tool.py

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
import os
2+
import numpy as np
3+
from scipy.spatial.distance import euclidean, cityblock, minkowski, chebyshev, canberra, braycurtis, jensenshannon, hamming # Add other distance functions as needed
4+
from sentence_transformers import SentenceTransformer, util # Import SentenceTransformer
5+
from scipy.stats import pearsonr, spearmanr
6+
from scipy.special import kl_div
7+
from scipy.spatial.distance import jensenshannon
8+
9+
10+
# Sentence-transformer model identifier to download/load.
MODEL_NAME = 'all-MiniLM-L6-v2'
# Local directory used to cache the downloaded model.
MODEL_FOLDER = 'model'
12+
13+
def load_file(file_path):
    """Read a UTF-8 text file and return its non-blank lines, whitespace-stripped."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        sentences = []
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                sentences.append(text)
        return sentences
16+
17+
def load_or_download_model():
    """Return the SentenceTransformer model, downloading and caching it locally on first use."""
    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
    # Guard clause: download-and-cache path first, cached path last.
    if not os.path.exists(model_path):
        print(f"Downloading model {MODEL_NAME}")
        model = SentenceTransformer(MODEL_NAME)
        os.makedirs(MODEL_FOLDER, exist_ok=True)
        model.save(model_path)
        print(f"Model saved to {model_path}")
        return model
    print(f"Loading model from {model_path}")
    return SentenceTransformer(model_path)
29+
30+
def cosine_similarity(query_embedding, sentence_embeddings):
    """Cosine similarity of the query against each sentence embedding (higher = more similar)."""
    score_matrix = util.pytorch_cos_sim(query_embedding, sentence_embeddings)
    return score_matrix[0]
32+
33+
def euclidean_distance(query_embedding, sentence_embeddings):
    """Negated Euclidean (L2) distance so that larger scores mean more similar."""
    distances = [euclidean(query_embedding, emb) for emb in sentence_embeddings]
    return -np.asarray(distances)
35+
36+
def manhattan_distance(query_embedding, sentence_embeddings):
    """Negated Manhattan (L1 / city-block) distance; higher scores mean closer vectors."""
    distances = []
    for emb in sentence_embeddings:
        distances.append(cityblock(query_embedding, emb))
    return -np.array(distances)
38+
39+
def dot_product(query_embedding, sentence_embeddings):
    """Unnormalized dot-product similarity of the query against every sentence embedding."""
    return np.matmul(sentence_embeddings, query_embedding.T).flatten()
41+
42+
def pearson_correlation(query_embedding, sentence_embeddings):
    """Pearson correlation coefficient between the query and each sentence embedding."""
    flat_query = query_embedding.flatten()
    coefficients = [pearsonr(flat_query, emb.flatten())[0] for emb in sentence_embeddings]
    return np.array(coefficients)
44+
45+
def jaccard_similarity(query_embedding, sentence_embeddings):
    """Continuous (fuzzy) Jaccard: sum of element-wise minima over sum of element-wise maxima."""
    scores = []
    for emb in sentence_embeddings:
        overlap = np.minimum(query_embedding, emb).sum()
        union = np.maximum(query_embedding, emb).sum()
        scores.append(overlap / union)
    return np.array(scores)
48+
49+
def hamming_distance(query_embedding, sentence_embeddings):
    """Negated count of positions whose values differ (continuous analogue of Hamming)."""
    mismatch_counts = [np.count_nonzero(query_embedding != emb) for emb in sentence_embeddings]
    return -np.array(mismatch_counts)
52+
53+
def minkowski_distance(query_embedding, sentence_embeddings, p=3):
    """Negated Minkowski (order-p) distance; higher scores mean closer embeddings."""
    return np.negative([minkowski(query_embedding, emb, p) for emb in sentence_embeddings])
55+
56+
def chebyshev_distance(query_embedding, sentence_embeddings):
    """Negated Chebyshev (max per-dimension) distance; higher scores mean closer vectors."""
    distances = []
    for emb in sentence_embeddings:
        distances.append(chebyshev(query_embedding, emb))
    return -np.array(distances)
58+
59+
def canberra_distance(query_embedding, sentence_embeddings):
    """Negated Canberra distance (weighted L1, sensitive near zero); higher = closer."""
    distances = [canberra(query_embedding, emb) for emb in sentence_embeddings]
    return np.negative(distances)
61+
62+
def bray_curtis_distance(query_embedding, sentence_embeddings):
    """Negated Bray-Curtis dissimilarity; higher scores mean more similar vectors."""
    values = [braycurtis(query_embedding, emb) for emb in sentence_embeddings]
    return -np.asarray(values)
64+
65+
def mahalanobis_distance(query_embedding, sentence_embeddings):
    # Placeholder: a true Mahalanobis distance needs a covariance matrix,
    # so this falls back to negated Euclidean distance (higher = more similar).
    fallback = [euclidean(query_embedding, emb) for emb in sentence_embeddings]
    return -np.asarray(fallback)
68+
69+
def dice_similarity(query_embedding, sentence_embeddings):
    """Continuous Dice coefficient: twice the element-wise overlap over the total mass."""
    query_total = np.sum(query_embedding)
    scores = []
    for emb in sentence_embeddings:
        scores.append(2 * np.sum(np.minimum(query_embedding, emb)) / (query_total + np.sum(emb)))
    return np.array(scores)
71+
72+
def tanimoto_similarity(query_embedding, sentence_embeddings):
    """Continuous Tanimoto coefficient (same form as fuzzy Jaccard): min-sum over max-sum."""
    results = []
    for emb in sentence_embeddings:
        results.append(np.minimum(query_embedding, emb).sum() / np.maximum(query_embedding, emb).sum())
    return np.array(results)
74+
75+
def spearman_correlation(query_embedding, sentence_embeddings):
    """Spearman rank correlation between the query and each sentence embedding."""
    flat_query = query_embedding.flatten()
    rhos = [spearmanr(flat_query, emb.flatten())[0] for emb in sentence_embeddings]
    return np.array(rhos)
77+
78+
def wasserstein_distance(query_embedding, sentence_embeddings):
    # Crude 1-D approximation: L1 distance between sorted value profiles, negated
    # so that higher scores mean more similar (a proper EMD needs more machinery).
    sorted_query = np.sort(query_embedding)
    gaps = [np.abs(sorted_query - np.sort(emb)).sum() for emb in sentence_embeddings]
    return -np.array(gaps)
81+
82+
def kl_divergence(query_embedding, sentence_embeddings):
    # Negated pointwise KL divergence; the tiny offset guards against log(0).
    # NOTE(review): kl_div assumes non-negative inputs — embedding values can be
    # negative, so treat results for raw embeddings with caution.
    eps = 1e-10
    shifted_query = query_embedding + eps
    totals = [np.sum(kl_div(shifted_query, emb + eps)) for emb in sentence_embeddings]
    return -np.array(totals)
84+
85+
86+
def haversine_distance(query_embedding, sentence_embeddings):
    """Placeholder: true haversine applies to lat/lon pairs, not embeddings.

    Fix: the previous version returned ``-euclidean_distance(...)``, but
    ``euclidean_distance`` is itself already negated, so this produced POSITIVE
    distances and the caller's descending sort ranked the farthest sentences as
    most similar. Return negated Euclidean distances directly (higher = closer).
    """
    return -np.array([euclidean(query_embedding, emb) for emb in sentence_embeddings])
89+
90+
def cosine_distance(query_embedding, sentence_embeddings):
    """Negated cosine distance, i.e. cosine similarity minus one.

    Fix: the previous version returned the raw distance ``1 - cos``, where a
    HIGHER value means LESS similar — under the caller's descending sort, menu
    option 17 ranked the most dissimilar sentences first. Negating restores the
    module-wide convention that larger scores mean more similar.
    """
    return cosine_similarity(query_embedding, sentence_embeddings) - 1
92+
93+
def sorensen_dice_coefficient(query_embedding, sentence_embeddings):
    """Sørensen–Dice coefficient: twice the element-wise overlap over the total mass."""
    return np.array([
        2 * np.sum(np.minimum(query_embedding, emb)) / (np.sum(query_embedding) + np.sum(emb))
        for emb in sentence_embeddings
    ])
95+
96+
def levenshtein_distance(query_embedding, sentence_embeddings):
    """Placeholder: Levenshtein is a string metric, not applicable to embeddings.

    Fix: the previous version negated ``euclidean_distance``, which is already
    negated, so larger distances scored as MORE similar under the descending
    sort. Return negated Euclidean distances directly (higher = closer).
    """
    return -np.array([euclidean(query_embedding, emb) for emb in sentence_embeddings])
99+
100+
def jaro_winkler_distance(query_embedding, sentence_embeddings):
    """Placeholder: Jaro-Winkler is a string metric, not applicable to embeddings.

    Fix: the previous version negated the already-negated ``euclidean_distance``,
    inverting the ranking. Return negated Euclidean distances (higher = closer).
    """
    return -np.array([euclidean(query_embedding, emb) for emb in sentence_embeddings])
103+
104+
def rogers_tanimoto_similarity(query_embedding, sentence_embeddings):
    # Simplified for continuous values: same min-sum / max-sum form as fuzzy Jaccard.
    values = []
    for emb in sentence_embeddings:
        values.append(np.minimum(query_embedding, emb).sum() / np.maximum(query_embedding, emb).sum())
    return np.array(values)
107+
108+
def yule_similarity(query_embedding, sentence_embeddings):
    # Placeholder: Yule's coefficient is defined for binary data, so this
    # simply falls back to cosine similarity.
    fallback_scores = cosine_similarity(query_embedding, sentence_embeddings)
    return fallback_scores
111+
112+
def kulczynski_similarity(query_embedding, sentence_embeddings):
    """Continuous Kulczynski form: element-wise overlap over the smaller vector's total."""
    query_total = np.sum(query_embedding)
    scores = []
    for emb in sentence_embeddings:
        scores.append(np.sum(np.minimum(query_embedding, emb)) / np.minimum(query_total, np.sum(emb)))
    return np.array(scores)
114+
115+
def gower_distance(query_embedding, sentence_embeddings):
    # Simplified Gower: negated mean absolute difference per dimension.
    means = []
    for emb in sentence_embeddings:
        means.append(np.abs(query_embedding - emb).mean())
    return -np.array(means)
118+
119+
def russell_rao_similarity(query_embedding, sentence_embeddings):
    # Simplified for continuous values: mean of the element-wise minima.
    dim = len(query_embedding)
    return np.array([np.minimum(query_embedding, emb).sum() / dim for emb in sentence_embeddings])
122+
123+
def ochiai_similarity(query_embedding, sentence_embeddings):
    """Continuous Ochiai coefficient: overlap over the geometric mean of the totals."""
    query_total = np.sum(query_embedding)
    scores = []
    for emb in sentence_embeddings:
        scores.append(np.sum(np.minimum(query_embedding, emb)) / np.sqrt(query_total * np.sum(emb)))
    return np.array(scores)
125+
126+
def matching_coefficient(query_embedding, sentence_embeddings):
    # Simplified for continuous values: fraction of positions with exactly equal values.
    dim = len(query_embedding)
    fractions = [np.sum(query_embedding == emb) / dim for emb in sentence_embeddings]
    return np.array(fractions)
129+
130+
def tversky_index(query_embedding, sentence_embeddings, alpha=0.5, beta=0.5):
    """Tversky index generalizing Jaccard/Dice; alpha and beta weight the asymmetric parts."""
    scores = []
    for emb in sentence_embeddings:
        overlap = np.sum(np.minimum(query_embedding, emb))
        only_query = np.sum(np.maximum(0, query_embedding - emb))
        only_sentence = np.sum(np.maximum(0, emb - query_embedding))
        scores.append(overlap / (overlap + alpha * only_query + beta * only_sentence))
    return np.array(scores)
132+
133+
def sorensen_similarity(query_embedding, sentence_embeddings):
    """Sørensen similarity — the continuous Dice coefficient, inlined."""
    query_total = np.sum(query_embedding)
    return np.array([
        2 * np.sum(np.minimum(query_embedding, emb)) / (query_total + np.sum(emb))
        for emb in sentence_embeddings
    ])
135+
136+
def overlap_coefficient(query_embedding, sentence_embeddings):
    """Overlap (Szymkiewicz-Simpson) coefficient: overlap over the smaller vector's total."""
    results = []
    for emb in sentence_embeddings:
        numerator = np.sum(np.minimum(query_embedding, emb))
        denominator = np.minimum(np.sum(query_embedding), np.sum(emb))
        results.append(numerator / denominator)
    return np.array(results)
138+
139+
def edit_distance(query_embedding, sentence_embeddings):
    """Placeholder: edit distance is a string metric, not applicable to embeddings.

    Fix: the previous version negated ``euclidean_distance``, which already
    returns negated distances, so the farthest sentences scored highest under
    the descending sort. Return negated Euclidean distances (higher = closer).
    """
    return -np.array([euclidean(query_embedding, emb) for emb in sentence_embeddings])
142+
143+
def sokal_michener_distance(query_embedding, sentence_embeddings):
    """Simplified Sokal-Michener for continuous values: mean absolute difference.

    Fix: every other distance in this module is negated so that a descending
    sort ranks the closest sentences first; the previous version returned the
    raw (positive) distance and therefore inverted the ranking. Negate it.
    """
    dim = len(query_embedding)
    return -np.array([np.sum(np.abs(query_embedding - emb)) / dim for emb in sentence_embeddings])
146+
147+
def tschebyshev_distance(query_embedding, sentence_embeddings):
    """Alias spelling of Chebyshev distance: negated max per-dimension difference."""
    return -np.array([chebyshev(query_embedding, emb) for emb in sentence_embeddings])
149+
150+
def dice_hamming_distance(query_embedding, sentence_embeddings):
    # Ad-hoc hybrid score: average of the Dice similarity and the (negated) Hamming count.
    # NOTE(review): the two terms live on very different scales (Dice in [0,1],
    # Hamming up to -dim) — confirm this blend is intentional.
    dice_scores = dice_similarity(query_embedding, sentence_embeddings)
    hamming_scores = hamming_distance(query_embedding, sentence_embeddings)
    return (dice_scores + hamming_scores) / 2
154+
155+
def improved_jensen_distance(query_embedding, sentence_embeddings, epsilon=1e-10):
    """Negated Jensen-Shannon distance between the normalized query and each sentence.

    Fix: the previous version returned raw distances (smaller = closer) into the
    caller's DESCENDING sort, so menu option 34 ranked the least similar
    sentences first. Scores are now negated so higher means more similar,
    matching every other measure in this module; undefined comparisons map to
    the worst possible score.

    NOTE(review): the sum-normalization treats embeddings as probability
    vectors, which assumes non-negative values — sentence-transformer
    embeddings can be negative, in which case jensenshannon may yield NaN
    (handled below). Confirm this measure is meaningful for raw embeddings.
    """
    # Small offset avoids division by zero during normalization.
    query = query_embedding + epsilon
    candidates = sentence_embeddings + epsilon

    # Normalize to unit total mass so inputs resemble probability distributions.
    query = query / np.sum(query)
    candidates = candidates / np.sum(candidates, axis=1, keepdims=True)

    distances = np.array([jensenshannon(query, emb) for emb in candidates])

    # NaN/inf (undefined comparisons) become the largest finite distance,
    # which after negation is the lowest similarity score.
    distances = np.nan_to_num(distances, nan=np.finfo(float).max, posinf=np.finfo(float).max)

    return -distances
174+
175+
def log_likelihood(query_embedding, sentence_embeddings):
    # Placeholder: Requires probability distributions
    # A real log-likelihood needs probabilistic inputs; this simply delegates to
    # cosine similarity so the menu option still produces a usable ranking.
    return cosine_similarity(query_embedding, sentence_embeddings)
178+
179+
# Menu of similarity/distance measures, keyed by the string the user types.
# Each value is (display name, scoring function). Most distance measures are
# negated so a descending sort ranks closest first; NOTE(review): a few entries
# (e.g. Cosine Distance, Jensen Distance) return raw distances — confirm intent.
similarity_functions = {
    '1': ('Cosine Similarity', cosine_similarity),
    '2': ('Euclidean Distance', euclidean_distance),
    '3': ('Manhattan Distance', manhattan_distance),
    '4': ('Dot Product', dot_product),
    '5': ('Pearson Correlation', pearson_correlation),
    '6': ('Jaccard Similarity', jaccard_similarity),
    '7': ('Hamming Distance', hamming_distance),
    '8': ('Minkowski Distance', minkowski_distance),
    '9': ('Chebyshev Distance', chebyshev_distance),
    '10': ('Canberra Distance', canberra_distance),
    '11': ('Bray-Curtis Distance', bray_curtis_distance),
    '12': ('Dice Similarity', dice_similarity),
    '13': ('Tanimoto Similarity', tanimoto_similarity),
    '14': ('Spearman Correlation', spearman_correlation),
    '15': ('Wasserstein Distance', wasserstein_distance),
    '16': ('KL Divergence', kl_divergence),
    '17': ('Cosine Distance', cosine_distance),
    '18': ('Sorensen-Dice Coefficient', sorensen_dice_coefficient),
    '19': ('Levenshtein Distance', levenshtein_distance),
    '20': ('Jaro-Winkler Distance', jaro_winkler_distance),
    '21': ('Rogers-Tanimoto Similarity', rogers_tanimoto_similarity),
    '22': ('Yule Similarity', yule_similarity),
    '23': ('Kulczynski Similarity', kulczynski_similarity),
    '24': ('Gower Distance', gower_distance),
    '25': ('Russell-Rao Similarity', russell_rao_similarity),
    '26': ('Matching Coefficient', matching_coefficient),
    '27': ('Tversky Index', tversky_index),
    '28': ('Sørensen Similarity', sorensen_similarity),
    '29': ('Overlap Coefficient', overlap_coefficient),
    '30': ('Edit Distance', edit_distance),
    '31': ('Sokal-Michener Distance', sokal_michener_distance),
    '32': ('Tschebyshev Distance', tschebyshev_distance),
    '33': ('Dice-Hamming Distance', dice_hamming_distance),
    '34': ('Jensen Distance', improved_jensen_distance),
    '35': ('Log Likelihood', log_likelihood),
}
216+
217+
218+
def find_similar_sentences(query, file_path, similarity_func, top_n=5):
    """Embed the query and the file's sentences, score them, and return the
    top_n (sentence, score) pairs, best first."""
    model = load_or_download_model()
    sentences = load_file(file_path)
    sentence_embeddings = model.encode(sentences)
    # encode() on a one-element list returns a batch; take the single vector.
    query_embedding = model.encode([query])[0]

    scores = similarity_func(query_embedding, sentence_embeddings)
    ranked = sorted(zip(sentences, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
228+
229+
def validate_file_path(file_path):
    """Normalize the path to end in .txt and verify it exists.

    Returns the normalized path; raises FileNotFoundError if it does not exist.
    """
    path = file_path if file_path.endswith('.txt') else file_path + '.txt'
    if not os.path.exists(path):
        raise FileNotFoundError(f"The file '{path}' does not exist.")
    return path
235+
236+
def main():
    """Interactive entry point: prompt for a query, a text file, and a measure,
    then print the top matching sentences."""
    print("Welcome to the Comprehensive Sentence Similarity Search Tool!")

    query = input("Enter your query: ")

    # Keep asking until the path resolves to an existing .txt file.
    while True:
        raw_path = input("Enter the path to your text file without extension: ")
        try:
            file_path = validate_file_path(raw_path)
        except FileNotFoundError as err:
            print(f"Error: {str(err)} Please try again.")
        else:
            break

    print("\nChoose a similarity measurement method:")
    for number, (label, _) in similarity_functions.items():
        print(f"{number}. {label}")

    # Keep asking until the user picks a valid menu number.
    while True:
        choice = input("Enter the number of your choice: ")
        if choice in similarity_functions:
            similarity_name, similarity_func = similarity_functions[choice]
            break
        print("Invalid choice. Please try again.")

    try:
        results = find_similar_sentences(query, file_path, similarity_func)
        print(f"\nTop 5 similar sentences for query: '{query}' using {similarity_name}\n")
        for sentence, score in results:
            print(f"Similarity Score: {score:.4f}")
            print(f"Sentence: {sentence}\n")
    except Exception as err:
        # Broad catch at the top-level boundary: report and exit gracefully.
        print(f"An error occurred: {str(err)}")
268+
269+
# Run the interactive tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)