-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTOP-K_Most_Similar_Documents.py
More file actions
58 lines (50 loc) · 2.28 KB
/
TOP-K_Most_Similar_Documents.py
File metadata and controls
58 lines (50 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re
from collections import Counter
from itertools import combinations

import numpy as np
def cosine_similarity(doc1, doc2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        doc1: First vector (any sequence accepted by ``np.dot``).
        doc2: Second vector, same length as ``doc1``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms. When either vector has zero norm the similarity is
        undefined, so ``0.0`` is returned instead of dividing by zero.
    """
    norm_product = np.linalg.norm(doc1) * np.linalg.norm(doc2)
    if norm_product == 0:
        # An all-zero vector would otherwise yield 0/0 -> nan (plus a
        # RuntimeWarning), and nan values cannot be ranked reliably.
        return 0.0
    return np.dot(doc1, doc2) / norm_product
# Interactive script: read N text files, build term-count vectors over the
# combined vocabulary, and print the K most cosine-similar document pairs.

# --- Collect the document names from the user --------------------------------
number_of_documents = int(input("How many documents?"))
print("Enter the documents' full names (ex. doc.txt)\n")
list_of_documents = [input() for _ in range(number_of_documents)]

# --- Read and tokenize every document ----------------------------------------
documents_with_contents = {}  # file name -> list of lower-cased word tokens
discrete_words = []           # vocabulary, in order of first appearance
seen_words = set()            # mirrors discrete_words for O(1) membership tests
for document_name in list_of_documents:
    # Keep the try body down to the only statements that can raise
    # FileNotFoundError.
    try:
        with open(document_name, 'r') as current_document:
            text = current_document.read()
    except FileNotFoundError:
        print(
            f'File "{document_name}" was not found; '
            'files must be placed in the same folder as the program'
        )
        # SystemExit is what exit() raises, without relying on the helper
        # that the site module injects into builtins.
        raise SystemExit(-1)

    # Tokenize first, then lower-case each token.
    words = [word.lower() for word in re.findall(r'\w+', text)]
    for word in words:
        if word not in seen_words:
            seen_words.add(word)
            discrete_words.append(word)
    documents_with_contents[document_name] = words

# --- Turn each document into a term-count vector over the vocabulary ---------
vectorized_documents = {}
for document_name, words in documents_with_contents.items():
    word_counts = Counter(words)  # one pass instead of list.count() per word
    vectorized_documents[document_name] = [
        word_counts[word] for word in discrete_words
    ]

# --- Score every unordered pair of distinct documents ------------------------
# Pairing the dict keys (unique names, in input order) rather than indexing
# list_of_documents keeps the pairs correct when a name was entered twice.
similarities_dict = {}
for first_name, second_name in combinations(documents_with_contents, 2):
    similarities_dict[f'({first_name}, {second_name})'] = cosine_similarity(
        vectorized_documents[first_name],
        vectorized_documents[second_name],
    )
similarities_sorted = sorted(
    similarities_dict, key=similarities_dict.get, reverse=True
)

# --- Report the K best-scoring pairs ------------------------------------------
top_k = int(input("Enter a number K to print the top_K most similar documents"))
# max(..., 0) makes a negative K print nothing instead of slicing from the end.
for rank, pair in enumerate(similarities_sorted[:max(top_k, 0)], start=1):
    print(f'{rank}.{pair}: {similarities_dict[pair]}')