from index import stemmer, new_index_of_index, doc_ids, stop_words, has_digits, base_directory
from collections import defaultdict
import math
import re
import time
import os
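# Interactive ranked retrieval over the on-disk inverted index (storage.csv):
# the query is tokenized and stemmed the same way as the documents, token
# (and 2-/3-gram) postings are read via the byte offsets in new_index_of_index,
# and the matching documents are ranked by a tf-idf score.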
# def find_shared_docs(sorted_lists): # finds shared documents between tokens
#     # BOOLEAN SEARCH
#     # DONT USE - BROKEN: after the first merge, comb holds only doc ids (the
#     # weights are dropped), so later passes still step through it two at a
#     # time and skip half the documents
#     if not sorted_lists:
#         return []
#     if len(sorted_lists) == 1:
#         return sorted_lists[0]
#     curr = sorted_lists[0]
#     for l in sorted_lists[1:]:
#         comb = []
#         i1 = 0
#         i2 = 0
#         while(i1 < len(curr) and i2 < len(l)):
#             if int(curr[i1]) == int(l[i2]):
#                 comb.append(curr[i1])
#                 i1+=2
#                 i2+=2
#             else:
#                 if int(curr[i1]) < int(l[i2]):
#                     i1+=2
#                 else:
#                     i2+=2
#         curr = comb
#     return curr
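# A minimal working sketch of the same AND-intersection, assuming each postings
# list is a flat [doc_id, weight, doc_id, weight, ...] list sorted by integer
# doc id (the format compute_k_best_docs below expects). It keeps the
# (doc_id, weight) pairs of the first list for the documents present in every
# list. Not wired into search(); kept only as a reference for the broken
# version above.
def find_shared_docs_fixed(sorted_lists):
    if not sorted_lists:
        return []
    # convert each flat postings list into a list of (doc_id, weight) pairs
    as_pairs = [[(int(l[i]), l[i + 1]) for i in range(0, len(l), 2)] for l in sorted_lists]
    curr = as_pairs[0]
    for other in as_pairs[1:]:
        comb = []
        i1 = i2 = 0
        while i1 < len(curr) and i2 < len(other):
            if curr[i1][0] == other[i2][0]:
                comb.append(curr[i1])  # shared document, keep the first list's pair
                i1 += 1
                i2 += 1
            elif curr[i1][0] < other[i2][0]:
                i1 += 1
            else:
                i2 += 1
        curr = comb
    return curr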
def compute_k_best_docs(token_docs, k): # calculate tf-idf scores
    rank = defaultdict(float)
    for token_doc in token_docs:
        # the even items are the document ids
        # the odd items are the weights (used as the term frequency tf below)
        # score += (1 + log10(tf)) * log10(N / df), where N is the total number
        # of documents and df is the number of documents containing the token
        for i in range(0, len(token_doc), 2):
            rank[int(token_doc[i])] += (1 + math.log(int(token_doc[i+1]), 10)) * math.log(len(doc_ids) / (len(token_doc)//2), 10)
    rank_list = [] # turn scores into a list so they can be sorted
    for d, r in rank.items():
        rank_list.append((r, d))
    rank_list.sort(reverse=True)
    print(len(rank_list), "results") # how many possible matches there were
    if len(rank_list) < k:
        return rank_list
    return rank_list[:k] # return the k best (score, doc_id) pairs
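# Worked example of the scoring above with made-up numbers: with 1000 indexed
# documents, a token whose postings entry is ["12", "3", "47", "1"] has df = 2, so
#   doc 12: (1 + log10(3)) * log10(1000 / 2) ≈ 1.477 * 2.699 ≈ 3.99
#   doc 47: (1 + log10(1)) * log10(1000 / 2) ≈ 1.000 * 2.699 ≈ 2.70
# and doc 12 would be ranked first.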
def search():
    global stop_words
    with open(os.path.join(base_directory, "storage.csv")) as f:
        # inverted index is opened but not loaded into memory;
        # new_index_of_index maps each token to its byte offset in this file
        print('Enter exit* to quit')
        while True:
            try:
                query = input(">> ")
                start = time.time() # used to check how long retrieval takes
                if query == "exit*":
                    return
                content = re.findall(r'[a-zA-Z0-9]+', query) # the query is tokenized the same way as the documents were
                non_stop = 0
                use_stop_words = True
                for word in content:
                    if word not in stop_words:
                        non_stop += 1
                if non_stop > 0.4 * len(content): # if more than 40% of the query is non-stop words, ignore the stop words
                    use_stop_words = False
                token_docs = []
                # each item in the list is a postings line from the inverted index file
                for i, token in enumerate(content):
                    token = token.lower()
                    if not use_stop_words and token in stop_words:
                        continue
                    token = stemmer.stem(token)
                    if token in new_index_of_index:
                        f.seek(new_index_of_index[token])
                        l = f.readline()
                        token_docs.append(l[:-1].split(',')[1:]) # drop the trailing newline and the first value, which is the token itself
                    if i < len(content) - 1:
                        next_tok = content[i+1].lower()
                        if next_tok not in stop_words and not has_digits(token): # 2-gram
                            next_tok = stemmer.stem(next_tok)
                            two_gram = token + next_tok
                            if two_gram in new_index_of_index:
                                f.seek(new_index_of_index[two_gram])
                                l = f.readline()
                                token_docs.append(l[:-1].split(',')[1:])
                            if i < len(content) - 2:
                                nextt = content[i+2].lower()
                                if nextt not in stop_words and (not has_digits(nextt) and not has_digits(next_tok)): # 3-gram
                                    nextt = stemmer.stem(nextt)
                                    three_gram = token + next_tok + nextt
                                    if three_gram in new_index_of_index:
                                        f.seek(new_index_of_index[three_gram])
                                        l = f.readline()
                                        token_docs.append(l[:-1].split(',')[1:])
                five_best = compute_k_best_docs(token_docs, 5)
                for d in five_best:
                    print(doc_ids[int(d[1])]) # d is a (score, doc_id) pair
                print(time.time() - start, "seconds")
            except Exception: # malformed input or a lookup error just restarts the prompt
                continue
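# Minimal entry point, assuming this module is meant to be run directly
# (e.g. `python search.py`) with storage.csv and the structures imported from
# index already available.
if __name__ == "__main__":
    search()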