-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIndexMerger.py
More file actions
96 lines (83 loc) · 2.48 KB
/
IndexMerger.py
File metadata and controls
96 lines (83 loc) · 2.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os,sys
import timeit
import glob
from collections import defaultdict
from heapq import heapify, heappush, heappop
# Directory prefix (note the trailing slash) for the merged output files.
folderToStore = "finalIndex/"

# Number of primary index files written so far; bumped by writeToPrimary().
indexFileCount = 0

# First word stored in each primary index file -> that file's number.
secondaryIndex = {}

# How many merged words are accumulated before a chunk is written out.
chunkSize = 100000

# Partial index files that the merge reads from.
indexFiles = glob.glob("./indexFiles/*")

# NOTE(review): this handle is never written to or closed in the visible
# code; it only creates/truncates the file - confirm before removing.
primaryIndex = open("primaryIndex.txt", "w")

# One flag per partial index file: the merge sets it to 1 while the file is
# being read and back to 0 once the file is exhausted.
completedFiles = [0 for _ in indexFiles]

# Per-file state, keyed by the file's position in indexFiles.
filePointers = {}        # open file objects
currentRowOfFile = {}    # most recently read row
words = {}               # that row split on "="

# Heap of the words currently waiting to be merged.
percolator = []

# Words merged since the last chunk was written.
total = 0

# Merged word -> postings string for the chunk being built. No default
# factory is given, so missing keys raise KeyError like a plain dict.
invertedIndex = defaultdict()
def writeToPrimary():
    """Write the current contents of invertedIndex to the next primary index file.

    Increments the module-level indexFileCount and writes one
    "<word>=<postings>" line per entry, in sorted word order, to
    folderToStore + "index<N>.txt". The first (smallest) word of the file is
    recorded in secondaryIndex, mapped to the file number N.

    invertedIndex itself is not modified; clearing it is left to the caller.
    An empty invertedIndex produces an empty file and no secondaryIndex entry.
    """
    # The counter is rebound here, so it must be declared global; without
    # this the augmented assignment raises UnboundLocalError.
    global indexFileCount
    indexFileCount += 1

    fileName = folderToStore + "index" + str(indexFileCount) + ".txt"
    sortedWords = sorted(invertedIndex)

    # The context manager guarantees the chunk is flushed and closed.
    with open(fileName, "w") as fp:
        fp.writelines(
            str(word) + "=" + invertedIndex[word] + "\n" for word in sortedWords
        )

    if sortedWords:
        secondaryIndex[sortedWords[0]] = indexFileCount
def writeToSecondary():
    """Write secondaryIndex to folderToStore + "secondaryIndex.txt".

    Emits one "<word> <file number>" line per entry, in sorted word order.
    """
    fileName = folderToStore + "secondaryIndex.txt"
    # The context manager guarantees the file is flushed and closed.
    with open(fileName, "w") as fp:
        for firstWord in sorted(secondaryIndex):
            fp.write(str(firstWord) + " " + str(secondaryIndex[firstWord]) + "\n")
# ---------------------------------------------------------------------------
# K-way merge of the partial index files into chunked primary index files.
#
# Every partial file contributes its current row; the smallest pending word
# is popped from the heap, the postings of every file currently positioned on
# that word are joined with ",", and those files are advanced by one row.
# After chunkSize merged words the chunk is written out with writeToPrimary().
# ---------------------------------------------------------------------------


def _nextRow(fp):
    """Return the next non-blank line of fp, stripped, or "" at end of file."""
    # Blank lines are skipped so that a stray empty line is not mistaken for
    # end of file (which would delete a file that still has unread entries).
    for line in iter(fp.readline, ""):
        line = line.strip()
        if line:
            return line
    return ""


def _advance(i):
    """Load the next row of index file i and queue its word; retire the file at EOF."""
    row = _nextRow(filePointers[i])
    currentRowOfFile[i] = row
    if row:
        # Split on the first "=" only, so postings that themselves contain
        # "=" are kept whole; a row without "=" yields empty postings.
        word, _, postings = row.partition("=")
        words[i] = [word, postings]
        if word not in _queuedWords:
            _queuedWords.add(word)
            heappush(percolator, word)
    else:
        # Exhausted: flag the file as done and delete the merged input.
        completedFiles[i] = 0
        filePointers[i].close()
        os.remove(indexFiles[i])


start = timeit.default_timer()
fileCount = 0

# Mirror of the heap's contents: O(1) duplicate check instead of scanning
# the heap list on every push.
_queuedWords = set(percolator)

for i in range(len(indexFiles)):
    try:
        filePointers[i] = open(indexFiles[i], "r")
    except (IOError, OSError):
        # Report how many files were opened before the failure, then surface
        # the real error instead of a later KeyError on filePointers[i].
        print("Could Open Files:  %d" % fileCount)
        raise
    fileCount += 1
    # Despite the name, 1 means "still being read"; 0 means exhausted.
    completedFiles[i] = 1
    _advance(i)

# A word stays queued for as long as some active file is positioned on it,
# so the heap is empty exactly when every file has been exhausted.
while percolator:
    word = heappop(percolator)
    _queuedWords.discard(word)
    total += 1
    for i in range(len(indexFiles)):
        if completedFiles[i] and words[i][0] == word:
            if word in invertedIndex:
                invertedIndex[word] += "," + words[i][1]
            else:
                invertedIndex[word] = words[i][1]
            _advance(i)
    # Flush only after the word has been merged from every file, so one
    # word's postings are never split across two primary index files.
    if total == chunkSize:
        total = 0
        writeToPrimary()
        invertedIndex.clear()

# Write whatever remains in the last (partial) chunk, then the word -> file map.
writeToPrimary()
writeToSecondary()

stop = timeit.default_timer()
# Single-argument print(...) behaves the same under Python 2 and 3; the
# double space reproduces the output of the original comma-separated prints.
print("Time for Merging: %s  seconds." % (stop - start))
mins = float(stop - start) / float(60)
print("Time for Merging: %s  Minutes." % mins)
hrs = float(mins) / float(60)
print("Time for Merging: %s  Hours." % hrs)
print("Check the External File(s) Now!")