-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSearch_using_walk_algorithm.py
More file actions
277 lines (212 loc) · 11.6 KB
/
Search_using_walk_algorithm.py
File metadata and controls
277 lines (212 loc) · 11.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
from suffix_trees import STree
from collections import defaultdict
import time
import math
import sys
import bisect
def Build_suffix_tree():
    """Build the module-level suffix tree from the file named in ``sys.argv[1]``.

    Every line of the input file that does not start with ``>`` is stripped
    of surrounding whitespace and appended to one text (lines starting with
    ``>`` are skipped -- presumably FASTA headers; confirm against the input
    format). The text is handed to ``STree.STree`` and the result is stored
    in the global name ``tree``.

    Raises:
        SystemExit: if no input file path was passed on the command line.
    """
    global tree
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("Usage: Search_using_walk_algorithm.py <input_file>")
    input_file = sys.argv[1]
    with open(input_file) as file_in:
        # A single join assembles the text in one pass; repeated "+=" on a
        # growing string can degrade to quadratic time on large inputs.
        text = "".join(
            line.strip() for line in file_in if not line.startswith(">")
        )
    tree = STree.STree(text)
def process_leaf_and_internal_nodes(tree):
    """Number the leaves and record, on every parent, its range of leaf numbers.

    The tree is walked iteratively in post-order (children before their
    parent). During the walk:

    * every node receives an empty ``OT_indexes`` list;
    * every leaf receives ``index_of_leaf_in_ST`` (a running counter in visit
      order) and is stored in ``tree.M`` at position ``leaf.idx``;
    * every node widens its parent's ``index_of_leftmost_leaf_in_ST`` /
      ``index_of_rightmost_leaf_in_ST`` so that they cover the node's leaves.

    ``tree.number_leaf_nodes`` and ``tree.number_internal_nodes`` hold the
    final counts, and three summary lines are printed at the end.
    """
    tree.number_leaf_nodes = 0
    tree.number_internal_nodes = 0
    tree.M = [-1] * len(tree.word)  # List M in the paper

    def widen_parent_range(parent, low, high):
        # Extend the parent's [leftmost, rightmost] leaf range to include [low, high].
        if (not hasattr(parent, "index_of_leftmost_leaf_in_ST")
                or low < parent.index_of_leftmost_leaf_in_ST):
            parent.index_of_leftmost_leaf_in_ST = low
        if (not hasattr(parent, "index_of_rightmost_leaf_in_ST")
                or high > parent.index_of_rightmost_leaf_in_ST):
            parent.index_of_rightmost_leaf_in_ST = high

    # Each entry is (node, children_done); a node is pushed a second time with
    # children_done=True so it is handled only after all of its children.
    pending = [(tree.root, False)]
    while pending:
        node, children_done = pending.pop()
        if not children_done:
            pending.append((node, True))
            pending.extend((child, False) for child in node.transition_links.values())
            continue

        node.OT_indexes = []
        if node.is_leaf():
            # Leaves get consecutive keys in the order they are visited.
            leaf_number = tree.number_leaf_nodes
            node.index_of_leaf_in_ST = leaf_number
            tree.number_leaf_nodes = leaf_number + 1
            tree.M[node.idx] = node
            widen_parent_range(node.parent, leaf_number, leaf_number)
        else:
            tree.number_internal_nodes += 1
            widen_parent_range(
                node.parent,
                node.index_of_leftmost_leaf_in_ST,
                node.index_of_rightmost_leaf_in_ST,
            )

    print ("Number of leaf nodes is", "{:,}".format(tree.number_leaf_nodes))
    print ("Number of internal nodes is", "{:,}".format(tree.number_internal_nodes))
    print ("Number of alphabets in the input data", len(tree.root.transition_links) - 1)
######################################################################################## Searching code ##############################################################################################################
def find_end_node_of_exact_path_of_string_starting_from_a_node(tree, string, starting_node, suffix_end_node):
    """Walk ``string`` downwards from ``starting_node`` and return where the walk ends.

    Internal edges are compared character by character against ``tree.word``.
    When the walk reaches a leaf, no characters are compared: the leaf stored
    in ``tree.M`` at ``leaf.idx + starting_node.depth`` is looked up, and the
    leaf is returned if that entry's ``index_of_leaf_in_ST`` lies inside the
    leftmost/rightmost leaf range of ``suffix_end_node``; otherwise the
    leaf's parent is returned.

    Returns:
        The last node reached if all of ``string`` was consumed or the walk
        stopped exactly at that node; otherwise that node's parent.
    """
    pattern_length = len(string)
    start_depth = starting_node.depth
    node = starting_node
    end_node = starting_node
    matched = 0  # number of characters of `string` consumed so far

    while matched < pattern_length:
        symbol = string[matched]
        if symbol not in node.transition_links:
            break
        end_node = node.transition_links[symbol]

        if end_node.is_leaf():
            # Decide via the leaf range of suffix_end_node rather than by
            # reading the leaf edge.
            shifted_leaf = tree.M[end_node.idx + starting_node.depth]
            in_range = (
                suffix_end_node.index_of_leftmost_leaf_in_ST
                <= shifted_leaf.index_of_leaf_in_ST
                <= suffix_end_node.index_of_rightmost_leaf_in_ST
            )
            return end_node if in_range else end_node.parent

        if end_node.depth - node.depth == 1:
            # One-character edge: the transition lookup already matched it.
            node = end_node
            matched += 1
            continue

        # Multi-character edge: read its label, clipped to the pattern's end.
        label_stop = end_node.idx + min(end_node.depth, start_depth + pattern_length)
        edge_label = tree.word[end_node.idx + node.depth:label_stop]
        node = end_node
        for char in edge_label:
            if string[matched] != char:
                break
            matched += 1
        else:
            continue  # whole label matched; keep descending
        break  # mismatch inside the edge label

    if matched == pattern_length or end_node.depth - starting_node.depth == matched:
        return end_node
    return end_node.parent
def find_end_node_of_exact_match_starting_from_root_node(tree, string):
    """Walk ``string`` downwards from ``tree.root`` and return where the walk ends.

    At each step the child reached through the next unmatched character is
    taken, and its edge label (read from ``tree.word`` and clipped to the
    length of ``string``) is compared character by character.

    Returns:
        The last node reached if all of ``string`` was consumed or the walk
        stopped exactly at that node's depth; otherwise that node's parent.
    """
    pattern_length = len(string)
    node = tree.root
    end_node = tree.root
    matched = 0  # number of characters of `string` consumed so far

    while matched < pattern_length:
        symbol = string[matched]
        if symbol not in node.transition_links:
            break
        end_node = node.transition_links[symbol]

        # Edge label from the parent's depth to the child's depth, never
        # reading past the end of the pattern.
        label_stop = end_node.idx + min(end_node.depth, pattern_length)
        edge_label = tree.word[end_node.idx + node.depth:label_stop]
        node = end_node
        for char in edge_label:
            if string[matched] != char:
                break
            matched += 1
        else:
            continue  # whole label matched; keep descending
        break  # mismatch inside the edge label

    if matched == pattern_length or end_node.depth == matched:
        return end_node
    return end_node.parent
def get_internal_nodes(tree, node, depth):
    """Group the internal nodes under ``node`` by their depth.

    The subtree rooted at ``node`` is walked iteratively in post-order. Every
    non-leaf node whose ``depth`` attribute is at most ``depth`` is appended
    to ``tree.nodes_by_depth_dict[node.depth]``. Nothing is returned.
    """
    # Entries are (node, children_done); a node is re-pushed with
    # children_done=True so it is recorded only after its children.
    pending = [(node, False)]
    while pending:
        candidate, children_done = pending.pop()
        if not children_done:
            pending.append((candidate, True))
            # Children are pushed in dictionary order (no sorting).
            pending.extend((child, False) for child in candidate.transition_links.values())
            continue
        if candidate.is_leaf() or candidate.depth > depth:
            continue
        tree.nodes_by_depth_dict[candidate.depth].append(candidate)
def start():
    """Build the suffix tree, preprocess it, and run the search benchmark.

    Steps, each timed and reported on stdout:

    1. Build the global ``tree`` with ``Build_suffix_tree()``.
    2. Number leaves and compute leaf ranges with
       ``process_leaf_and_internal_nodes``.
    3. Group internal nodes of depth <= 20 by depth.
    4. For each pattern length, cut substrings out of ``tree.word`` and keep
       up to 1000 whose walk from the root ends at an internal node.
    5. For each depth 1..20, search every kept pattern starting from (at
       most) the last 1000 internal nodes recorded at that depth.
    """
    print ("------------------------------------------------------------------------------------------")
    # NOTE: this local timer shadows the function's own name inside the body.
    start = time.time()
    Build_suffix_tree()
    print ("Building suffix tree took", round((time.time() - start), 5), "seconds")
    print ("------------------------------------------------------------------------------------------")
    start = time.time()
    process_leaf_and_internal_nodes(tree)
    print ("Processing leaf and internal nodes took", round((time.time() - start), 5), "seconds")
    print ("--------------------------------------------------------------------------------------------------------------------")
    print ("Benchmarking process")
    print ("--------------------------------------------------------------------------------------------------------------------")
    # depth -> list of internal nodes at that depth, filled by get_internal_nodes
    setattr(tree, "nodes_by_depth_dict", defaultdict(list))
    max_depth_of_tested_nodes = 20
    get_internal_nodes(tree, tree.root, max_depth_of_tested_nodes)
    # pattern length -> list of (pattern text, end node of its walk from the root)
    patterns_dict = defaultdict(list)
    number_of_patterns_ends_at_internal_node = 0
    for pattern_length in [7, 10, 12, 15, 20, 25, 30, 35, 40, 50]:
        number_of_patterns = 0
        i = 0
        while True:
            i += 1
            # Candidate patterns are non-overlapping windows starting at
            # offsets pattern_length, 2 * pattern_length, ...
            nn = i * pattern_length
            if nn + pattern_length >= tree.number_leaf_nodes:
                break
            else:
                t = tree.word[nn:nn + pattern_length]
                end_node_of_pattern_from_root = find_end_node_of_exact_match_starting_from_root_node(tree, t)
                if end_node_of_pattern_from_root.is_leaf():
                    # Only patterns ending at an internal node are benchmarked.
                    continue
                else:
                    number_of_patterns_ends_at_internal_node += 1
                    patterns_dict[pattern_length].append((t, end_node_of_pattern_from_root))
                    number_of_patterns += 1
                    # Cap the sample at 1000 patterns per length.
                    if number_of_patterns == 1000:
                        break
    for depth in range(1, max_depth_of_tested_nodes + 1):
        start_time_for_searching_all_patterns_of_all_lengths = time.time()
        # At most the last 1000 nodes recorded at this depth are used.
        list_of_starting_nodes = tree.nodes_by_depth_dict[depth][-1000:]
        Number_of_starting_nodes = len(list_of_starting_nodes)
        for pattern_length in sorted(patterns_dict.keys()):
            patterns = patterns_dict[pattern_length]
            start = time.time()
            for dat in patterns:
                pattern = dat[0]
                end_node_of_pattern_from_root = dat[1]
                for starting_node in list_of_starting_nodes: # the last node is the root node so it was excluded
                    if pattern[0] in starting_node.transition_links:
                        matching_node = find_end_node_of_exact_path_of_string_starting_from_a_node(tree, pattern, starting_node, end_node_of_pattern_from_root)
                        # A full match must reach at least pattern_length
                        # characters below the starting node.
                        if matching_node.depth >= starting_node.depth + pattern_length:
                            print ("Found matching node", starting_node, matching_node)
                        else:
                            print ("No matching node found")
                    else:
                        print ("No matching node found")
            # NOTE(review): the per-search prints above run inside the timed
            # region, so the reported times include their output cost.
            print ("Total time for searching for", len(patterns), "patterns of length", pattern_length, "starting from", Number_of_starting_nodes, "nodes out of ", len(tree.nodes_by_depth_dict[depth]), "nodes at depth", depth, "is", round((time.time() - start), 5), "seconds")
        print ("Total time for searching for", number_of_patterns_ends_at_internal_node, "of all lengths starting from", Number_of_starting_nodes, "nodes out of ", len(tree.nodes_by_depth_dict[depth]), "nodes at depth", depth, "is", round(time.time() - start_time_for_searching_all_patterns_of_all_lengths, 5), "seconds")
        print ()


# Run the benchmark as soon as the module is executed.
start()