import config
import re
import os
import numpy as np
from utils import *  # expected to provide save_object / load_object used below
from tqdm import tqdm
from data_preprocess import load_dataset

HOUR = 3600  # seconds


class LogEntityGraph(object):
    FORMAT_SPACE = re.compile(r'\s+')
    MAX_TIME_INTERVAL = int(config.max_time_interval * HOUR)
    THRESHOLD_TOKEN_COMPLEXITY = 2      # min character-class transitions for a token to count as an entity
    THRESHOLD_RECURRENCE_FREQUENCY = 2  # min recurrences before an entity is kept in a subgraph
    MIN_TOKEN_LENGTH = 2
    CASE_SENSITIVE = False              # if True, upper-case letters form their own character class
    ENTITY_FILTER = [
        re.compile(r'^\d+-\d+-\d+-\d+\.\d+\.\d+\.\d+$'),  # correlation id (equivalent to a timestamp)
    ]

    def __init__(self, log_name="bgl", alias=""):
        self.entity_filter = []
        self.enable_temporal_link = True
        self.graph_file_name = os.path.join(config.data_package_path, "graph_struct_%s@%s.obj" % (log_name, alias))
        self.init_neighbor_map()

    def __str__(self):
        return "[log entity graph] number of nodes: %d , average degree: %.2f , number of entities: %d \n" % (self.num_nodes, self.avg_degree, len(self.previous_entity))

    def init_neighbor_map(self):
        self.neighbor = {}
        self.entity_index_map = {}
        return self

    def gen_statistics(self):
        self.num_nodes = len(self.neighbor)
        self.avg_degree = sum([len(v) for v in self.neighbor.values()]) / max(self.num_nodes, 1)

    def calc_token_complexity(self, token):
        # Complexity = number of runs of distinct character classes
        # (digit / alpha / other, plus upper-case when CASE_SENSITIVE).
        token_complexity = 0
        last_token_type = -1
        for char in token:
            if char.isdigit():
                token_type = 0
            elif char.isalpha():
                if self.CASE_SENSITIVE and char.isupper():
                    token_type = 3
                else:
                    token_type = 1
            else:
                token_type = 2
            if token_type != last_token_type:
                token_complexity += 1
                last_token_type = token_type
        return token_complexity
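
    # Worked example (illustrative tokens, not from the original source): a
    # plain word like "error" is a single alpha run, so its complexity is 1
    # and it falls below THRESHOLD_TOKEN_COMPLEXITY; "r63-m0" splits into the
    # runs alpha|digit|other|alpha|digit and scores 5, so it is kept as an
    # entity candidate.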

    def extract_entities(self, log):
        entity_set = set()
        if self.enable_temporal_link:
            # The empty-string pseudo-entity occurs in every log, which
            # chains each log to its predecessor (the temporal link).
            entity_set.add("")
        all_tokens = re.split(self.FORMAT_SPACE, log)
        for token in all_tokens:
            if len(token) < self.MIN_TOKEN_LENGTH:
                continue
            if not token[-1].isalnum():  # strip a trailing punctuation mark
                token = token[:-1]
            drop_entity = False
            for e in self.ENTITY_FILTER:
                if re.match(e, token):
                    drop_entity = True
                    break
            if drop_entity:
                continue
            token_complexity = self.calc_token_complexity(token)
            if token_complexity >= self.THRESHOLD_TOKEN_COMPLEXITY:
                entity_set.add(token)
        return entity_set
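
    # Example (hypothetical log line, for illustration only): given
    # "ciod: Error loading /bgl/app1: image too big", the plain words such as
    # "ciod" and "Error" score complexity 1 and are dropped, while
    # "/bgl/app1" scores 5 and survives, so recurring file paths, node ids,
    # and similar identifiers become the graph's entity nodes.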

    def add_edge(self, idx, prev_idx, ent_idx):
        # Backward edge: log `idx` connects, through entity `ent_idx`, to the
        # previous log `prev_idx` in which that entity occurred.
        if idx not in self.neighbor:
            self.neighbor[idx] = []
        self.neighbor[idx].append((ent_idx, prev_idx))
        return

    def build(self, log_data):
        self.init_neighbor_map()
        self.previous_entity = {}  # entity id -> index of the entity's latest occurrence
        self.tempo_map = {}        # log index -> timestamp
        for idx, line in tqdm(enumerate(log_data)):
            log, _, timestamp, _ = line
            self.tempo_map[idx] = timestamp
            entity_set = self.extract_entities(log)
            for entity in entity_set:
                if entity in self.entity_index_map:
                    eid = self.entity_index_map.get(entity)
                    prev_idx = self.previous_entity.get(eid, -1)
                    # Link this occurrence to the previous one only when the
                    # two logs are close enough in time.
                    if 0 <= self.tempo_map[idx] - self.tempo_map[prev_idx] <= self.MAX_TIME_INTERVAL:
                        self.add_edge(idx, prev_idx, eid)
                else:
                    eid = len(self.entity_index_map)
                    self.entity_index_map[entity] = eid
                # Refresh the latest occurrence on every sighting so that each
                # edge points to the most recent previous occurrence.
                self.previous_entity[eid] = idx
        self.gen_statistics()
        return self
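
    # The result is a multigraph stored as backward adjacency lists:
    # self.neighbor[idx] holds (entity_id, previous_log_idx) pairs, so each
    # entity induces a time-ordered linked list of its occurrences, and the
    # "" pseudo-entity chains consecutive logs when temporal links are on.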

    def fetch_subgraph(self, smpl_index, max_history=20):
        l2e_map, e2l_map = {}, {}  # log -> entities, entity -> logs
        entity_tmpl_map = {}       # entity -> templates of its history logs

        def _collect_history_logs(cur_idx, ent_idx, prev_idx, frequency=0):
            # Walk the entity's occurrence chain backwards, appending log
            # templates to the enclosing `tmpl_list` until the chain ends or
            # max_history is reached.
            if frequency >= max_history:
                entity_tmpl_map[ent_idx] = tmpl_list
                return True
            tmpl_list.append(self.tmpl_list[cur_idx])
            if prev_idx is None:
                # Chain exhausted: keep the entity only if it recurred often enough.
                if frequency >= self.THRESHOLD_RECURRENCE_FREQUENCY:
                    entity_tmpl_map[ent_idx] = tmpl_list
                    return True
                return False
            prev_idx2 = None
            if prev_idx in self.neighbor:
                for item in self.neighbor[prev_idx]:
                    if item[0] == ent_idx:
                        prev_idx2 = item[1]
                        break
            return _collect_history_logs(prev_idx, ent_idx, prev_idx2, frequency + 1)

        for log_idx in smpl_index[::-1]:
            l2e_map[log_idx] = []
            if log_idx not in self.neighbor:
                continue
            for ent_idx, prev_idx in self.neighbor[log_idx]:
                tmpl_list = []
                valid_entity = _collect_history_logs(log_idx, ent_idx, prev_idx)
                if valid_entity:
                    if ent_idx not in e2l_map:
                        e2l_map[ent_idx] = []
                    e2l_map[ent_idx].append(log_idx)
                    l2e_map[log_idx].append(ent_idx)
        return l2e_map, e2l_map, entity_tmpl_map
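
    # Usage sketch (hypothetical indices; assumes apply_template_mapping was
    # called first):
    #   l2e, e2l, ent_tmpl = graph.fetch_subgraph([100, 101, 102])
    # l2e maps each sampled log to its valid entities, e2l is the reverse
    # mapping, and ent_tmpl gives each entity the templates of its occurrence
    # history (up to max_history entries).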

    def apply_template_mapping(self, tmpl_list):
        # tmpl_list[i] is the parsed template of the i-th log line.
        self.tmpl_list = tmpl_list

    def build_index_entity_map(self):
        self.index_entity_map = {
            v: k for k, v in self.entity_index_map.items()
        }
        return self

    def get_entity_name(self, ent_id):
        return self.index_entity_map.get(ent_id, "")

    def save(self, file_name=None):
        if file_name is None:
            file_name = self.graph_file_name
        save_object([self.neighbor, self.entity_index_map], file_name)
        return

    def load(self, file_name=None):
        if file_name is None:
            file_name = self.graph_file_name
        self.neighbor, self.entity_index_map = load_object(file_name)
        self.gen_statistics()
        return self

    def show_log_entity_graph_statistics(self):
        # Count edges per entity, then restrict the statistics to entities
        # with at least two incident edges.
        group_by_entity = {}
        for log_idx, edge_list in self.neighbor.items():
            for ent_idx, prev_idx in edge_list:
                if ent_idx not in group_by_entity:
                    group_by_entity[ent_idx] = 0
                group_by_entity[ent_idx] += 1
        freq_ent_set = set([k for k, v in group_by_entity.items() if v >= 2])
        group_by_entity = {k: v for k, v in group_by_entity.items() if k in freq_ent_set}
        group_by_log = {}
        for log_idx, edge_list in self.neighbor.items():
            group_by_log[log_idx] = 0
            for ent_idx, prev_idx in edge_list:
                if ent_idx in freq_ent_set:
                    group_by_log[log_idx] += 1
        print("# Log Nodes:", len(group_by_log))
        print("# Entity Nodes:", len(group_by_entity))
        print("# Log->Entity Edges:", sum(group_by_log.values()))
        print("# Entity->Log Edges:", sum(group_by_entity.values()))
        print("# Log-Log Edges:", len(group_by_log) - 1)  # the temporal chain
        # The two sums above count the same undirected edges from either side,
        # so //2 counts each log-entity edge exactly once.
        print("# Total Edges:", (len(group_by_log) - 1) + (sum(group_by_log.values()) + sum(group_by_entity.values())) // 2)
        # +2 accounts for the two temporal edges each log has in the chain.
        print("Avg. Degree Log:", np.mean(list(group_by_log.values())) + 2)
        print("Avg. Degree Entity:", np.mean(list(group_by_entity.values())))
        print("Max Degree Log:", np.max(list(group_by_log.values())) + 2)
        print("Max Degree Entity:", np.max(list(group_by_entity.values())))
        return


if __name__ == "__main__":
    log_data = load_dataset("bgl")
    log_entity_graph = LogEntityGraph("bgl")
    log_entity_graph.build(log_data)
    # log_entity_graph.save()
    # log_entity_graph.load()
    log_entity_graph.show_log_entity_graph_statistics()
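    # Optional follow-up (a sketch using only this class's own API): resolve
    # entity ids back to their string form for manual inspection.
    # log_entity_graph.build_index_entity_map()
    # print(log_entity_graph.get_entity_name(0))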