-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdata_sampling.py
More file actions
116 lines (109 loc) · 4.3 KB
/
data_sampling.py
File metadata and controls
116 lines (109 loc) · 4.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import config
import re
import numpy as np
from utils import *
from tqdm import tqdm
HOUR = 3600 # seconds in one hour; used to express time-interval defaults (e.g. 24*HOUR)
def convert_into_samples(log_data, entity=None, window_size=config.window_size,
	log_name="bgl", max_time_interval=24*HOUR):
	"""Slice a log stream into per-group sliding-window sequence samples.

	Parameters
	----------
	log_data : sequence of (log, label, timestamp, entities) tuples;
		assumed ordered by timestamp — TODO confirm against caller.
	entity : key into each record's ``entities`` used to group logs.
		``None`` or the string ``'None'`` disables grouping (all logs go
		into one anonymous ``''`` group).
	window_size : maximum number of log ids per emitted sample.
	log_name : unused here; kept for interface compatibility.
	max_time_interval : gap in seconds after which a group's window is
		flushed and restarted.

	Returns
	-------
	list of ``[log_id_list, label, group_id]`` sorted by each sample's
	last log id; ``label`` is 1 if any log in the window was anomalous.
	"""
	samples = []
	# group_id -> [window_log_ids, anomaly_count, last_timestamp]
	temp_info = {}
	group_names = ['']           # default single group when grouping is off
	min_window_length = 2        # windows shorter than this are dropped
	sample_proba = 0.2           # keep ~20% of non-first windows per group
	collected_groups = set()     # groups that already produced a sample
	def add_sequence_sample(group_id, tmp_id=None, use_sub_smpl=False):
		# Emit the current window of `group_id` as one sample. `tmp_id`,
		# when given, is the log id just evicted from the window and is
		# re-attached at the front so the sample spans the full window.
		# `use_sub_smpl` is currently unused; kept for compatibility.
		if group_id not in temp_info:
			return
		window = temp_info[group_id][0]
		x = ([tmp_id] + window) if tmp_id is not None else list(window)
		if len(x) < min_window_length:
			return
		y = 1 if temp_info[group_id][1] > 0 else 0
		# Always keep a group's first sample; later windows are kept with
		# probability `sample_proba` (or always, when sample_proba is None).
		first_sample = group_id not in collected_groups
		if first_sample:
			collected_groups.add(group_id)
		# was: `sample_proba==None or first_sample==True` (PEP 8 E711/E712)
		if sample_proba is None or first_sample or np.random.random() < sample_proba:
			samples.append([x, y, group_id])
	for log_id in tqdm(range(len(log_data))):
		log, label, timestamp, entities = log_data[log_id]
		if entity is not None and entity != 'None':
			if entity not in entities:
				continue
			group_names = entities[entity]
		for group_id in group_names:
			# Start a fresh window on first sight of the group, or after a
			# long silence — flushing whatever was accumulated so far.
			if group_id not in temp_info or timestamp - temp_info[group_id][-1] > max_time_interval:
				add_sequence_sample(group_id)
				temp_info[group_id] = [[], 0, -1]
			temp_info[group_id][0].append(log_id)
			temp_info[group_id][1] += label
			temp_info[group_id][2] = timestamp
			if len(temp_info[group_id][0]) == window_size:
				# Window full: emit it, then slide by one (evict the oldest
				# id and subtract its label from the anomaly count).
				tmp_id = temp_info[group_id][0][0]
				temp_info[group_id][0] = temp_info[group_id][0][1:]
				add_sequence_sample(group_id, tmp_id, True)
				temp_info[group_id][1] -= log_data[tmp_id][1]
	# Flush leftover windows that never filled up.
	# NOTE(review): windows of exactly window_size-1 are skipped here —
	# looks like an off-by-one; confirm whether `< window_size` was intended.
	for group_id in temp_info:
		if group_id == '':
			continue
		if min_window_length <= len(temp_info[group_id][0]) < window_size - 1:
			add_sequence_sample(group_id)
	samples.sort(key=lambda s: s[0][-1])
	return samples
def train_test_split(samples, sample_ratio=0.05):
	"""Subsample indices of *samples* and split them into train/dev/test.

	A random `sample_ratio` fraction of all indices is kept; of those,
	`config.train_prop` go to train, `config.test_prop` to test (returned
	sorted), and the remainder to dev.

	Returns (train_index, dev_index, test_index) — lists of int indices.
	"""
	n_total = len(samples)
	pool = list(range(n_total))
	np.random.shuffle(pool)
	pool = pool[:int(n_total * sample_ratio)]
	n = len(pool)
	n_train = int(n * config.train_prop)
	n_train_dev = int(n * (1 - config.test_prop))
	# Test split keeps chronological (index) order.
	test_index = sorted(pool[n_train_dev:])
	train_dev = pool[:n_train_dev]
	np.random.shuffle(train_dev)
	train_index = train_dev[:n_train]
	dev_index = train_dev[n_train:n_train_dev]
	return train_index, dev_index, test_index
def train_test_split_grouped(samples, sample_ratio=0.1):
	"""Split sample indices into train/dev/test with no group leakage.

	Each sample is ``[x, y, group_id]``; every index of a group lands in
	the same split, so sequences from one group never appear in two
	splits. Groups are partitioned by ``config.train_prop`` /
	``config.test_prop``, then each split's groups are subsampled down
	to a `sample_ratio` fraction.

	Returns (train_index, dev_index, test_index) — shuffled index lists.
	"""
	# NOTE(review): removed dead code — the original built and shuffled an
	# `all_index` list (guarded by config.sort_chronological) that was
	# never used afterwards.
	# Bucket sample indices by group id (last element of each sample).
	all_group_map = {}
	for i, smpl in enumerate(samples):
		g = smpl[-1]
		if g not in all_group_map:
			all_group_map[g] = []
		all_group_map[g].append(i)
	all_groups = list(all_group_map.keys())
	np.random.shuffle(all_groups)
	N = len(all_groups)
	train_size = int(N * config.train_prop)
	train_and_dev_size = int(N * (1 - config.test_prop))
	test_groups = all_groups[train_and_dev_size:]
	train_dev_groups = all_groups[:train_and_dev_size]
	np.random.shuffle(train_dev_groups)
	train_groups = train_dev_groups[:train_size]
	dev_groups = train_dev_groups[train_size:train_and_dev_size]
	# Subsample groups WITHOUT replacement: np.random.choice defaults to
	# replace=True, which could draw the same group several times and
	# duplicate all of its indices inside a split.
	train_groups = np.random.choice(train_groups, int(len(train_groups)*sample_ratio), replace=False)
	test_groups = np.random.choice(test_groups, int(len(test_groups)*sample_ratio), replace=False)
	dev_groups = np.random.choice(dev_groups, int(len(dev_groups)*sample_ratio), replace=False)
	train_index = [i for g in train_groups for i in all_group_map[g]]
	test_index = [i for g in test_groups for i in all_group_map[g]]
	dev_index = [i for g in dev_groups for i in all_group_map[g]]
	np.random.shuffle(train_index)
	np.random.shuffle(test_index)
	np.random.shuffle(dev_index)
	return train_index, dev_index, test_index
def simple_balance_sampling(samples, train_index, balance_coef=2):
	"""Down-sample negatives to at most ``balance_coef`` x the positives.

	samples: list of ``[x, y, g]`` — assumes binary label y in {0, 1}.
	train_index: candidate indices into `samples`.
	Returns a shuffled list containing every positive index and only the
	first ``balance_coef * n_positives`` negative indices encountered.
	"""
	buckets = ([], [])  # (negative indices, positive indices)
	for idx in train_index:
		buckets[samples[idx][1]].append(idx)
	negatives, positives = buckets
	# Truncate the negatives, keep all positives, then mix them together.
	kept = negatives[:int(len(positives) * balance_coef)] + positives
	np.random.shuffle(kept)
	return kept