-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathclean_dataclass.py
More file actions
152 lines (95 loc) · 5.86 KB
/
clean_dataclass.py
File metadata and controls
152 lines (95 loc) · 5.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import argparse
import pathlib

import numpy as np

from load_functions.load_leads import extract_twelve_leads, load_data
from util_functions.general import get_parent_folder
from util_functions.general import get_collection
from util_functions.load_data_ids import *
def _accumulate_element_stats(element, ages, acquisition_years, max_values, min_values):
    """
    Extract statistics from one ECG element and append them to the given lists.

    Appends the patient age (int) to *ages*, the acquisition year (first four
    characters of the acquisition date) to *acquisition_years*, and the
    per-lead maximum/minimum ECG values to *max_values* / *min_values*.
    Propagates whatever the dict accesses, the int conversions or
    extract_twelve_leads raise on a malformed element — the caller decides
    whether that is fatal.
    """
    age = int(element['RestingECG']['PatientDemographics']['PatientAge'])
    acquisition_date = element['RestingECG']['TestDemographics']['AcquisitionDate']
    twelve_leads, _ = extract_twelve_leads(element)
    ages.append(age)
    acquisition_years.append(int(str(acquisition_date)[:4]))
    for lead in twelve_leads:
        max_values.append(np.max(lead))
        min_values.append(np.min(lead))


def clean_dataclass(data_class: str):
    """
    Split the elements of *data_class* into corrupted and cleaned groups and
    gather statistical information (patient age, acquisition year, per-lead
    min/max ECG values) for each group.

    The IDs of the corrupted/cleaned elements are stored in the folder
    ./../Data/Feature_map/Dataclass/<data_class>/ and the statistical results
    in ./../Data/Analysis/Dataclass/<data_class>/Data/.

    Parameters
    ----------
    data_class : str
        Name of the data class to process. The special class
        'poor_data_quality' is treated as entirely corrupted.
    """
    parent_folder = get_parent_folder()
    class_ids = load_dataclass_ids(parent_folder, data_class)
    # Partition the class IDs into clean vs corrupted.
    if data_class == 'poor_data_quality':
        # By definition, everything in this class is corrupted.
        class_clean_ids = []
        class_corrupted_ids = class_ids
    else:
        dataset_clean_ids = load_dataset_clean_ids(parent_folder)
        dataset_corrupted_ids = load_dataset_corrupted_ids(parent_folder)
        class_clean_ids = list(set(class_ids) & set(dataset_clean_ids))
        class_corrupted_ids = list(set(class_ids) & set(dataset_corrupted_ids))
    save_dataclass_clean_ids(parent_folder, data_class, class_clean_ids)
    save_dataclass_corrupted_ids(parent_folder, data_class, class_corrupted_ids)
    # O(1) membership test inside the per-element loop; a list lookup here
    # would make the whole pass quadratic in the class size.
    clean_id_set = set(class_clean_ids)
    class_corrupted_patient_ids = []
    class_corrupted_ages = []
    class_corrupted_acquisition_date = []
    class_corrupted_max_values = []
    class_corrupted_min_values = []
    class_clean_patient_ids = []
    class_clean_ages = []
    class_clean_acquisition_date = []
    class_clean_max_values = []
    class_clean_min_values = []
    class_size = len(class_ids)
    total_processed_data = 0
    subset_size = 100000
    # Load the documents in chunks of subset_size so at most one chunk is
    # held in memory at a time.
    for start in range(0, class_size, subset_size):
        subset_ids = class_ids[start:start + subset_size]
        subset = load_data(get_collection(), subset_ids)
        for element in subset:
            # Progress heartbeat every 1000 elements (subset_size is a
            # multiple of 1000, so this matches the per-chunk cadence).
            if total_processed_data % 1000 == 0:
                print('Processed ', total_processed_data, ' data!')
            total_processed_data += 1
            element_id = element['_id']
            patient_id = element['RestingECG']['PatientDemographics']['PatientID']
            if element_id in clean_id_set:
                # Clean elements are expected to be well-formed: any
                # extraction failure here is a real error and propagates.
                class_clean_patient_ids.append(patient_id)
                _accumulate_element_stats(element,
                                          class_clean_ages,
                                          class_clean_acquisition_date,
                                          class_clean_max_values,
                                          class_clean_min_values)
            else:
                class_corrupted_patient_ids.append(patient_id)
                try:
                    _accumulate_element_stats(element,
                                              class_corrupted_ages,
                                              class_corrupted_acquisition_date,
                                              class_corrupted_max_values,
                                              class_corrupted_min_values)
                except Exception:
                    # Corrupted elements often have missing or invalid
                    # fields; keep the patient id, skip the stats.
                    # (Was a bare except:, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    pass
    # De-duplicate: one patient may have several ECG elements.
    class_clean_patient_ids = list(set(class_clean_patient_ids))
    class_corrupted_patient_ids = list(set(class_corrupted_patient_ids))
    save_dataclass_clean_patient_ids(parent_folder, data_class, class_clean_patient_ids)
    save_dataclass_corrupted_patient_ids(parent_folder, data_class, class_corrupted_patient_ids)
    stats_folder = parent_folder + 'Analysis/Dataclass/' + data_class + '/Data/'
    pathlib.Path(stats_folder).mkdir(parents=True, exist_ok=True)
    # Persist every statistic list as a .npy array, one file per metric.
    for stat_name, values in (
            ('clean_ages', class_clean_ages),
            ('clean_acquisition_date', class_clean_acquisition_date),
            ('clean_max_values', class_clean_max_values),
            ('clean_min_values', class_clean_min_values),
            ('corrupted_ages', class_corrupted_ages),
            ('corrupted_acquisition_date', class_corrupted_acquisition_date),
            ('corrupted_max_values', class_corrupted_max_values),
            ('corrupted_min_values', class_corrupted_min_values)):
        np.save(stats_folder + stat_name + '.npy', np.asarray(values))
if __name__ == '__main__':
    # Command-line entry point: -data_class/--data_class names the class
    # of elements to split into clean/corrupted groups.
    cli = argparse.ArgumentParser()
    cli.add_argument('-data_class', '--data_class', type=str)
    parsed = cli.parse_args()
    clean_dataclass(parsed.data_class)