-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMainPreprocessing.py
More file actions
151 lines (118 loc) · 7.05 KB
/
MainPreprocessing.py
File metadata and controls
151 lines (118 loc) · 7.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from Preprocessing.DataPrep import *
from Preprocessing.ImageCreation import *
import os.path
from sklearn.model_selection import train_test_split
# Seed passed to train_test_split so the train/test partition is reproducible.
random_state = 125

# Cached results of an earlier preprocessing run, if any.
train_file_exists = os.path.exists('HivData/df_train_preprocessed.csv')
test_file_exists = os.path.exists('HivData/df_test_preprocessed.csv')

# Rebuild unless BOTH cached files are present. Checking "both missing" would
# fall through to the read branch when only one file exists and then fail on
# the missing one; regenerating both also keeps the two files from the same split.
if not (train_file_exists and test_file_exists):
    # get dataset
    df = load_dataset()
    numCpdsBefore = df.shape[0]

    # flag compounds that contain only common atom types and keep just those
    # NOTE(review): progress_apply is presumably registered via tqdm.pandas()
    # in one of the star-imported Preprocessing modules - confirm.
    df['OnlyCommonAtms'] = df.SMILES.progress_apply(commonAtoms)
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the original dataframe.
    df = df.loc[df['OnlyCommonAtms'] == True].copy()  # noqa: E712

    # removing salts, cleaning up SMILES
    df['SMILES'] = df.SMILES.progress_apply(stripSalts)
    # neutralize
    df['SMILES'] = df.SMILES.progress_apply(neutralize_atoms)

    # calculate three simple descriptors (RDKit) and expand them into columns
    df['desc'] = df.SMILES.progress_apply(calc_3_descriptors)
    desc_cols = ['MW', 'Rotors', 'Heavys']
    df[desc_cols] = df.desc.to_list()
    df = df.drop("desc", axis=1)

    # filter on simple properties
    df = df.loc[
        (df['Heavys'] > 5)
        & (df['Heavys'] < 50)
        & (df['Rotors'] < 18)
        & (df['MW'] > 100)
        & (df['MW'] < 900)
    ]

    # drop the helper columns that were only needed for filtering
    df = df.drop(["OnlyCommonAtms", 'Heavys', 'MW', 'Rotors'], axis=1)
    df = df.reset_index(drop=True)
    numCpdsAfter = df.shape[0]
    print(f"Starting with {numCpdsBefore} compounds in entire dataset. This many remaining {numCpdsAfter} after filtering.")

    # split into 1/6th testset and 5/6 for train, stratified on the activity label
    training_data, testing_data = train_test_split(
        df,
        test_size=1/6,
        random_state=random_state,
        stratify=df["HIV_active"],
    )
    training_data = training_data.reset_index(drop=True)
    testing_data = testing_data.reset_index(drop=True)

    # Save preprocessed test and train dataset (make sure the folder exists,
    # so a long preprocessing run is not lost to a missing directory).
    os.makedirs("HivData", exist_ok=True)
    training_data.to_csv(path_or_buf="HivData/df_train_preprocessed.csv")
    testing_data.to_csv(path_or_buf="HivData/df_test_preprocessed.csv")
else:
    print("Found local preprocessed dataframe")
    # index_col=[0] consumes the index column that to_csv wrote above.
    training_data = pd.read_csv("HivData/df_train_preprocessed.csv", index_col=[0]).reset_index(drop=True)
    testing_data = pd.read_csv("HivData/df_test_preprocessed.csv", index_col=[0]).reset_index(drop=True)
# Partition each split by the HIV_active label: 0 -> inactives, 1 -> actives.
# Every subset gets a fresh 0..n-1 index.
df_inactives_train, df_actives_train = (
    training_data[training_data['HIV_active'] == label].reset_index(drop=True)
    for label in (0, 1)
)
df_inactives_test, df_actives_test = (
    testing_data[testing_data['HIV_active'] == label].reset_index(drop=True)
    for label in (0, 1)
)
# --------MolImages--------
# Make sure the output folders exist; os.listdir raises FileNotFoundError on a
# missing directory (e.g. a fresh checkout without generated images).
os.makedirs('HIVImages/MolFromSmilesImages/Train', exist_ok=True)
os.makedirs('HIVImages/MolFromSmilesImages/Test', exist_ok=True)

# Only the Train folder is inspected: a single .png there is taken to mean the
# whole image set (Train and Test) was already produced.
if any(fname.endswith('.png') for fname in os.listdir('HIVImages/MolFromSmilesImages/Train')):
    print("png files already exists in specified path - Mol HIVImages")
else:
    print("Starting to produce molecular images!")
    # export_smile_to_img is applied row by row (axis=1); it presumably writes
    # one image per compound under `path` - confirm in Preprocessing.ImageCreation.
    # train images
    df_inactives_train.progress_apply(export_smile_to_img, path="HIVImages/MolFromSmilesImages/Train/", HIV_activity="inactive", axis=1)
    df_actives_train.progress_apply(export_smile_to_img, path="HIVImages/MolFromSmilesImages/Train/", HIV_activity="active", axis=1)
    # test images
    df_inactives_test.progress_apply(export_smile_to_img, path="HIVImages/MolFromSmilesImages/Test/", HIV_activity="inactive", axis=1)
    df_actives_test.progress_apply(export_smile_to_img, path="HIVImages/MolFromSmilesImages/Test/", HIV_activity="active", axis=1)
# --------SmilesImages--------
# uses smiles to produce images
# Make sure the output folders exist; os.listdir raises FileNotFoundError on a
# missing directory.
os.makedirs('HIVImages/SmilesImages/Train', exist_ok=True)
os.makedirs('HIVImages/SmilesImages/Test', exist_ok=True)

# Only the Train folder is inspected: a single .png there is taken to mean the
# whole image set (Train and Test) was already produced.
if any(fname.endswith('.png') for fname in os.listdir('HIVImages/SmilesImages/Train')):
    print("png files already exists in specified path - Smiles HIVImages")
else:
    # train images
    generateImageSMILE(path="HIVImages/SmilesImages/Train/", compoundList=df_inactives_train, HIV_activity="inactive")
    generateImageSMILE(path="HIVImages/SmilesImages/Train/", compoundList=df_actives_train, HIV_activity="active")
    # test images
    generateImageSMILE(path="HIVImages/SmilesImages/Test/", compoundList=df_inactives_test, HIV_activity="inactive")
    generateImageSMILE(path="HIVImages/SmilesImages/Test/", compoundList=df_actives_test, HIV_activity="active")
# --------SmilesColorImages--------
# uses smiles to produce images
# Make sure the output folders exist; os.listdir raises FileNotFoundError on a
# missing directory.
os.makedirs('HIVImages/SmilesColorImages/Train', exist_ok=True)
os.makedirs('HIVImages/SmilesColorImages/Test', exist_ok=True)

# Only the Train folder is inspected: a single .png there is taken to mean the
# whole image set (Train and Test) was already produced.
if any(fname.endswith('.png') for fname in os.listdir('HIVImages/SmilesColorImages/Train')):
    print("png files already exists in specified path - Smiles Color HIVImages")
else:
    # withChars=False selects the variant without characters
    # (see generateImageSMILEColor for the exact rendering).
    # train images
    generateImageSMILEColor(path="HIVImages/SmilesColorImages/Train/", compoundList=df_inactives_train, HIV_activity="inactive", withChars=False)
    generateImageSMILEColor(path="HIVImages/SmilesColorImages/Train/", compoundList=df_actives_train, HIV_activity="active", withChars=False)
    # test images
    generateImageSMILEColor(path="HIVImages/SmilesColorImages/Test/", compoundList=df_inactives_test, HIV_activity="inactive", withChars=False)
    generateImageSMILEColor(path="HIVImages/SmilesColorImages/Test/", compoundList=df_actives_test, HIV_activity="active", withChars=False)
# --------SmilesColorCharsImages--------
# uses smiles to produce images
# Make sure the output folders exist; os.listdir raises FileNotFoundError on a
# missing directory.
os.makedirs('HIVImages/SmilesColorCharsImages/Train', exist_ok=True)
os.makedirs('HIVImages/SmilesColorCharsImages/Test', exist_ok=True)

# Only the Train folder is inspected: a single .png there is taken to mean the
# whole image set (Train and Test) was already produced.
if any(fname.endswith('.png') for fname in os.listdir('HIVImages/SmilesColorCharsImages/Train')):
    print("png files already exists in specified path - Smiles Color char HIVImages")
else:
    # Same generator as the SmilesColorImages section, but with withChars=True.
    # train images
    generateImageSMILEColor(path="HIVImages/SmilesColorCharsImages/Train/", compoundList=df_inactives_train, HIV_activity="inactive", withChars=True)
    generateImageSMILEColor(path="HIVImages/SmilesColorCharsImages/Train/", compoundList=df_actives_train, HIV_activity="active", withChars=True)
    # test images
    generateImageSMILEColor(path="HIVImages/SmilesColorCharsImages/Test/", compoundList=df_inactives_test, HIV_activity="inactive", withChars=True)
    generateImageSMILEColor(path="HIVImages/SmilesColorCharsImages/Test/", compoundList=df_actives_test, HIV_activity="active", withChars=True)

print("Done!")