-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathMain.py
More file actions
131 lines (92 loc) · 5.33 KB
/
Main.py
File metadata and controls
131 lines (92 loc) · 5.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pandas as pd
import pickle
import os
from mordred import Calculator, descriptors, error
from rdkit import Chem
from rdkit.Chem.Descriptors import MolWt
import numpy as np
import argparse
import sys
from datetime import datetime
def mordred_calculator(dataset: pd.DataFrame) -> pd.DataFrame:
    """Compute 2D Mordred molecular descriptors (MDs) for every SMILES in a dataset.

    Args:
        dataset (pd.DataFrame): must provide a "SMILES" column for which the
            MDs will be calculated.

    Returns:
        The original dataset with the MD columns concatenated to its right.
        Descriptor values that Mordred could not compute (``error.Error`` /
        ``error.Missing`` objects) are replaced by NaN.

    Raises:
        ValueError: if one or more SMILES cannot be parsed by RDKit.
    """
    smiles = list(dataset['SMILES'])
    # MolFromSmiles returns None (it does not raise) for an unparsable string;
    # non-string cells (e.g. empty Excel cells) are treated as invalid as well.
    mols = [Chem.MolFromSmiles(smi) if isinstance(smi, str) else None for smi in smiles]
    invalid = [smi for smi, mol in zip(smiles, mols) if mol is None]
    if invalid:
        raise ValueError(f"Cannot calculate descriptors, invalid SMILES: {invalid}")

    calc = Calculator(descriptors, ignore_3D=True)
    # as pandas
    md = calc.pandas(mols)
    # concat(axis=1) aligns rows on the index: reuse the input index so that
    # descriptors stay attached to their own molecule even when the dataset
    # does not carry a default 0..n-1 index.
    md.index = dataset.index
    df = pd.concat([dataset, md], axis=1)

    def _error_to_nan(value):
        # Mordred stores failures as Error/Missing objects inside the cells.
        return np.nan if isinstance(value, (error.Error, error.Missing)) else value

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
    # name is deprecated; use whichever this pandas version provides.
    elementwise = df.map if hasattr(df, "map") else df.applymap
    return elementwise(_error_to_nan)
def check_smiles(smiles):
    """Abort the program if ``smiles`` cannot be parsed by RDKit.

    Args:
        smiles: the SMILES string to validate.

    Returns:
        None when the SMILES is valid. Otherwise a message is printed and the
        interpreter exits with status 1.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Raised for inputs RDKit cannot even accept (e.g. non-string values).
        mol = None
    # RDKit signals an unparsable SMILES by returning None, not by raising,
    # so the result itself has to be checked.
    if mol is None:
        print(f"{smiles}: invalid smiles!")
        sys.exit(1)
def pipeline_model_importer():
    """Unpickle the three regressors, the preprocessing pipeline and the AD classifier.

    Every file is looked up relative to the current working directory, under
    the ``models`` and ``pipeline_and_AD`` sub-folders.

    Returns:
        A 5-tuple ``(model1, model2, model3, pipeline, ad)`` holding, in order,
        the extra-trees, XGB and gradient-boosting regressors, the pipeline
        and the applicability-domain classifier.
    """
    base_dir = os.getcwd()
    # (sub-folder, file name) pairs, listed in the order they are returned.
    artefacts = (
        ('models', 'alldata_model_Antioxidant_DPPH30MIN_extra_trees_regressor.pkl'),
        ('models', 'alldata_model_Antioxidant_DPPH30MIN_xgb_regressor.pkl'),
        ('models', 'alldata_model_Antioxidant_DPPH30MIN_gradient_boosting_regressor.pkl'),
        ('pipeline_and_AD', 'pipeline_Antioxidant_DPPH30MIN.pkl'),
        ('pipeline_and_AD', 'AD_clf.pkl'),
    )
    loaded = []
    for folder, file_name in artefacts:
        with open(os.path.join(base_dir, folder, file_name), 'rb') as handle:
            loaded.append(pickle.load(handle))
    return tuple(loaded)
def files_importer():
    """Parse the command line and return the molecule input and the summary switch.

    Exactly one of ``--smiles`` (a single SMILES string) or ``--filename``
    (an xlsx file with the molecules to predict) must be given.

    ``--summary`` may be passed as a bare flag or, as before, with an
    explicit value.

    Returns:
        A tuple ``(input_value, summary)``. ``input_value`` is the SMILES
        string or the file name. ``summary`` is ``None`` when the option is
        absent, ``True`` when it is given as a bare flag, or the supplied
        string when a value follows it.
    """
    parser = argparse.ArgumentParser(description='Antioxidant assesment')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--smiles', help='Specify target SMILES')
    group.add_argument('--filename', help='Specify xlsx file with molecules to predict')
    # nargs='?' + const lets "--summary" work as a plain switch while still
    # accepting the historical "--summary <value>" form.
    parser.add_argument(
        '--summary',
        nargs='?',
        const=True,
        default=None,
        help='Write only the consensus predictions instead of the predictions '
             'from all models (default: all models)',
    )
    args = parser.parse_args()

    # access the input value: the group guarantees exactly one of the two is set
    input_value = args.smiles if args.smiles is not None else args.filename
    print(f"Input value: {input_value}")
    return input_value, args.summary
if __name__ == '__main__':
    # Command-line entry point: read the molecules, compute descriptors, run
    # the three regressors, build the consensus and write the result to xlsx.
    print("Antioxidant Model:\nExtra trees model to predict IC50 (log(ug/ml))")
    input_value, summary = files_importer()

    # An argument ending in ".xlsx" (any case) is a file of molecules;
    # anything else is taken as a single SMILES string.
    if input_value.lower().endswith(".xlsx"):
        df = pd.read_excel(input_value)
        if 'SMILES' not in df.columns:
            sys.exit(f"{input_value}: no 'SMILES' column found!")
    else:
        df = pd.DataFrame([input_value], columns=['SMILES'])

    # Validate the input up front, before the expensive descriptor calculation.
    for smi in df['SMILES']:
        check_smiles(smi)

    # import models, preprocessing pipeline and applicability-domain classifier
    model1, model2, model3, pipeline, ad = pipeline_model_importer()

    # Calculate molecular descriptors
    print("Molecular descriptors calculation....\n")
    data = mordred_calculator(df)

    print("MDs tranformation...\n")
    data_input = pd.DataFrame(
        pipeline.transform(data.loc[:, pipeline.feature_names_in_]),
        columns=pipeline.feature_names_in_,
    )

    print("Model Assessment...")
    mw_calc = [MolWt(Chem.MolFromSmiles(smi)) for smi in df['SMILES']]

    log_cols = []
    mgl_cols = []
    for tag, model in (('ETR', model1), ('XGB', model2), ('GB', model3)):
        log_col = f'Predictions_{tag} [-log(IC50) M]'
        mgl_col = f'Predictions_{tag} [mg/L]'
        df[log_col] = model.predict(data_input.loc[:, model.feature_names_in_])
        # -log10(IC50 in mol/L) -> mg/L: 10**-p [mol/L] * MW [g/mol] * 1000 [mg/g]
        df[mgl_col] = [(10 ** -(c)) * mw_ * 1000 for c, mw_ in zip(df[log_col], mw_calc)]
        log_cols.append(log_col)
        mgl_cols.append(mgl_col)

    # Consensus = row-wise mean of the three models; interval = row-wise
    # population standard deviation (ddof=0, the NumPy default).
    log_preds = df[log_cols].to_numpy(dtype=float)
    mgl_preds = df[mgl_cols].to_numpy(dtype=float)
    df['Consensus [-log(IC50) M]'] = log_preds.mean(axis=1)
    df['Interval [-log(IC50) M]'] = log_preds.std(axis=1)
    df['Consensus [mg/L]'] = mgl_preds.mean(axis=1)
    df['Interval [mg/L]'] = mgl_preds.std(axis=1)

    df = df.round(3)
    df['Consensus AND Uncertanty [mg/L]'] = [
        f"{y} \u00B1 {i}" for y, i in zip(df['Consensus [mg/L]'], df['Interval [mg/L]'])
    ]
    # NOTE(review): the AD classifier is fed model2's feature subset; confirm
    # this is the feature set it was fitted on.
    df['Applicability Domain'] = ad.predict(data_input.loc[:, model2.feature_names_in_])

    current_date = datetime.now().strftime("%d_%m_%Y")  # ("%Y-%m-%d")
    if summary:
        df.loc[:, ['Consensus AND Uncertanty [mg/L]', 'Applicability Domain']].to_excel(
            f'Summary_predictions_{current_date}.xlsx'
        )
    else:
        df.to_excel(f'Predictions_{current_date}.xlsx')
    print(df)