AntioxidantActivity/Main.py at main · EdoardoVigano/AntioxidantActivity · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pandas as pd
import pickle
import os
from mordred import Calculator, descriptors, error

from rdkit import Chem
from rdkit.Chem.Descriptors import MolWt
import numpy as np
import argparse
import sys
from datetime import datetime


def mordred_calculator(dataset:pd.DataFrame):
    """
    Compute the 2D mordred molecular descriptors (MDs) for every molecule in
    the dataset and append them as extra columns.

    Args:
        dataset (pd.DataFrame): the dataset must be provided with "SMILES" column for which the MDs will be calculated

    Returns:
        The original dataset with concatenated the MDs. Descriptor values that
        mordred reports as errors or missing are replaced with NaN.

    Raises:
        ValueError: if one or more SMILES cannot be parsed by RDKit.
    """

    smiles = list(dataset['SMILES'])
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]

    # RDKit signals an unparseable SMILES by returning None (it does not raise);
    # fail early with the offending strings instead of handing None to mordred.
    invalid = [smi for smi, mol in zip(smiles, mols) if mol is None]
    if invalid:
        raise ValueError(f"Invalid SMILES, cannot compute descriptors: {invalid}")

    calc = Calculator(descriptors, ignore_3D=True)
    # as pandas
    md = calc.pandas(mols)
    # concat(axis=1) aligns rows on index labels, so give the descriptor frame
    # the same index as the input to keep each row next to its own molecule.
    md.index = dataset.index
    df = pd.concat([dataset, md], axis=1)

    def _error_to_nan(value):
        # mordred stores failed calculations as Error/Missing objects
        return np.nan if isinstance(value, (error.Error, error.Missing)) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
    # fall back to applymap on older pandas versions.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    return elementwise(_error_to_nan)  # remove errors using nan

def check_smiles(smiles):
    """
    Validate a SMILES string and terminate the program if it is not valid.

    Args:
        smiles: the SMILES string to check

    Returns:
        None. On an invalid SMILES a message is printed and the process exits
        with status 1.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # e.g. a non-string value (such as NaN from an empty cell) is rejected
        # by the RDKit binding with an exception
        mol = None

    # RDKit returns None (rather than raising) when the SMILES cannot be parsed,
    # so the result itself has to be checked.
    if mol is None:
        print(f"{smiles}: invalid smiles!")
        sys.exit(1)

def pipeline_model_importer(base_dir=None):
    """
    Load the three pickled regressors, the preprocessing pipeline and the
    applicability-domain (AD) classifier from disk.

    Args:
        base_dir: directory that contains the "models" and "pipeline_and_AD"
            folders. Defaults to the current working directory.

    Returns:
        A tuple (model1, model2, model3, pipeline, ad): the extra-trees,
        XGBoost and gradient-boosting regressors, the pipeline and the AD
        classifier, in that order.

    Raises:
        OSError: if one of the pickle files cannot be opened.
    """
    if base_dir is None:
        base_dir = os.getcwd()

    # (folder, file name) of every artefact, in the order they are returned
    artefacts = (
        ('models', 'alldata_model_Antioxidant_DPPH30MIN_extra_trees_regressor.pkl'),
        ('models', 'alldata_model_Antioxidant_DPPH30MIN_xgb_regressor.pkl'),
        ('models', 'alldata_model_Antioxidant_DPPH30MIN_gradient_boosting_regressor.pkl'),
        ('pipeline_and_AD', 'pipeline_Antioxidant_DPPH30MIN.pkl'),
        ('pipeline_and_AD', 'AD_clf.pkl'),
    )

    loaded = []
    for folder, filename in artefacts:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # artefact files that come from a trusted source.
        with open(os.path.join(base_dir, folder, filename), 'rb') as f:
            loaded.append(pickle.load(f))

    model1, model2, model3, pipeline, ad = loaded
    return model1, model2, model3, pipeline, ad

def files_importer(argv=None):
    """
    Parse the command-line options of the script.

    Exactly one of --smiles or --filename must be given; --summary is optional.

    Args:
        argv: list of argument strings to parse. Defaults to None, in which
            case argparse reads the arguments from sys.argv.

    Returns:
        A tuple (input_value, summary): input_value is the SMILES string or,
        when no SMILES was given, the file name; summary is the raw value of
        --summary (None when the option is absent).
    """
    parser = argparse.ArgumentParser(description='Antioxidant assesment')
    # the group itself is required, so exactly one of the two inputs must be supplied
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--smiles', help='Specify target SMILES')
    group.add_argument('--filename', help='Specify xlsx file with molecules to predict')
    parser.add_argument('--summary', default=None, help='Specify if you want prediction from all models or only the consensus (default: summary=None) ones')

    args = parser.parse_args(argv)

    # access the input value: the SMILES if given, otherwise the file name
    input_value = args.smiles if args.smiles is not None else args.filename

    print(f"Input value: {input_value}")
    return input_value, args.summary


def main():
    """
    Command-line workflow: read the input molecule(s), compute the molecular
    descriptors, predict with the three regressors, build the consensus and
    write the results to an Excel file in the current directory.
    """
    print("Antioxidant Model:\nExtra trees model to predict IC50 (log(ug/ml))")
    input_value, summary = files_importer()

    # A value ending in ".xlsx" is read as a workbook, anything else is
    # treated as a single SMILES string.
    if input_value.lower().endswith(".xlsx"):
        df = pd.read_excel(input_value)
    else:
        df = pd.DataFrame([input_value], columns=['SMILES'])

    if 'SMILES' not in df.columns:
        print(f"{input_value}: missing 'SMILES' column!")
        sys.exit(1)

    # Parse every SMILES once, up front: RDKit returns None for an invalid
    # SMILES, which would otherwise surface later as an opaque error in the
    # descriptor or molecular-weight calculation.
    mols = []
    for smi in df['SMILES']:
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            print(f"{smi}: invalid smiles!")
            sys.exit(1)
        mols.append(mol)

    # import model and pipeline
    model1, model2, model3, pipeline, ad = pipeline_model_importer()
    # Calculate molecular descriptors
    print("Molecular descriptors calculation....\n")
    data = mordred_calculator(df)
    print("MDs tranformation...\n")
    data_input = pd.DataFrame(
        pipeline.transform(data.loc[:, pipeline.feature_names_in_]),
        columns=pipeline.feature_names_in_,
    )

    print("Model Assessment...")
    mw_calc = [MolWt(mol) for mol in mols]
    tags = ('ETR', 'XGB', 'GB')
    for tag, model in zip(tags, (model1, model2, model3)):
        log_col = f'Predictions_{tag} [-log(IC50) M]'
        df[log_col] = model.predict(data_input.loc[:, model.feature_names_in_])
        # Per the column labels the prediction is -log10 of a molar IC50:
        # 10**(-p) [mol/L] * MW [g/mol] * 1000 -> mg/L
        df[f'Predictions_{tag} [mg/L]'] = [
            (10 ** -c) * mw_ * 1000 for c, mw_ in zip(df[log_col], mw_calc)
        ]

    # Consensus = mean of the three models, interval = their (population)
    # standard deviation, computed row by row for both units.
    for unit in ('[-log(IC50) M]', '[mg/L]'):
        per_model = df[[f'Predictions_{tag} {unit}' for tag in tags]].to_numpy(dtype=float)
        df[f'Consensus {unit}'] = per_model.mean(axis=1)
        df[f'Interval {unit}'] = per_model.std(axis=1)

    df = df.round(3)

    df['Consensus AND Uncertanty [mg/L]'] = [
        f"{y} \u00B1 {i}" for y, i in zip(df['Consensus [mg/L]'], df['Interval [mg/L]'])
    ]
    # NOTE(review): the AD classifier is fed the feature subset of model2 (XGB);
    # confirm this matches the features it was fitted on.
    df['Applicability Domain'] = ad.predict(data_input.loc[:, model2.feature_names_in_])

    current_date = datetime.now().strftime("%d_%m_%Y")
    if summary:
        # summary mode: only the consensus string and the AD flag are saved
        df.loc[:, ['Consensus AND Uncertanty [mg/L]', 'Applicability Domain']].to_excel(f'Summary_predictions_{current_date}.xlsx')
    else:
        df.to_excel(f'Predictions_{current_date}.xlsx')
    print(df)


if __name__ == '__main__':
    main()