Skip to content

Commit 0bbbfe9

Browse files
authored
Merge pull request #150 from ATOMScience-org/curation_funcs_1.4
bring Curation funcs #138 to 1.4.0
2 parents 26488bb + 68222ce commit 0bbbfe9

File tree

10 files changed

+1353
-37
lines changed

10 files changed

+1353
-37
lines changed
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/usr/bin/env python
2+
3+
import json
4+
import numpy as np
5+
import pandas as pd
6+
import os
7+
import sys
8+
9+
import atomsci.ddm.utils.curate_data as curate_data
10+
import atomsci.ddm.utils.struct_utils as struct_utils
11+
12+
# Directory containing this test script; used to locate the test dataset
# and as the destination for generated output files.
script_path = os.path.dirname(os.path.realpath(__file__))
# Common prefix for all CSV files written by these tests.
test_file_prefix = 'pGP_MDCK_efflux_ratio_chembl29'
# Output files produced by the test functions; clean() removes these before a run.
test_files = [f"{script_path}/{test_file_prefix}-{suffix}.csv" for suffix in ['filtered', 'aggregated', 'averaged']]
15+
16+
def clean():
    """
    Remove any output files left over from a previous run of these tests.
    """
    for path in test_files:
        if os.path.isfile(path):
            os.remove(path)
23+
24+
def get_raw_data():
    """
    Load the dataset used to exercise the curation functions.

    Returns:
        pd.DataFrame: raw pGP MDCK efflux ratio data read from the
        test_datasets directory.
    """
    dset_path = os.path.realpath(
        os.path.join(script_path, '../../test_datasets/pGP_MDCK_efflux_ratio_chembl29.csv'))
    return pd.read_csv(dset_path)
32+
33+
def test_remove_outlier_replicates():
    """
    Test outlier removal using curate_data.remove_outlier_replicates.

    Checks that the expected numbers of rows and unique compounds survive
    filtering, writes the filtered table to CSV, and returns it so that
    downstream tests can reuse the result.

    Returns:
        pd.DataFrame: the outlier-filtered data frame.
    """
    raw_df = get_raw_data()
    print(f"Raw data has {len(raw_df)} rows, {len(set(raw_df.base_rdkit_smiles.values))} unique compounds")
    # Drop replicate measurements more than 0.5 log units from the per-compound median
    filt_df = curate_data.remove_outlier_replicates(
        raw_df,
        response_col='log_efflux_ratio',
        id_col='base_rdkit_smiles',
        max_diff_from_median=0.5)
    n_filt_rows = len(filt_df)
    n_filt_cmpds = len(set(filt_df.base_rdkit_smiles.values))
    print(f"Filtered data has {n_filt_rows} rows, {n_filt_cmpds} unique compounds")
    assert (n_filt_rows == 1093), "Error: expected 1093 rows in filtered data"
    assert (n_filt_cmpds == 803), "Error: expected 803 unique compounds in filtered data"
    # Exactly 7 outlier rows are expected to be discarded from this dataset
    n_removed = len(raw_df) - n_filt_rows
    assert (n_removed == 7), f"Error: {n_removed} rows were removed, expected 7"

    filt_file = f"{script_path}/{test_file_prefix}-filtered.csv"
    filt_df.to_csv(filt_file, index=False)
    print(f"Wrote outlier-filtered data to {filt_file}")
    return filt_df
53+
54+
def test_aggregate_assay_data(filt_df=None):
    """
    Test curate_data.aggregate_assay_data, the preferred function for
    averaging replicate values over compounds.

    Args:
        filt_df: outlier-filtered data frame produced by
            test_remove_outlier_replicates(); if None, it is regenerated.
    """
    if filt_df is None:
        filt_df = test_remove_outlier_replicates()
    agg_df = curate_data.aggregate_assay_data(
        filt_df,
        value_col='log_efflux_ratio',
        label_actives=False,
        id_col='compound_id',
        smiles_col='base_rdkit_smiles',
        relation_col='relation')
    n_agg_rows = len(agg_df)
    n_agg_cmpds = len(set(agg_df.base_rdkit_smiles.values))
    print(f"Aggregated data has {n_agg_rows} rows, {n_agg_cmpds} unique compounds")
    # After aggregation there should be exactly one row per unique compound
    assert (n_agg_rows == 803), "Error: expected 803 rows in aggregated data"
    assert (n_agg_cmpds == 803), "Error: expected 803 unique compounds in aggregated data"

    agg_file = f"{script_path}/{test_file_prefix}-aggregated.csv"
    agg_df.to_csv(agg_file, index=False)
    print(f"Wrote aggregated data to {agg_file}")
71+
72+
73+
def test_average_and_remove_duplicates():
    """
    Test outlier removal and averaging using the deprecated curation function
    curate_data.average_and_remove_duplicates.

    Writes the averaged result to a CSV file whose name matches the
    module-level test_files cleanup list.
    """
    raw_df = get_raw_data()

    # tolerance: In each iteration, remove replicate measurements that differ
    # from their mean by more than this percentage of the mean
    tolerance = 50  # percentage
    column = 'log_efflux_ratio'  # column containing measurement values
    list_bad_duplicates = 'Yes'
    data = raw_df
    # max_std: Remove compounds whose standard deviation across replicates exceeds this value
    max_std = 0.5

    curated_df = curate_data.average_and_remove_duplicates(
        column, tolerance, list_bad_duplicates, data, max_std,
        compound_id='compound_id', smiles_col='base_rdkit_smiles')
    print(f"Averaged data has {len(curated_df)} rows, {len(set(curated_df.base_rdkit_smiles.values))} unique compounds")

    # Use the '-averaged.csv' suffix so this output is covered by test_files and
    # removed by clean(); the previous '-curated.csv' name was never cleaned up
    # between runs, and the 'averaged' entry in test_files was otherwise unused.
    averaged_file = f"{script_path}/{test_file_prefix}-averaged.csv"
    curated_df.to_csv(averaged_file, index=False)
    print(f"Wrote averaged data to {averaged_file}")
94+
95+
96+
def test():
    """
    Run the full suite of data curation checks: cleanup, outlier filtering,
    replicate aggregation, and the legacy averaging path.
    """
    # Start from a clean slate so stale output files can't affect the run
    clean()

    # Filter out outliers (preferred method)
    filt_df = test_remove_outlier_replicates()

    # Average replicate values per compound (preferred method)
    test_aggregate_assay_data(filt_df)

    # Remove outliers and average over replicates (old method)
    test_average_and_remove_duplicates()
112+
113+
114+
115+
# Allow the test suite to be run directly as a script
if __name__ == '__main__':
    test()

atomsci/ddm/test/integrative/delaney_NN/test_delaney_NN.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,12 @@ def curate():
5656
column, tolerance, list_bad_duplicates, data, max_std, compound_id='compound_id', smiles_col='rdkit_smiles')
5757

5858
# Check distribution of response values
59-
assert (curated_df.shape[0] == 1117), 'Error: Incorrect number of compounds'
59+
assert (curated_df.shape[0] == 1116), f"Error: Incorrect number of compounds ({len(curated_df)}, should be 1116)"
6060

6161
curated_df.to_csv('delaney-processed_curated.csv')
6262

6363
# Create second test set by reproducible index for prediction
64-
curated_df.tail(1000).to_csv('delaney-processed_curated_fit.csv')
64+
curated_df.tail(999).to_csv('delaney-processed_curated_fit.csv')
6565
curated_df.head(117).to_csv('delaney-processed_curated_external.csv')
6666

6767
assert(os.path.isfile('delaney-processed_curated.csv'))

atomsci/ddm/test/integrative/delaney_Panel/test_delaney_panel.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,12 @@ def curate():
6565
curated_df[column_class+'2'] = curated_df[column_class]
6666

6767
# Check distribution of response values
68-
assert (curated_df.shape[0] == 1117), 'Error: Incorrect number of compounds'
68+
assert (curated_df.shape[0] == 1116), 'Error: Incorrect number of compounds'
6969

7070
curated_df.to_csv('delaney-processed_curated.csv')
7171

7272
# Create second test set by reproducible index for prediction
73-
curated_df.tail(1000).to_csv('delaney-processed_curated_fit.csv')
73+
curated_df.tail(999).to_csv('delaney-processed_curated_fit.csv')
7474
curated_df.head(117).to_csv('delaney-processed_curated_external.csv')
7575

7676
assert (os.path.isfile('delaney-processed_curated.csv'))

atomsci/ddm/test/integrative/delaney_RF/test_delany_RF.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,12 @@ def curate():
5555
column, tolerance, list_bad_duplicates, data, max_std, compound_id='compound_id', smiles_col='rdkit_smiles')
5656

5757
# Check distribution of response values
58-
assert (curated_df.shape[0] == 1117), 'Error: Incorrect number of compounds'
58+
assert (curated_df.shape[0] == 1116), 'Error: Incorrect number of compounds'
5959

6060
curated_df.to_csv('delaney-processed_curated.csv')
6161

6262
# Create second test set by reproducible index for prediction
63-
curated_df.tail(1000).to_csv('delaney-processed_curated_fit.csv')
63+
curated_df.tail(999).to_csv('delaney-processed_curated_fit.csv')
6464
curated_df.head(117).to_csv('delaney-processed_curated_external.csv')
6565

6666
assert (os.path.isfile('delaney-processed_curated.csv'))

atomsci/ddm/test/integrative/delaney_XGB/test_delany_XGB.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,12 @@ def curate():
5555
column, tolerance, list_bad_duplicates, data, max_std, compound_id='compound_id', smiles_col='rdkit_smiles')
5656

5757
# Check distribution of response values
58-
assert (curated_df.shape[0] == 1117), 'Error: Incorrect number of compounds'
58+
assert (curated_df.shape[0] == 1116), 'Error: Incorrect number of compounds'
5959

6060
curated_df.to_csv('delaney-processed_curated.csv')
6161

6262
# Create second test set by reproducible index for prediction
63-
curated_df.tail(1000).to_csv('delaney-processed_curated_fit.csv')
63+
curated_df.tail(999).to_csv('delaney-processed_curated_fit.csv')
6464
curated_df.head(117).to_csv('delaney-processed_curated_external.csv')
6565

6666
assert (os.path.isfile('delaney-processed_curated.csv'))

atomsci/ddm/test/integrative/wenzel_NN/test_wenzel_NN.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def curate():
7878
curated_df.to_csv(data_filename, index=False)
7979

8080
# Create second test set by reproducible index for prediction
81-
curated_df.tail(5000).to_csv('hlm_clearance_curated_fit.csv')
81+
curated_df.tail(4989).to_csv('hlm_clearance_curated_fit.csv')
8282
curated_df.head(348).to_csv('hlm_clearance_curated_external.csv')
8383

8484

0 commit comments

Comments
 (0)