|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +import json |
| 4 | +import numpy as np |
| 5 | +import pandas as pd |
| 6 | +import os |
| 7 | +import sys |
| 8 | + |
| 9 | +import atomsci.ddm.utils.curate_data as curate_data |
| 10 | +import atomsci.ddm.utils.struct_utils as struct_utils |
| 11 | + |
# Directory containing this script; all test output files are written here
script_path = os.path.dirname(os.path.realpath(__file__))

# Common prefix for the test output files produced by the curation tests
test_file_prefix = 'pGP_MDCK_efflux_ratio_chembl29'

# Full paths of the output files that clean() is responsible for removing
test_files = [f"{script_path}/{test_file_prefix}-{suffix}.csv"
              for suffix in ('filtered', 'aggregated', 'averaged')]
| 15 | + |
def clean():
    """Delete any test output files left over from previous runs."""
    # Only remove paths that actually exist as regular files
    leftovers = (path for path in test_files if os.path.isfile(path))
    for path in leftovers:
        os.remove(path)
| 23 | + |
def get_raw_data():
    """
    Return a DataFrame for the pGP MDCK efflux ratio dataset used to test the
    curation functions.
    """
    # Dataset lives two directories up from this script, under test_datasets/
    dset_path = os.path.realpath(
        os.path.join(script_path, '../../test_datasets/pGP_MDCK_efflux_ratio_chembl29.csv'))
    return pd.read_csv(dset_path)
| 32 | + |
def test_remove_outlier_replicates():
    """
    Test outlier removal using curate_data.remove_outlier_replicates.

    Returns the outlier-filtered DataFrame so downstream tests can reuse it.
    """
    raw_df = get_raw_data()
    print(f"Raw data has {len(raw_df)} rows, {len(set(raw_df.base_rdkit_smiles.values))} unique compounds")

    # Drop replicate measurements that lie more than 0.5 log units from the
    # per-compound median
    filt_df = curate_data.remove_outlier_replicates(
        raw_df,
        response_col='log_efflux_ratio',
        id_col='base_rdkit_smiles',
        max_diff_from_median=0.5)

    n_filt_rows = len(filt_df)
    n_filt_cmpds = len(set(filt_df.base_rdkit_smiles.values))
    n_removed = len(raw_df) - n_filt_rows
    print(f"Filtered data has {n_filt_rows} rows, {n_filt_cmpds} unique compounds")

    # Expected counts for the chembl29 dataset snapshot
    assert (n_filt_rows == 1093), "Error: expected 1093 rows in filtered data"
    assert (n_filt_cmpds == 803), "Error: expected 803 unique compounds in filtered data"
    assert (n_removed == 7), f"Error: {n_removed} rows were removed, expected 7"

    filt_file = f"{script_path}/{test_file_prefix}-filtered.csv"
    filt_df.to_csv(filt_file, index=False)
    print(f"Wrote outlier-filtered data to {filt_file}")
    return filt_df
| 53 | + |
def test_aggregate_assay_data(filt_df=None):
    """
    Test curate_data.aggregate_assay_data, the preferred function for averaging
    replicate values over compounds.

    Args:
        filt_df: Optional outlier-filtered DataFrame; generated on the fly
            via test_remove_outlier_replicates() when not supplied.
    """
    if filt_df is None:
        filt_df = test_remove_outlier_replicates()

    agg_df = curate_data.aggregate_assay_data(
        filt_df,
        value_col='log_efflux_ratio',
        label_actives=False,
        id_col='compound_id',
        smiles_col='base_rdkit_smiles',
        relation_col='relation')

    n_agg_rows = len(agg_df)
    n_agg_cmpds = len(set(agg_df.base_rdkit_smiles.values))
    print(f"Aggregated data has {n_agg_rows} rows, {n_agg_cmpds} unique compounds")

    # Aggregation should leave exactly one row per unique compound
    assert (n_agg_rows == 803), "Error: expected 803 rows in aggregated data"
    assert (n_agg_cmpds == 803), "Error: expected 803 unique compounds in aggregated data"

    agg_file = f"{script_path}/{test_file_prefix}-aggregated.csv"
    agg_df.to_csv(agg_file, index=False)
    print(f"Wrote aggregated data to {agg_file}")
| 71 | + |
| 72 | + |
def test_average_and_remove_duplicates():
    """
    Test outlier removal and averaging using the deprecated curation function
    curate_data.average_and_remove_duplicates.

    Writes the averaged data to the '-averaged.csv' output file registered in
    test_files so that clean() removes it on subsequent runs.
    """
    raw_df = get_raw_data()

    # tolerance: In each iteration, remove replicate measurements that differ
    # from their mean by more than this percentage of the mean
    tolerance = 50  # percentage
    column = 'log_efflux_ratio'  # column containing measurement values
    list_bad_duplicates = 'Yes'
    # max_std: Remove compounds whose standard deviation across replicates exceeds this value
    max_std = 0.5

    curated_df = curate_data.average_and_remove_duplicates(
        column, tolerance, list_bad_duplicates, raw_df, max_std,
        compound_id='compound_id', smiles_col='base_rdkit_smiles')
    print(f"Averaged data has {len(curated_df)} rows, {len(set(curated_df.base_rdkit_smiles.values))} unique compounds")

    # Bug fix: write to the '-averaged.csv' path listed in test_files; the old
    # '-curated.csv' name was not in that list, so clean() never removed it and
    # stale output accumulated between runs.
    curated_file = f"{script_path}/{test_file_prefix}-averaged.csv"
    curated_df.to_csv(curated_file, index=False)
    print(f"Wrote curated data to {curated_file}")
| 94 | + |
| 95 | + |
def test():
    """
    Test data curation functions.

    Runs the preferred curation workflow (outlier filtering followed by
    replicate aggregation), then the deprecated one-step workflow.
    """
    clean()  # remove output files left over from earlier runs

    # Preferred method: filter outliers, then average replicates per compound
    filt_df = test_remove_outlier_replicates()
    test_aggregate_assay_data(filt_df)

    # Old method: outlier removal and averaging in a single deprecated call
    test_average_and_remove_duplicates()
| 112 | + |
| 113 | + |
| 114 | + |
# Allow the curation tests to be run directly as a script
if __name__ == '__main__':
    test()
0 commit comments