Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions EVE/GMM_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from sklearn import mixture
import numpy as np
import os
import tqdm
import logging

class GMM_model:
"""
Class for the global-local GMM model trained on single-point mutations for a wild-type protein sequence
"""
def __init__(self, params) -> None:

# store parameters
self.protein_GMM_weight = params['protein_GMM_weight']
self.params = dict(
n_components=2,
covariance_type='full',
max_iter=1000,
n_init=30,
tol=1e-4
)

def fit_single(self, X_train):
model = mixture.GaussianMixture(**self.params)
model.fit(X_train)
#The pathogenic cluster is the cluster with higher mean value
pathogenic_cluster_index = np.argmax(np.array(model.means_).flatten())
return model, pathogenic_cluster_index

def fit(self, X_train, proteins_train):
# set up to train
self.models = {}
self.indices = {}

# train global model
gmm, index = self.fit_single(X_train,'main')
self.models['main'] = gmm
self.indices['main'] = index

# train local models
if self.protein_GMM_weight > 0.0:
proteins_list = list(set(proteins_train))
for protein in tqdm.tqdm(proteins_list, "Training all protein GMMs"):
X_train_protein = X_train[proteins_train == protein]
gmm, index = self.fit_single(X_train_protein,protein)
self.models[protein] = gmm
self.indices[protein] = index

return self.models, self.indices

def predict(self, X_pred, protein):
model = self.models[protein]
cluster_index = self.indices[protein]
scores = model.predict_proba(X_pred)[:,cluster_index]
classes = (scores > 0.5).astype(int)
return scores, classes

def predict_weighted(self, X_pred, protein):
scores_protein = self.predict(X_pred, protein)
scores_main = self.predict(X_pred, 'main')

scores_weighted = scores_main * (1 - self.protein_GMM_weight) + \
scores_protein * self.protein_GMM_weight
classes_weighted = (scores_weighted > 0.5).astype(int)
return scores_weighted, classes_weighted

def get_fitted_params(self):
fitted_params = (
self.models.keys(),
self.models.values(),
self.indices.values()
)
output = {}
for protein, model, index in zip(*fitted_params):
output[protein] = {
'index': index,
'means': model.means_,
'covar': model.covariances_,
'weights': model.weights_
}
return output

273 changes: 156 additions & 117 deletions EVE/VAE_decoder.py

Large diffs are not rendered by default.

Binary file added data/weights/PTEN_HUMAN_theta_0.2.npy
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ export output_evol_indices_location='./results/evol_indices'
export num_samples_compute_evol_indices=20000
export batch_size=2048

python compute_evol_indices.py \
python predict_ELBO.py \
--MSA_data_folder ${MSA_data_folder} \
--MSA_list ${MSA_list} \
--protein_index ${protein_index} \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,12 @@ export input_evol_indices_filename_suffix='_20000_samples'
export protein_list='./data/mappings/example_mapping.csv'
export output_eve_scores_location='./results/EVE_scores'
export output_eve_scores_filename_suffix='Jan1_PTEN_example'

export GMM_parameter_location='./results/GMM_parameters/Default_GMM_parameters'
export GMM_parameter_filename_suffix='default'
export protein_GMM_weight=0.3
export plot_location='./results'
export labels_file_location='./data/labels/PTEN_ClinVar_labels.csv'
export default_uncertainty_threshold_file_location='./utils/default_uncertainty_threshold.json'

python train_GMM_and_compute_EVE_scores.py \
python predict_GMM_score.py \
--input_evol_indices_location ${input_evol_indices_location} \
--input_evol_indices_filename_suffix ${input_evol_indices_filename_suffix} \
--protein_list ${protein_list} \
Expand All @@ -22,11 +19,6 @@ python train_GMM_and_compute_EVE_scores.py \
--GMM_parameter_filename_suffix ${GMM_parameter_filename_suffix} \
--compute_EVE_scores \
--protein_GMM_weight ${protein_GMM_weight} \
--plot_histograms \
--plot_scores_vs_labels \
--plot_location ${plot_location} \
--labels_file_location ${labels_file_location} \
--default_uncertainty_threshold_file_location ${default_uncertainty_threshold_file_location} \
--compute_uncertainty_thresholds \
--verbose


73 changes: 73 additions & 0 deletions examples/full_pipeline_PTEN.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
export MSA_data_folder='./data/MSA'
export MSA_list='./data/mappings/example_mapping.csv'
export MSA_weights_location='./data/weights'
export VAE_checkpoint_location='./results/VAE_parameters'
export model_name_suffix='Jan1_PTEN_example'
export model_parameters_location='./EVE/default_model_params.json'
export training_logs_location='./logs/'
export protein_index=0

python train_VAE.py \
--MSA_data_folder ${MSA_data_folder} \
--MSA_list ${MSA_list} \
--protein_index ${protein_index} \
--MSA_weights_location ${MSA_weights_location} \
--VAE_checkpoint_location ${VAE_checkpoint_location} \
--model_name_suffix ${model_name_suffix} \
--model_parameters_location ${model_parameters_location} \
--training_logs_location ${training_logs_location} \
--verbose

export computation_mode='all_singles'
export all_singles_mutations_folder='./data/mutations'
export evol_indices_location='./results/evol_indices'
export num_samples_compute_evol_indices=20000
export batch_size=2048

python predict_ELBO.py \
--MSA_data_folder ${MSA_data_folder} \
--MSA_list ${MSA_list} \
--protein_index ${protein_index} \
--MSA_weights_location ${MSA_weights_location} \
--VAE_checkpoint_location ${VAE_checkpoint_location} \
--model_name_suffix ${model_name_suffix} \
--model_parameters_location ${model_parameters_location} \
--computation_mode ${computation_mode} \
--all_singles_mutations_folder ${all_singles_mutations_folder} \
--output_evol_indices_location ${output_evol_indices_location} \
--num_samples_compute_evol_indices ${num_samples_compute_evol_indices} \
--batch_size ${batch_size}
--verbose

export evol_indices_filename_suffix='_20000_samples'
export protein_list='./data/mappings/example_mapping.csv'
export eve_scores_location='./results/EVE_scores'
export eve_scores_filename_suffix='Jan1_PTEN_example'
export GMM_parameter_location='./results/GMM_parameters/Default_GMM_parameters'
export GMM_parameter_filename_suffix='default'
export protein_GMM_weight=0.3
export default_uncertainty_threshold_file_location='./utils/default_uncertainty_threshold.json'

python predict_GMM_score.py \
--input_evol_indices_location ${evol_indices_location} \
--input_evol_indices_filename_suffix ${evol_indices_filename_suffix} \
--protein_list ${protein_list} \
--output_eve_scores_location ${eve_scores_location} \
--output_eve_scores_filename_suffix ${eve_scores_filename_suffix} \
--GMM_parameter_location ${GMM_parameter_location} \
--GMM_parameter_filename_suffix ${GMM_parameter_filename_suffix} \
--protein_GMM_weight ${protein_GMM_weight} \
--compute_uncertainty_thresholds \
--verbose

export plot_location='./results'
export labels_file_location='./data/labels/PTEN_ClinVar_labels.csv'

python plot_scores_and_labels.py \
--input_eve_scores_location ${eve_scores_location} \
--input_eve_scores_filename_suffix ${eve_scores_filename_suffix} \
--labels_file_location ${labels_file_location} \
--plot_location ${plot_location} \
--plot_histograms \
--plot_scores_vs_labels \
--verbose
33 changes: 33 additions & 0 deletions logs/PTEN_HUMAN_Jan1_PTEN_example_losses.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
Number of sequences in alignment file: 1179
Neff: 131.5475617794571
Alignment sequence length: 387
90 changes: 90 additions & 0 deletions plot_scores_and_labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import argparse
import os
import tqdm
import pickle
import pandas as pd
from utils import plot_helpers


def main(args):
model_location = args.gmm_model_location
with open(model_location, 'rb') as fid:
gmm_model = pickle.load(fid)
scores_location = args.input_eve_scores_location + os.sep + \
'all_EVE_scores_'+ \
args.output_eve_scores_filename_suffix+'.csv'
all_scores = pd.read_csv(
scores_location,
index=False
)
protein_list = list(all_scores.protein_name.unique())

if args.plot_elbo_histograms:
# Plot fit of mixture model to predicted scores
histograms_location = \
args.plot_location+os.sep+\
'plots_histograms'+os.sep+\
args.output_eve_scores_filename_suffix
if not os.path.exists(histograms_location):
os.makedirs(histograms_location)
plot_helpers.plot_histograms(
all_scores,
gmm_model,
histograms_location,
protein_list
)

if args.plot_scores_vs_labels:
labels_dataset = pd.read_csv(
args.labels_file_location,
low_memory=False,
usecols=['protein_name','mutations','ClinVar_labels']
)
labels_dataset = labels_dataset[labels_dataset.ClinVar_labels.isin([0,1])]
all_scores_labelled = pd.merge(
all_scores,
labels_dataset,
how='inner',
on=['protein_name','mutations']
)
labelled_scores_location = args.input_eve_scores_location + os.sep + \
'all_EVE_scores_labelled_'+ \
args.output_eve_scores_filename_suffix+'.csv'
all_scores_labelled.to_csv(
labelled_scores_location, index=False
)

# Plot scores against clinical labels
scores_vs_labels_plot_location = \
args.plot_location+os.sep+\
'plots_scores_vs_labels'+os.sep+\
args.output_eve_scores_filename_suffix
if not os.path.exists(scores_vs_labels_plot_location):
os.makedirs(scores_vs_labels_plot_location)
for protein in tqdm.tqdm(protein_list,"Plot scores Vs labels"):
protein_scores = all_scores_labelled[
all_scores_labelled.protein_name==protein
]
output_suffix = args.output_eve_scores_filename_suffix+'_'+protein
plot_helpers.plot_scores_vs_labels(
score_df=protein_scores,
plot_location=scores_vs_labels_plot_location,
output_eve_scores_filename_suffix=output_suffix,
mutation_name='mutations',
score_name="EVE_scores",
label_name='ClinVar_labels'
)

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Plot EVE scores against standard labels')
parser.add_argument('--input_eve_scores_location', type=str, help='Folder where all EVE scores are stored')
parser.add_argument('--input_eve_scores_filename_suffix', default='', type=str, help='(Optional) Suffix to be added to output filename')
parser.add_argument('--labels_file_location', default=None, type=str, help='File with ground truth labels for all proteins of interest (e.g., ClinVar)')
parser.add_argument('--plot_location', default=None, type=str, help='Location of the different plots')
parser.add_argument('--plot_elbo_histograms', default=False, action='store_true', help='Plots all evol indices histograms with GMM fits')
parser.add_argument('--plot_scores_vs_labels', default=False, action='store_true', help='Plots EVE scores Vs labels at each protein position')
parser.add_argument('--verbose', action='store_true', help='Print detailed information during run')
args = parser.parse_args()


main(args)
Loading