OATML-Markslab · andrewcboardman · Sep 13, 2022 · Sep 13, 2022 · Sep 26, 2022
diff --git a/EVE/GMM_model.py b/EVE/GMM_model.py
@@ -0,0 +1,82 @@
+from sklearn import mixture
+import numpy as np
+import os
+import tqdm
+import logging
+
+class GMM_model:
+    """
+    Class for the global-local GMM model trained on single-point mutations for a wild-type protein sequence
+    """
+    def __init__(self, params) -> None:
+
+        # store parameters
+        self.protein_GMM_weight = params['protein_GMM_weight']
+        self.params = dict(
+            n_components=2, 
+            covariance_type='full',
+            max_iter=1000,
+            n_init=30,
+            tol=1e-4
+        )
+
+    def fit_single(self, X_train):
+        model = mixture.GaussianMixture(**self.params)
+        model.fit(X_train)
+        #The pathogenic cluster is the cluster with higher mean value
+        pathogenic_cluster_index = np.argmax(np.array(model.means_).flatten()) 
+        return model, pathogenic_cluster_index
+
+    def fit(self, X_train, proteins_train):
+        # set up to train
+        self.models = {}
+        self.indices = {}
+
+        # train global model
+        gmm, index = self.fit_single(X_train,'main')
+        self.models['main'] = gmm
+        self.indices['main'] = index
+
+        # train local models
+        if self.protein_GMM_weight > 0.0:
+            proteins_list = list(set(proteins_train))
+            for protein in tqdm.tqdm(proteins_list, "Training all protein GMMs"):
+                X_train_protein = X_train[proteins_train == protein]
+                gmm, index = self.fit_single(X_train_protein,protein)
+                self.models[protein] = gmm
+                self.indices[protein] = index                
+
+        return self.models, self.indices
+
+    def predict(self, X_pred, protein):
+        model = self.models[protein]
+        cluster_index = self.indices[protein]
+        scores = model.predict_proba(X_pred)[:,cluster_index]
+        classes = (scores > 0.5).astype(int)
+        return scores, classes
+
+    def predict_weighted(self, X_pred, protein):      
+        scores_protein = self.predict(X_pred, protein)
+        scores_main = self.predict(X_pred, 'main')
+
+        scores_weighted = scores_main * (1 - self.protein_GMM_weight) + \
+            scores_protein * self.protein_GMM_weight
+        classes_weighted = (scores_weighted > 0.5).astype(int)
+        return scores_weighted, classes_weighted
+
+    def get_fitted_params(self):
+        fitted_params = (
+            self.models.keys(), 
+            self.models.values(), 
+            self.indices.values()
+            )
+        output = {}
+        for protein, model, index in zip(*fitted_params):
+            output[protein] = {
+                'index': index, 
+                'means': model.means_,
+                'covar': model.covariances_,
+                'weights': model.weights_
+            }
+        return output
+
diff --git a/EVE/VAE_decoder.py b/EVE/VAE_decoder.py
diff --git a/data/weights/PTEN_HUMAN_theta_0.2.npy b/data/weights/PTEN_HUMAN_theta_0.2.npy
diff --git a/...Step2_compute_evol_indices_all_singles.sh → examples/Step2_predict_ELBO.sh b/...Step2_compute_evol_indices_all_singles.sh → examples/Step2_predict_ELBO.sh
@@ -13,7 +13,7 @@ export output_evol_indices_location='./results/evol_indices'
 export num_samples_compute_evol_indices=20000
 export batch_size=2048
 
-python compute_evol_indices.py \
+python predict_ELBO.py \
     --MSA_data_folder ${MSA_data_folder} \
     --MSA_list ${MSA_list} \
     --protein_index ${protein_index} \

diff --git a/...GMM_and_compute_EVE_scores_all_singles.sh → examples/Step3_predict_GMM_score.sh b/...GMM_and_compute_EVE_scores_all_singles.sh → examples/Step3_predict_GMM_score.sh
@@ -3,15 +3,12 @@ export input_evol_indices_filename_suffix='_20000_samples'
 export protein_list='./data/mappings/example_mapping.csv'
 export output_eve_scores_location='./results/EVE_scores'
 export output_eve_scores_filename_suffix='Jan1_PTEN_example'
-
 export GMM_parameter_location='./results/GMM_parameters/Default_GMM_parameters'
 export GMM_parameter_filename_suffix='default'
 export protein_GMM_weight=0.3
-export plot_location='./results'
-export labels_file_location='./data/labels/PTEN_ClinVar_labels.csv'
 export default_uncertainty_threshold_file_location='./utils/default_uncertainty_threshold.json'
 
-python train_GMM_and_compute_EVE_scores.py \
+python predict_GMM_score.py \
     --input_evol_indices_location ${input_evol_indices_location} \
     --input_evol_indices_filename_suffix ${input_evol_indices_filename_suffix} \
     --protein_list ${protein_list} \
@@ -22,11 +19,6 @@ python train_GMM_and_compute_EVE_scores.py \
     --GMM_parameter_filename_suffix ${GMM_parameter_filename_suffix} \
     --compute_EVE_scores \
     --protein_GMM_weight ${protein_GMM_weight} \
-    --plot_histograms \
-    --plot_scores_vs_labels \
-    --plot_location ${plot_location} \
-    --labels_file_location ${labels_file_location} \
-    --default_uncertainty_threshold_file_location ${default_uncertainty_threshold_file_location} \
+    --compute_uncertainty_thresholds \
     --verbose
-
 
diff --git a/examples/full_pipeline_PTEN.sh b/examples/full_pipeline_PTEN.sh
@@ -0,0 +1,73 @@
+export MSA_data_folder='./data/MSA'
+export MSA_list='./data/mappings/example_mapping.csv'
+export MSA_weights_location='./data/weights'
+export VAE_checkpoint_location='./results/VAE_parameters'
+export model_name_suffix='Jan1_PTEN_example'
+export model_parameters_location='./EVE/default_model_params.json'
+export training_logs_location='./logs/'
+export protein_index=0
+
+python train_VAE.py \
+    --MSA_data_folder ${MSA_data_folder} \
+    --MSA_list ${MSA_list} \
+    --protein_index ${protein_index} \
+    --MSA_weights_location ${MSA_weights_location} \
+    --VAE_checkpoint_location ${VAE_checkpoint_location} \
+    --model_name_suffix ${model_name_suffix} \
+    --model_parameters_location ${model_parameters_location} \
+    --training_logs_location ${training_logs_location} \
+    --verbose
+
+export computation_mode='all_singles'
+export all_singles_mutations_folder='./data/mutations'
+export evol_indices_location='./results/evol_indices'
+export num_samples_compute_evol_indices=20000
+export batch_size=2048
+
+python predict_ELBO.py \
+    --MSA_data_folder ${MSA_data_folder} \
+    --MSA_list ${MSA_list} \
+    --protein_index ${protein_index} \
+    --MSA_weights_location ${MSA_weights_location} \
+    --VAE_checkpoint_location ${VAE_checkpoint_location} \
+    --model_name_suffix ${model_name_suffix} \
+    --model_parameters_location ${model_parameters_location} \
+    --computation_mode ${computation_mode} \
+    --all_singles_mutations_folder ${all_singles_mutations_folder} \
+    --output_evol_indices_location ${output_evol_indices_location} \
+    --num_samples_compute_evol_indices ${num_samples_compute_evol_indices} \
+    --batch_size ${batch_size}
+    --verbose
+
+export evol_indices_filename_suffix='_20000_samples'
+export protein_list='./data/mappings/example_mapping.csv'
+export eve_scores_location='./results/EVE_scores'
+export eve_scores_filename_suffix='Jan1_PTEN_example'
+export GMM_parameter_location='./results/GMM_parameters/Default_GMM_parameters'
+export GMM_parameter_filename_suffix='default'
+export protein_GMM_weight=0.3
+export default_uncertainty_threshold_file_location='./utils/default_uncertainty_threshold.json'
+
+python predict_GMM_score.py \
+    --input_evol_indices_location ${evol_indices_location} \
+    --input_evol_indices_filename_suffix ${evol_indices_filename_suffix} \
+    --protein_list ${protein_list} \
+    --output_eve_scores_location ${eve_scores_location} \
+    --output_eve_scores_filename_suffix ${eve_scores_filename_suffix} \
+    --GMM_parameter_location ${GMM_parameter_location} \
+    --GMM_parameter_filename_suffix ${GMM_parameter_filename_suffix} \
+    --protein_GMM_weight ${protein_GMM_weight} \
+    --compute_uncertainty_thresholds \
+    --verbose
+
+export plot_location='./results'
+export labels_file_location='./data/labels/PTEN_ClinVar_labels.csv'
+
+python plot_scores_and_labels.py \
+    --input_eve_scores_location ${eve_scores_location} \
+    --input_eve_scores_filename_suffix ${eve_scores_filename_suffix} \
+    --labels_file_location ${labels_file_location} \
+    --plot_location ${plot_location} \
+    --plot_histograms \
+    --plot_scores_vs_labels \
+    --verbose
diff --git a/logs/PTEN_HUMAN_Jan1_PTEN_example_losses.csv b/logs/PTEN_HUMAN_Jan1_PTEN_example_losses.csv
@@ -0,0 +1,33 @@
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
+Number of sequences in alignment file:	1179
+Neff:	131.5475617794571
+Alignment sequence length:	387
diff --git a/plot_scores_and_labels.py b/plot_scores_and_labels.py
@@ -0,0 +1,90 @@
+import argparse
+import os
+import tqdm
+import pickle
+import pandas as pd
+from utils import plot_helpers
+
+
+def main(args):
+    model_location = args.gmm_model_location
+    with open(model_location, 'rb') as fid:
+        gmm_model = pickle.load(fid)
+    scores_location = args.input_eve_scores_location + os.sep + \
+        'all_EVE_scores_'+ \
+        args.output_eve_scores_filename_suffix+'.csv'
+    all_scores = pd.read_csv(
+        scores_location, 
+        index=False
+        )
+    protein_list = list(all_scores.protein_name.unique())
+
+    if args.plot_elbo_histograms:
+        # Plot fit of mixture model to predicted scores
+        histograms_location = \
+            args.plot_location+os.sep+\
+            'plots_histograms'+os.sep+\
+            args.output_eve_scores_filename_suffix
+        if not os.path.exists(histograms_location):
+            os.makedirs(histograms_location)
+        plot_helpers.plot_histograms(
+            all_scores, 
+            gmm_model,
+            histograms_location,
+            protein_list
+        )
+
+    if args.plot_scores_vs_labels:
+        labels_dataset = pd.read_csv(
+            args.labels_file_location,
+            low_memory=False,
+            usecols=['protein_name','mutations','ClinVar_labels']
+        )
+        labels_dataset = labels_dataset[labels_dataset.ClinVar_labels.isin([0,1])]
+        all_scores_labelled = pd.merge(
+            all_scores, 
+            labels_dataset, 
+            how='inner', 
+            on=['protein_name','mutations']
+        )
+        labelled_scores_location = args.input_eve_scores_location + os.sep + \
+        'all_EVE_scores_labelled_'+ \
+        args.output_eve_scores_filename_suffix+'.csv'
+        all_scores_labelled.to_csv(
+            labelled_scores_location, index=False
+        )
+
+        # Plot scores against clinical labels
+        scores_vs_labels_plot_location = \
+            args.plot_location+os.sep+\
+            'plots_scores_vs_labels'+os.sep+\
+            args.output_eve_scores_filename_suffix
+        if not os.path.exists(scores_vs_labels_plot_location):
+            os.makedirs(scores_vs_labels_plot_location)
+        for protein in tqdm.tqdm(protein_list,"Plot scores Vs labels"):
+            protein_scores = all_scores_labelled[
+                all_scores_labelled.protein_name==protein
+            ]
+            output_suffix = args.output_eve_scores_filename_suffix+'_'+protein
+            plot_helpers.plot_scores_vs_labels(
+                score_df=protein_scores, 
+                plot_location=scores_vs_labels_plot_location,
+                output_eve_scores_filename_suffix=output_suffix,
+                mutation_name='mutations', 
+                score_name="EVE_scores", 
+                label_name='ClinVar_labels'
+            )
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Plot EVE scores against standard labels')
+    parser.add_argument('--input_eve_scores_location', type=str, help='Folder where all EVE scores are stored')
+    parser.add_argument('--input_eve_scores_filename_suffix', default='', type=str, help='(Optional) Suffix to be added to output filename')
+    parser.add_argument('--labels_file_location', default=None, type=str, help='File with ground truth labels for all proteins of interest (e.g., ClinVar)')
+    parser.add_argument('--plot_location', default=None, type=str, help='Location of the different plots')
+    parser.add_argument('--plot_elbo_histograms', default=False, action='store_true', help='Plots all evol indices histograms with GMM fits')
+    parser.add_argument('--plot_scores_vs_labels', default=False, action='store_true', help='Plots EVE scores Vs labels at each protein position')
+    parser.add_argument('--verbose', action='store_true', help='Print detailed information during run')
+    args = parser.parse_args()
+
+
+    main(args)