Skip to content

Commit 1ceb458

Browse files
committed
Update hybrid model: works again (II)
1 parent 611e431 commit 1ceb458

File tree

6 files changed

+96
-172
lines changed

6 files changed

+96
-172
lines changed

pypef/hybrid/hybrid_model.py

Lines changed: 19 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -65,18 +65,19 @@ def reduce_by_batch_modulo(a: np.ndarray, batch_size=5) -> np.ndarray:
6565
return a[:reduce]
6666

6767

68-
# TODO: Implementation of other regression techniques (CVRegression models)
68+
# TODO: Implementation of other regression techniques (CVRegression models) [Likely not worth]
6969
# TODO: Differential evolution of multiple Zero Shot predictors
70-
# (and supervised model predictions thereof) and y_true
70+
# (and supervised model predictions thereof) and y_true [DONE]
71+
# TODO: Add contrastive learning option (on PGym data)?
7172
class DCALLMHybridModel:
7273
def __init__(
7374
self,
74-
x_train_dca: np.ndarray, # DCA-encoded sequences
75-
y_train: np.ndarray, # true labels
75+
x_train_dca: np.ndarray,
76+
y_train: np.ndarray,
7677
llm_model_input: dict | None = None,
77-
x_wt: np.ndarray | None = None, # Wild type encoding
78-
alphas: np.ndarray | None = None, # Ridge regression grid for the parameter 'alpha'
79-
parameter_range: list[tuple] | None = None, # Parameter range of 'beta_1' and 'beta_2' with lower bound <= x <= upper bound,
78+
x_wt: np.ndarray | None = None,
79+
alphas: np.ndarray | None = None,
80+
parameter_range: list[tuple] | None = None,
8081
batch_size: int | None = None,
8182
device: str | None = None,
8283
seed: int | None = None
@@ -332,10 +333,10 @@ def get_subsplits_train(self, train_size_fit: float = 0.66):
332333
(train_size_fit * len(self.y_train)) -
333334
((train_size_fit * len(self.y_train)) % self.batch_size)
334335
)
335-
train_test_size = int(
336-
(len(self.y_train) - train_size_fit) -
337-
((len(self.y_train) - train_size_fit) % self.batch_size)
338-
)
336+
#train_test_size = int(
337+
# (len(self.y_train) - train_size_fit) -
338+
# ((len(self.y_train) - train_size_fit) % self.batch_size)
339+
#)
339340
(
340341
self.x_dca_ttrain, self.x_dca_ttest,
341342
self.x_llm_ttrain, self.x_llm_ttest,
@@ -347,15 +348,13 @@ def get_subsplits_train(self, train_size_fit: float = 0.66):
347348
train_size=train_size_fit,
348349
random_state=self.seed
349350
)
350-
# Reducing by batch size modulo for X, attention masks, and y
351+
# Reducing by batch size modulo for X and y
351352
self.x_dca_ttrain = self.x_dca_ttrain[:train_size_fit]
352353
self.x_llm_ttrain = self.x_llm_ttrain[:train_size_fit]
353-
#self.attn_llm_ttrain = self.attn_llm_ttrain[:train_size_fit]
354354
self.y_ttrain = self.y_ttrain[:train_size_fit]
355-
self.x_dca_ttest = self.x_dca_ttest[:train_test_size]
356-
self.x_llm_ttest = self.x_llm_ttest[:train_test_size]
357-
#self.attn_llm_ttest = self.attn_llm_ttest[:train_test_size]
358-
self.y_ttest = self.y_ttest[:train_test_size]
355+
#self.x_dca_ttest = self.x_dca_ttest[:train_test_size]
356+
#self.x_llm_ttest = self.x_llm_ttest[:train_test_size]
357+
#self.y_ttest = self.y_ttest[:train_test_size]
359358

360359
else:
361360
(
@@ -393,7 +392,7 @@ def train_llm(self):
393392
#get_batches(self.attn_llm_ttrain, batch_size=self.batch_size, dtype=int),
394393
get_batches(self.y_ttrain, batch_size=self.batch_size, dtype=float)
395394
)
396-
x_llm_ttest_b = get_batches(self.x_llm_ttest, batch_size=self.batch_size, dtype=int)
395+
397396
#x_llm_ttest_b = get_batches(self.x_llm_ttest, batch_size=self.batch_size, dtype=int)
398397
if self.llm_key == 'prosst':
399398
y_llm_ttest = self.llm_inference_function(
@@ -415,6 +414,7 @@ def train_llm(self):
415414
device=self.device
416415
)
417416
elif self.llm_key == 'esm1v':
417+
x_llm_ttest_b = get_batches(self.x_llm_ttest, batch_size=1, dtype=int)
418418
y_llm_ttest = self.llm_inference_function(
419419
xs=x_llm_ttest_b,
420420
model=self.llm_model,
@@ -585,7 +585,6 @@ def hybrid_prediction(
585585
self.llm_attention_mask,
586586
self.structure_input_ids,
587587
train=False,
588-
#desc='Infering base model',
589588
device=self.device).detach().cpu().numpy()
590589
y_llm_lora = self.llm_inference_function(
591590
x_llm,
@@ -594,30 +593,20 @@ def hybrid_prediction(
594593
self.llm_attention_mask,
595594
self.structure_input_ids,
596595
train=False,
597-
#desc='Infering LoRA-tuned model',
598596
device=self.device).detach().cpu().numpy()
599597
elif self.llm_key == 'esm1v':
600-
x_llm_b = get_batches(x_llm, batch_size=self.batch_size, dtype=int)
598+
x_llm_b = get_batches(x_llm, batch_size=1, dtype=int)
601599
y_llm = self.llm_inference_function(
602600
x_llm_b,
603601
self.llm_attention_mask,
604602
self.llm_base_model,
605-
#desc='Infering base model',
606603
device=self.device).detach().cpu().numpy()
607604
y_llm_lora = self.llm_inference_function(
608605
x_llm_b,
609606
self.llm_attention_mask,
610607
self.llm_model,
611-
#desc='Infering LoRA-tuned model',
612608
device=self.device).detach().cpu().numpy()
613609

614-
615-
y_dca, y_ridge, y_llm, y_llm_lora = (
616-
reduce_by_batch_modulo(y_dca, batch_size=self.batch_size),
617-
reduce_by_batch_modulo(y_ridge, batch_size=self.batch_size),
618-
reduce_by_batch_modulo(y_llm, batch_size=self.batch_size),
619-
reduce_by_batch_modulo(y_llm_lora, batch_size=self.batch_size)
620-
)
621610
return self.beta1 * y_dca + self.beta2 * y_ridge + self.beta3 * y_llm + self.beta4 * y_llm_lora
622611

623612
def split_performance(

pypef/llm/esm_lora_tune.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def esm_infer(xs, attention_mask, model, desc: None | str = None, device: str |
141141
device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
142142
attention_masks = torch.Tensor(np.full(shape=np.shape(xs), fill_value=attention_mask)).to(torch.int64)
143143
print(f'Infering ESM model for predictions using {device.upper()} device...')
144-
for i , (xs_b, am_b) in enumerate(tqdm(zip(xs, attention_masks), total=len(xs), desc=desc)):
144+
for i , (xs_b, am_b) in enumerate(tqdm(zip(xs, attention_masks), total=len(xs), desc="Infering ESM model. Sequence")):
145145
xs_b = xs_b.to(torch.int64)
146146
with torch.no_grad():
147147
y_preds = get_y_pred_scores(xs_b, am_b, model, device)

pypef/llm/prosst_lora_tune.py

Lines changed: 0 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ def get_logits_from_full_seqs(
6969
attention_mask=attention_mask,
7070
ss_input_ids=structure_input_ids
7171
)
72-
7372
logits = torch.log_softmax(outputs.logits[:, 1:-1], dim=-1).squeeze()
7473
for i_s, sequence in enumerate(tqdm(xs, disable=not verbose, desc='Getting ProSST sequence logits')):
7574
for i_aa, x_aa in enumerate(sequence):
@@ -84,9 +83,6 @@ def get_logits_from_full_seqs(
8483
return log_probs
8584

8685

87-
88-
89-
9086
def checkpoint(model, filename):
9187
torch.save(model.state_dict(), filename)
9288

@@ -107,7 +103,6 @@ def prosst_train(
107103
print(f'ProSST training using {device.upper()} device (N_Train={len(torch.flatten(score_batches))})...')
108104
x_sequence_batches = x_sequence_batches.to(device)
109105
score_batches = score_batches.to(device)
110-
111106
pbar_epochs = tqdm(range(1, n_epochs + 1))
112107
epoch_spearman_1 = 0.0
113108
did_not_improve_counter = 0
@@ -191,7 +186,6 @@ def get_structure_quantizied(pdb_file, tokenizer, wt_seq):
191186
return input_ids, attention_mask, structure_input_ids
192187

193188

194-
195189
def prosst_setup(wt_seq, pdb_file, sequences, device: str | None = None):
196190
prosst_base_model, prosst_lora_model, prosst_tokenizer, prosst_optimizer = get_prosst_models()
197191
prosst_vocab = prosst_tokenizer.get_vocab()
@@ -215,66 +209,3 @@ def prosst_setup(wt_seq, pdb_file, sequences, device: str | None = None):
215209
}
216210
}
217211
return llm_dict_prosst
218-
219-
220-
if __name__ == '__main__':
221-
import pandas as pd
222-
import copy
223-
from sklearn.model_selection import train_test_split
224-
import matplotlib.pyplot as plt
225-
# Test on dataset GRB2_HUMAN_Faure_2021: SignificanceResult(statistic=0.6997442598613315, pvalue=0.0)
226-
wt_seq = "MEAIAKYDFKATADDELSFKRGDILKVLNEECDQNWYKAELNGKDGFIPKNYIEMKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKFGNDVQHFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDIEQVPQQPTYVQALFDFDPQEDGELGFRRGDFIHVMDNSDPNWWKGACHGQTGMFPRNYVTPVNRNV"
227-
grb2_folder = os.path.abspath(os.path.join(pypef_path, '..', 'datasets', 'GRB2'))
228-
pdb_file = os.path.join(grb2_folder, 'GRB2_HUMAN.pdb')
229-
csv_file = os.path.join(grb2_folder, 'GRB2_HUMAN_Faure_2021.csv')
230-
df = pd.read_csv(csv_file) #, nrows=120)
231-
print(df)
232-
prosst_base_model, prosst_lora_model, tokenizer, optimizer = get_prosst_models()
233-
vocab = tokenizer.get_vocab()
234-
structure_sequence = PdbQuantizer()(pdb_file=pdb_file)
235-
structure_sequence_offset = [i + 3 for i in structure_sequence]
236-
tokenized_res = tokenizer([wt_seq], return_tensors='pt')
237-
input_ids = tokenized_res['input_ids']
238-
attention_mask = tokenized_res['attention_mask']
239-
structure_input_ids = torch.tensor([1, *structure_sequence_offset, 2], dtype=torch.long).unsqueeze(0)
240-
#y_pred = get_logits_from_full_seqs(df['mutated_sequence'], prosst_model, input_ids, attention_mask, structure_input_ids, train=False)
241-
#print(spearmanr(df['DMS_score'], y_pred.detach().cpu().numpy())) # SignificanceResult(statistic=np.float64(0.7216670719282277), pvalue=np.float64(0.0))
242-
x_sequences = prosst_tokenize_sequences(df['mutated_sequence'], vocab=vocab)
243-
for batch_size in [5, 10, 25, 50, 100]:
244-
train_perfs_unsup, test_perfs_unsup = [], []
245-
train_perfs, test_perfs = [], []
246-
for train_size in [200, 1000, 10000]:
247-
prosst_model_copy = copy.deepcopy(prosst_base_model)
248-
x_train, x_test, scores_train, scores_test = train_test_split(
249-
x_sequences, df['DMS_score'].to_numpy().astype(float), train_size=train_size, random_state=42
250-
)
251-
print(f"\n=========================\nTRAIN SIZE: {train_size} TEST SIZE: {len(x_test)} -- BATCH SIZE: {batch_size}\n=========================")
252-
253-
y_pred = get_logits_from_full_seqs(
254-
x_test, prosst_model_copy, input_ids, attention_mask, structure_input_ids, train=False)
255-
print(f'Train-->Test UNTRAINED Performance (N={len(y_pred.flatten())}):',spearmanr(scores_test, y_pred.detach().cpu().numpy()))
256-
test_perfs_unsup.append(spearmanr(scores_test, y_pred.detach().cpu().numpy()))
257-
258-
259-
y_preds_train_unsup = get_logits_from_full_seqs(
260-
x_train, prosst_model_copy, input_ids, attention_mask, structure_input_ids, train=False, verbose=False)
261-
y_preds_train_unsup = y_preds_train_unsup.cpu().numpy()
262-
print(f'Train-->Train UNTRAINED Performance (N={len(y_preds_train_unsup)}):', spearmanr(scores_train, y_preds_train_unsup))
263-
train_perfs_unsup.append(spearmanr(scores_train, y_preds_train_unsup)[0])
264-
265-
# TRAINING
266-
x_train_b = get_batches(x_train, dtype=int, batch_size=batch_size, verbose=True)
267-
scores_train_b = get_batches(scores_train, dtype=float, batch_size=batch_size, verbose=True)
268-
y_preds_train = prosst_train(x_train_b, scores_train_b, corr_loss, prosst_model_copy, optimizer, pdb_file, n_epochs=500)
269-
print(f'Train-->Train Performance (N={len(y_preds_train)}):', spearmanr(scores_train, y_preds_train))
270-
train_perfs.append(spearmanr(scores_train, y_preds_train)[0])
271-
272-
y_pred = get_logits_from_full_seqs(
273-
x_test, prosst_model_copy, input_ids, attention_mask, structure_input_ids, train=False)
274-
print(f'Train-->Test Performance (N={len(y_pred.flatten())}):', spearmanr(scores_test, y_pred.detach().cpu().numpy()))
275-
test_perfs.append(spearmanr(scores_test, y_pred.detach().cpu().numpy())[0])
276-
for k in [train_perfs_unsup, train_perfs, test_perfs_unsup, test_perfs]:
277-
plt.plot(range(len(k)), k, label=f'Batch size: {batch_size}')
278-
plt.xticks(range(len(k)), [100, 200, 1000, 10000])
279-
plt.legend()
280-
plt.savefig('1.png')

scripts/ProteinGym_runs/results/dca_esm_and_hybrid_opt_results.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@ No.,Dataset,N_Variants,N_Max_Muts,Untrained_Performance_DCA,Untrained_Performanc
99
8,A4GRB6_PSEAI_Chen_2020,5004,1,0.6681056494435768,0.543247747835155,0.647351733217166,0.6105347017831526,0.6007582102931497,0.7414608515883568,0.7053638375627392,0.6894205591814954,0.7677593077079674,0.7245479703656822,0.8059450782290521,0.8107707930340956,5004,4904,4804,4004,1751
1010
9,AACC1_PSEAI_Dandage_2018,1801,1,0.3180712414525488,0.45793953382550573,0.36853292069676097,0.3174612456170627,0.45937287821213785,0.4310318712519802,0.3521756690003161,0.49206057023996924,0.4617532190070763,0.4488626249244256,0.5505813182399806,0.534569265039648,1801,1701,1601,801,869
1111
10,ACE2_HUMAN_Chan_2020,2223,1,0.24320754065919856,0.1855938942334426,0.2613054581997969,0.2985805551410494,0.24485718938989934,0.353286631689331,0.4023145866828806,0.3372015473315942,0.4700770240532049,0.5643356952550576,0.6012504479478733,0.610115781454495,2223,2123,2023,1223,3995
12+
11,ADRB2_HUMAN_Jones_2020,7800,1,0.5187856047925657,0.5310582600087359,0.5151672046363325,0.5183272590334468,0.530515316113512,0.5380119736507377,0.5174727374036995,0.5384330679901763,0.5197786372610751,0.5147111407689821,0.5570931245349682,0.553912098700521,7800,7700,7600,6800,3287
13+
12,AICDA_HUMAN_Gajula_2014_3cycles,209,1,0.41950521618921593,0.4075489898558927,0.274172920796004,0.4423419549584822,0.5069492105992409,0.4054703600769246,nan,nan,nan,nan,nan,nan,209,109,nan,nan,96

scripts/ProteinGym_runs/results/dca_esm_and_hybrid_opt_results_clean.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@ No.,Dataset,N_Variants,N_Max_Muts,Untrained_Performance_DCA,Untrained_Performanc
99
8,A4GRB6_PSEAI_Chen_2020,5004,1,0.6681056494435768,0.543247747835155,0.647351733217166,0.6105347017831526,0.6007582102931497,0.7414608515883568,0.7053638375627392,0.6894205591814954,0.7677593077079674,0.7245479703656822,0.8059450782290521,0.8107707930340956,5004,4904,4804,4004,1751
1010
9,AACC1_PSEAI_Dandage_2018,1801,1,0.3180712414525488,0.45793953382550573,0.36853292069676097,0.3174612456170627,0.45937287821213785,0.4310318712519802,0.3521756690003161,0.49206057023996924,0.4617532190070763,0.4488626249244256,0.5505813182399806,0.534569265039648,1801,1701,1601,801,869
1111
10,ACE2_HUMAN_Chan_2020,2223,1,0.24320754065919856,0.1855938942334426,0.2613054581997969,0.2985805551410494,0.24485718938989934,0.353286631689331,0.4023145866828806,0.3372015473315942,0.4700770240532049,0.5643356952550576,0.6012504479478733,0.610115781454495,2223,2123,2023,1223,3995
12+
11,ADRB2_HUMAN_Jones_2020,7800,1,0.5187856047925657,0.5310582600087359,0.5151672046363325,0.5183272590334468,0.530515316113512,0.5380119736507377,0.5174727374036995,0.5384330679901763,0.5197786372610751,0.5147111407689821,0.5570931245349682,0.553912098700521,7800,7700,7600,6800,3287
13+
12,AICDA_HUMAN_Gajula_2014_3cycles,209,1,0.41950521618921593,0.4075489898558927,0.274172920796004,0.4423419549584822,0.5069492105992409,0.4054703600769246,nan,nan,nan,nan,nan,nan,209,109,nan,nan,96

0 commit comments

Comments
 (0)