Work in progress: directed evolution with hybrid DCA+LLM models

niklases · niklases · commit 2940f06df9c4 · 2025-04-21T16:29:34.000+02:00
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -66,6 +66,22 @@
             ]
         },
 
+        {
+            "name": "Python: PyPEF MKPS avGFP PS",
+            "type": "debugpy",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "mkps", 
+                "--wt", "P42212_F64L.fasta", 
+                "--input", "avGFP.csv"
+            ]
+        },
+
         {
             "name": "Python: PyPEF ml -e onehot pls_loocv",
             "type": "debugpy",
@@ -282,6 +298,58 @@
             ]
         },
 
+        {
+            "name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP PS: ProSST",
+            "type": "debugpy",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "hybrid", 
+                "-m", "HYBRIDgremlinprosst", 
+                "--ps", "avGFP_prediction_set.fasta", 
+                "--params", "GREMLIN"
+            ]
+        },
+
+        {
+            "name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP PS: ESM1v",
+            "type": "debugpy",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "hybrid", 
+                "-m", "HYBRIDgremlinesm", 
+                "--ps", "avGFP_prediction_set.fasta", 
+                "--params", "GREMLIN"
+            ]
+        },
+
+        {
+            "name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP DirectEvo: ESM1v",
+            "type": "debugpy",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "hybrid", 
+                "directevo",
+                "-m", "HYBRIDgremlinesm", 
+                "--wt", "P42212_F64L.fasta",
+                "--params", "GREMLIN"
+            ]
+        },
+
         {   // PLMC zero-shot steps:
             // 1. $pypef param_inference --params uref100_avgfp_jhmmer_119_plmc_42.6.params
             // 2. $pypef hybrid -t TS.fasl --params PLMC
diff --git a/pypef/hybrid/hybrid_model.py b/pypef/hybrid/hybrid_model.py
@@ -79,6 +79,7 @@ def __init__(
             alphas: np.ndarray | None = None,
             parameter_range: list[tuple] | None = None,
             batch_size: int | None = None,
+            llm_train: bool = True,
             device: str | None = None,
             seed: int | None = None
     ):
@@ -135,6 +136,7 @@ def __init__(
         if batch_size is None:
             batch_size = 5
         self.batch_size = batch_size
+        self.llm_train = llm_train
         (
             self.ridge_opt, 
             self.beta1, 
@@ -408,7 +410,7 @@ def train_llm(self):
                 input_ids=self.input_ids,
                 attention_mask=self.llm_attention_mask,
                 structure_input_ids=self.structure_input_ids,
-                train=True,
+                train=False,
                 device=self.device
             )
             y_llm_ttrain = self.llm_inference_function(
@@ -417,7 +419,7 @@ def train_llm(self):
                 input_ids=self.input_ids,
                 attention_mask=self.llm_attention_mask,
                 structure_input_ids=self.structure_input_ids,
-                train=True,
+                train=False,
                 device=self.device
             )
         elif self.llm_key == 'esm1v':
@@ -1206,7 +1208,11 @@ def performance_ls_ts(
         print(f'Hybrid performance: {spearmanr(y_test, y_test_pred)}')
         save_model_to_dict_pickle(hybrid_model, model_name)
 
-    elif ts_fasta is not None and model_pickle_file is not None and params_file is not None:
+    elif (
+        ts_fasta is not None and 
+        model_pickle_file is not None 
+        and params_file is not None
+        ):
         # # no LS provided --> statistical modeling / no ML
         print(f'Taking model from saved model (Pickle file): {model_pickle_file}...')
         model, model_type = get_model_and_type(model_pickle_file)
@@ -1233,8 +1239,9 @@ def performance_ls_ts(
                 model.hybrid_prediction(x_test, x_llm_test)
             else:
                 y_test_pred = model.hybrid_prediction(x_test)
-
-    elif ts_fasta is not None and model_pickle_file is None:  # no LS provided --> statistical modeling / no ML
+    
+    # no LS provided --> statistical modeling / no ML
+    elif ts_fasta is not None and model_pickle_file is None:  
         print(f"No learning set provided, falling back to statistical DCA model: "
               f"no adjustments of individual hybrid model parameters (\"beta's\").")
         test_sequences, test_variants, y_test = get_sequences_from_file(ts_fasta)
@@ -1354,7 +1361,8 @@ def predict_ps(  # also predicting "pmult" dict directories
     model, model_type = get_model_and_type(model_pickle_file)
 
     if model_type == 'PLMC' or model_type == 'GREMLIN':
-        print(f'Found {model_type} model file. No hybrid model provided - falling back to a statistical DCA model...')
+        print(f'Found {model_type} model file. No hybrid model provided - '
+              f'falling back to a statistical DCA model...')
 
     pmult = [
         'Recomb_Double_Split', 'Recomb_Triple_Split', 'Recomb_Quadruple_Split',
@@ -1377,14 +1385,14 @@ def predict_ps(  # also predicting "pmult" dict directories
                             substitution_sep=separator)
                         ys_pred = get_delta_e_statistical_model(x_test, x_wt)
                     else:  # Hybrid model input requires params from plmc or GREMLIN model plus optional LLM input
-                        x_test, _test_variants, *_ = plmc_or_gremlin_encoding(
+                        x_test, _test_variants, test_sequences, *_ = plmc_or_gremlin_encoding(
                             variants, sequences, None, params_file,
                             threads=threads, verbose=False, substitution_sep=separator
                         )
                         if model.llm_key is None:
                             ys_pred = model.hybrid_prediction(x_test)
                         else:
-                            sequences = [str(seq) for seq in sequences]
+                            sequences = [str(seq) for seq in test_sequences]
                             x_llm_test = llm_embedder(model.llm_model_input, sequences)
                             ys_pred = model.hybrid_prediction(np.asarray(x_test), np.asarray(x_llm_test))
                     for k, y in enumerate(ys_pred):
@@ -1404,6 +1412,7 @@ def predict_ps(  # also predicting "pmult" dict directories
 
     elif prediction_set is not None:  # Predicting single FASTA file sequences
         sequences, variants, _ = get_sequences_from_file(prediction_set)
+        print(len(sequences), len(variants))
         # NaNs are already being removed by the called function
         if model_type != 'Hybrid':  # statistical DCA model
             xs, variants, _, _, x_wt, *_ = plmc_or_gremlin_encoding(
@@ -1412,13 +1421,16 @@ def predict_ps(  # also predicting "pmult" dict directories
             )
             ys_pred = get_delta_e_statistical_model(xs, x_wt)
         else:  # Hybrid model input requires params from plmc or GREMLIN model plus optional LLM input
-            xs, variants, *_ = plmc_or_gremlin_encoding(
+            print(len(variants))
+            xs, variants, sequences, *_ = plmc_or_gremlin_encoding(
                 variants, sequences, None, params_file,
                 threads=threads, verbose=True, substitution_sep=separator
             )
+            print('xs len', len(xs), len(variants))
             if model.llm_key is None:
                 ys_pred = model.hybrid_prediction(xs)
             else:
+                sequences = [str(seq) for seq in sequences]
                 xs_llm = llm_embedder(model.llm_model_input, sequences)
                 ys_pred = model.hybrid_prediction(np.asarray(xs), np.asarray(xs_llm))
         assert len(xs) == len(variants) == len(xs_llm) == len(ys_pred)
@@ -1434,7 +1446,7 @@ def predict_ps(  # also predicting "pmult" dict directories
 def predict_directed_evolution(
         encoder: str,
         variant: str,
-        sequence: str,
+        variant_sequence: str,
         hybrid_model_data_pkl: str
 ) -> Union[str, list]:
     """
@@ -1452,27 +1464,36 @@ def predict_directed_evolution(
 
     if model_type != 'Hybrid':  # statistical DCA model
         xs, variant, _, _, x_wt, *_ = plmc_or_gremlin_encoding(
-            variant, sequence, None, encoder, verbose=False, use_global_model=True)
+            variant, variant_sequence, None, encoder, 
+            verbose=False, use_global_model=True)
         if not list(xs):
             return 'skip'
         y_pred = get_delta_e_statistical_model(xs, x_wt)
-    else:  # model_type == 'Hybrid': Hybrid model input requires params from PLMC or GREMLIN model plus optional LLM input
-        xs, variant, *_ = plmc_or_gremlin_encoding(
-            variant, sequence, None, encoder, verbose=False, use_global_model=True
+    else:  # model_type == 'Hybrid': Hybrid model input requires params 
+        # from PLMC or GREMLIN model plus optional LLM input
+        print(variant, variant_sequence)
+        xs, variant, variant_sequence, *_ = plmc_or_gremlin_encoding(
+            variant, variant_sequence, None, encoder, 
+            verbose=False, use_global_model=True
         )
+        print(variant_sequence)
         if not list(xs):
             return 'skip'
         if model.llm_model_input is None:
             x_llm = None
         else:
-            x_llm = llm_embedder(model.llm_model_input, sequence)
+            x_llm = llm_embedder(model.llm_model_input, variant_sequence)
         try:
+            print(np.shape(xs), np.shape(x_llm),  np.atleast_2d(x_llm))
+            #exit()
             y_pred = model.hybrid_prediction(np.atleast_2d(xs), np.atleast_2d(x_llm))[0]
-        except ValueError:
-            raise SystemError(
-                "Probably a different model was used for encoding than for modeling; "
-                "e.g. using a HYBRIDgremlin model in combination with parameters taken from a PLMC file."
-            )
+        except ValueError as e:
+            raise e  # TODO: Check sequences / mutations
+        #    raise SystemError(
+        #        "Probably a different model was used for encoding than "
+        #        "for modeling; e.g. using a HYBRIDgremlin model in "
+        #        "combination with parameters taken from a PLMC file."
+        #    )
     y_pred = float(y_pred)
 
     return [(y_pred, variant[0][1:])]
diff --git a/pypef/llm/prosst_lora_tune.py b/pypef/llm/prosst_lora_tune.py
@@ -30,7 +30,7 @@
 def prosst_tokenize_sequences(sequences, vocab):
     sequences = np.atleast_1d(sequences).tolist()
     x_sequences = []
-    for sequence in tqdm(sequences, desc='Tokenizing sequences for PRoSST modeling'):
+    for sequence in tqdm(sequences, desc='Tokenizing sequences for ProSST modeling'):
         x_sequence = []
         for aa in sequence:
             x_sequence.append(vocab[aa])
@@ -80,11 +80,16 @@ def get_logits_from_full_seqs(
             if i_aa == 0:
                 seq_log_probs = logits[i_aa, x_aa].reshape(1)
             else:
-                seq_log_probs = torch.cat((seq_log_probs, logits[i_aa, x_aa].reshape(1)), 0)
+                seq_log_probs = torch.cat(
+                    (seq_log_probs, logits[i_aa, x_aa].reshape(1)), 0)
         if i_s == 0:
             log_probs = torch.sum(torch.Tensor(seq_log_probs)).reshape(1)
         else:
-            log_probs = torch.cat((log_probs, torch.sum(torch.Tensor(seq_log_probs)).reshape(1)), 0)
+            log_probs = torch.cat((
+                log_probs, 
+                torch.sum(torch.Tensor(seq_log_probs)).reshape(1)
+                ), 0
+            )
     return log_probs
 
 
@@ -104,8 +109,13 @@ def prosst_train(
     if seed is not None:
         torch.manual_seed(seed)
     if device is None:
-        device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
-    print(f'ProSST training using {device.upper()} device (N_Train={len(torch.flatten(score_batches))})...')
+        device = (
+            "cuda" if torch.cuda.is_available() 
+            else "mps" if torch.backends.mps.is_available() 
+            else "cpu"
+        )
+    print(f"ProSST training using {device.upper()} device "
+          f"(N_Train={len(torch.flatten(score_batches))})...")
     x_sequence_batches = x_sequence_batches.to(device)
     score_batches = score_batches.to(device)
     pbar_epochs = tqdm(range(1, n_epochs + 1))
@@ -119,9 +129,13 @@ def prosst_train(
             pbar_epochs.set_description(f'Epoch {epoch}/{n_epochs}')
         model.train()
         y_preds_detached = []
-        pbar_batches = tqdm(zip(x_sequence_batches, score_batches), total=len(x_sequence_batches), leave=False)
+        pbar_batches = tqdm(zip(x_sequence_batches, score_batches), 
+                            total=len(x_sequence_batches), leave=False)
         for batch, (seqs_b, scores_b) in enumerate(pbar_batches):
-            y_preds_b = get_logits_from_full_seqs(seqs_b, model, input_ids, attention_mask, structure_input_ids, train=True, verbose=False)
+            y_preds_b = get_logits_from_full_seqs(
+                seqs_b, model, input_ids, attention_mask, structure_input_ids, 
+                train=True, verbose=False
+            )
             y_preds_detached.append(y_preds_b.detach().cpu().numpy().flatten())
             loss = loss_fn(scores_b, y_preds_b)
             loss.backward()
@@ -132,7 +146,8 @@ def prosst_train(
                 f"[batch: {batch+1}/{len(x_sequence_batches)} | "
                 f"sequence: {(batch + 1) * len(seqs_b):>5d}/{len(x_sequence_batches) * len(seqs_b)}]  "
             )
-        epoch_spearman_2 = spearmanr(score_batches.cpu().numpy().flatten(), np.array(y_preds_detached).flatten())[0]
+        epoch_spearman_2 = spearmanr(score_batches.cpu().numpy().flatten(), 
+                                     np.array(y_preds_detached).flatten())[0]
         if epoch_spearman_2 == np.nan:
             raise SystemError(
                 f"No correlation between Y_true and Y_pred could be computed...\n"
@@ -143,7 +158,10 @@ def prosst_train(
             did_not_improve_counter = 0
             best_model_epoch = epoch
             best_model_perf = epoch_spearman_2
-            best_model = f"model_saves/Epoch{epoch}-Ntrain{len(score_batches.cpu().numpy().flatten())}-SpearCorr{epoch_spearman_2:.3f}.pt"
+            best_model = (
+                f"model_saves/Epoch{epoch}-Ntrain{len(score_batches.cpu().numpy().flatten())}"
+                f"-SpearCorr{epoch_spearman_2:.3f}.pt"
+            )
             checkpoint(model, best_model)
             epoch_spearman_1 = epoch_spearman_2
             #print(f"Saved current best model as {best_model}")
@@ -167,13 +185,16 @@ def prosst_train(
     y_preds_train = get_logits_from_full_seqs(
         x_sequence_batches.flatten(start_dim=0, end_dim=1), 
         model, input_ids, attention_mask, structure_input_ids, train=False, verbose=False)
-    print(f'Train-->Train Performance (N={len(score_batches.cpu().flatten())}):', spearmanr(score_batches.cpu().flatten(), y_preds_train.cpu()))
+    print(f'Train-->Train Performance (N={len(score_batches.cpu().flatten())}):', 
+          spearmanr(score_batches.cpu().flatten(), y_preds_train.cpu()))
     return y_preds_train.cpu()
 
 
 def get_prosst_models():
-    prosst_base_model = AutoModelForMaskedLM.from_pretrained("AI4Protein/ProSST-2048", trust_remote_code=True)
-    tokenizer = AutoTokenizer.from_pretrained("AI4Protein/ProSST-2048", trust_remote_code=True)
+    prosst_base_model = AutoModelForMaskedLM.from_pretrained(
+        "AI4Protein/ProSST-2048", trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(
+        "AI4Protein/ProSST-2048", trust_remote_code=True)
     peft_config = LoraConfig(r=8, target_modules=["query", "value"])
     prosst_lora_model = get_peft_model(prosst_base_model, peft_config)
     # TODO: Check whether LoRA or base-model parameters work better for ProSST fine-tuning, and which learning rate to use.
@@ -187,7 +208,8 @@ def get_structure_quantizied(pdb_file, tokenizer, wt_seq):
     tokenized_res = tokenizer([wt_seq], return_tensors='pt')
     input_ids = tokenized_res['input_ids']
     attention_mask = tokenized_res['attention_mask']
-    structure_input_ids = torch.tensor([1, *structure_sequence_offset, 2], dtype=torch.long).unsqueeze(0)
+    structure_input_ids = torch.tensor([1, *structure_sequence_offset, 2], 
+                                       dtype=torch.long).unsqueeze(0)
     return input_ids, attention_mask, structure_input_ids
 
 
@@ -214,8 +236,10 @@ def prosst_setup(wt_seq, pdb_file, sequences, device: str | None = None):
     prosst_vocab = prosst_tokenizer.get_vocab()
     prosst_base_model = prosst_base_model.to(device)
     prosst_optimizer = torch.optim.Adam(prosst_lora_model.parameters(), lr=0.0001)
-    input_ids, prosst_attention_mask, structure_input_ids = get_structure_quantizied(pdb_file, prosst_tokenizer, wt_seq)
-    x_llm_train_prosst = prosst_tokenize_sequences(sequences=sequences, vocab=prosst_vocab)
+    input_ids, prosst_attention_mask, structure_input_ids = get_structure_quantizied(
+        pdb_file, prosst_tokenizer, wt_seq)
+    x_llm_train_prosst = prosst_tokenize_sequences(
+        sequences=sequences, vocab=prosst_vocab)
     llm_dict_prosst = {
         'prosst': {
             'llm_base_model': prosst_base_model,
diff --git a/pypef/utils/directed_evolution.py b/pypef/utils/directed_evolution.py
@@ -251,7 +251,7 @@ def in_silico_de(self):
                 predictions = predict_directed_evolution(
                     encoder=self.dca_encoder,
                     variant=self.s_wt[int(new_variant[:-1]) - 1] + new_variant,
-                    sequence=new_sequence,
+                    variant_sequence=new_sequence,
                     hybrid_model_data_pkl=self.model
                 )
             if predictions != 'skip':
diff --git a/scripts/ProteinGym_runs/run_performance_tests_proteingym_hybrid_dca_llm.py b/scripts/ProteinGym_runs/run_performance_tests_proteingym_hybrid_dca_llm.py

Original file line number	Diff line number	Diff line change
`@@ -251,7 +251,7 @@ def in_silico_de(self):`
`251`	`251`	`predictions = predict_directed_evolution(`
`252`	`252`	`encoder=self.dca_encoder,`
`253`	`253`	`variant=self.s_wt[int(new_variant[:-1]) - 1] + new_variant,`
`254`		`- sequence=new_sequence,`
	`254`	`+ variant_sequence=new_sequence,`
`255`	`255`	`hybrid_model_data_pkl=self.model`
`256`	`256`	`)`
`257`	`257`	`if predictions != 'skip':`