niklases
diff --git a/‎pypef/gaussian_process/gp_prosst_test.py‎
Lines changed: 64 additions & 34 deletions b/‎pypef/gaussian_process/gp_prosst_test.py‎
Lines changed: 64 additions & 34 deletions
diff --git a/‎pypef/hybrid/hybrid_model.py‎
Lines changed: 34 additions & 14 deletions b/‎pypef/hybrid/hybrid_model.py‎
Lines changed: 34 additions & 14 deletions
@@ -1,3 +1,16 @@
+
+
+"""
+Gaussian process optimization similar to (but less sophisticated than)
+Kermut: Composite kernel regression for protein variant effects
+Peter Mørch Groth, Mads Herbert Kerrn, Lars Olsen, Jesper Salomon, Wouter Boomsma
+2024, 38th Conference on Neural Information Processing Systems (NeurIPS 2024).
+TL;DR: Gaussian process regression model with a novel composite kernel, Kermut, achieves 
+state-of-the-art variant effect prediction while providing meaningful uncertainties.
+https://openreview.net/forum?id=jM9atrvUii
+"""
+
+
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split
@@ -10,12 +23,11 @@
 
 from tqdm import tqdm
 
-from pypef.llm.prosst_lora_tune import (
-    get_logits_from_full_seqs, get_prosst_models, get_structure_quantizied, 
-    prosst_tokenize_sequences, prosst_train
+from pypef.plm.prosst_lora_tune import (
+    get_prosst_models, get_structure_quantizied
 )
-from pypef.llm.inference import inference
-from pypef.utils.helpers import get_vram, get_device
+from pypef.plm.inference import tokenize_sequences, plm_inference
+from pypef.utils.helpers import get_device
 
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -76,14 +88,16 @@ def extract_prosst_embeddings(
         structures = [structures] * len(sequences)
 
     assert len(sequences) == len(structures), \
-        "Number of sequences must match number of structures"
+        (f"Number of sequences must match number of structures "
+         f"{len(sequences)} != {len(structures)}")
 
     for seq, struct in tqdm(zip(sequences, structures),
                             total=len(sequences),
                             desc="Embedding (ProSST)"):
         # Tokenize sequence
         tokenized = prosst_tokenizer(
             [seq],
+            max_length=len(seq) + 2,
             return_tensors="pt",
             padding=False,
             truncation=False
@@ -124,29 +138,32 @@ def extract_prosst_embeddings(
     return X
 
 
+def gp_fit():
+    pass
 
 
 
+def gp_fit_sklearn():
+    pass
+
+
 
 
 if __name__ == '__main__':
-    wt_seq = list(read_fasta_biopython('example_data/blat_ecolx/blat_ecolx_wt_seq.fa').values())[0]
-    pdb = 'example_data/blat_ecolx/BLAT_ECOLX.pdb'
+    wt_seq = list(read_fasta_biopython('datasets/BLAT_ECOLX/blat_ecolx_wt.fasta').values())[0]
+    pdb = 'datasets/BLAT_ECOLX/BLAT_ECOLX.pdb'
     device = get_device()
     print("Getting ProSST models")
     prosst_base_model, prosst_lora_model, prosst_tokenizer, prosst_optimizer = get_prosst_models()
     prosst_vocab = prosst_tokenizer.get_vocab()
     prosst_base_model = prosst_base_model.to(device)
 
     print(f"Getting structure tokens...")
-    input_ids, prosst_attention_mask, structure_input_ids = get_structure_quantizied(
+    wt_input_ids, prosst_attention_mask, structure_input_ids = get_structure_quantizied(
         pdb, prosst_tokenizer, wt_seq, verbose=True
     )
 
-
-
-
-    df = pd.read_csv('example_data/blat_ecolx/BLAT_ECOLX_Stiffler_2015.csv')
+    df = pd.read_csv('datasets/BLAT_ECOLX/BLAT_ECOLX_Stiffler_2015.csv')
     sequences = df['mutated_sequence'].to_list()
     y = df['DMS_score'].to_list()
 
@@ -156,10 +173,18 @@ def extract_prosst_embeddings(
     # --- Step 2: Extract ProSST embeddings ---
     print(structure_input_ids)
     print('np.shape(structure_input_ids):', np.shape(structure_input_ids))
-    wt_structure_input_ids = structure_input_ids[0, 1:-1].tolist()  # Remove CLS/EOS
-    X_train = extract_prosst_embeddings(prosst_base_model, prosst_tokenizer, s_train, wt_structure_input_ids)
+    wt_structure_input_ids = structure_input_ids  
+
+    X_emb_train = extract_prosst_embeddings(prosst_base_model, prosst_tokenizer, s_train, wt_structure_input_ids[0, 1:-1].tolist()) # Remove CLS/EOS
+
+    x_train_2, prosst_attention_mask_2 = tokenize_sequences(s_train, prosst_tokenizer)
+    assert len(prosst_attention_mask[0]) == len(prosst_attention_mask_2), f"{len(prosst_attention_mask[0])}\n  !=\n  {len(prosst_attention_mask_2)}"
+
+    X_emb_train_2 = plm_inference(x_train_2, wt_input_ids, prosst_attention_mask, prosst_base_model, 
+                                  extract_emb=True, wt_structure_input_ids=wt_structure_input_ids) #[0, 1:-1])
+
     print("Embedding extraction done")
-    print(np.shape(X_train))
+    assert np.shape(X_emb_train) == np.shape(X_emb_train_2)
 
     # --- Step 3: Fit Gaussian Process ---
     if USE_SCIKIT_LEARN:
@@ -168,12 +193,12 @@ def extract_prosst_embeddings(
 
         kernel = 1.0 * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1)
         gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, normalize_y=True)
-        gpr.fit(X_train, y_train)
+        gpr.fit(X_emb_train, y_train)
 
     else:  # GPYTORCH
         import gpytorch
 
-        X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
+        X_train_t = torch.tensor(X_emb_train, dtype=torch.float32).to(device)
         y_train_t = torch.tensor(y_train, dtype=torch.float32).to(device)
 
         likelihood = gpytorch.likelihoods.GaussianLikelihood().to(device)
@@ -195,26 +220,31 @@ def extract_prosst_embeddings(
 
     # --- Step 4: Extract ProSST embeddings for test sequences ---
     print("Extracting ProSST embeddings for test sequences...")
-    X_test = extract_prosst_embeddings(
+    X_emb_test = extract_prosst_embeddings(
         model=prosst_base_model,
         prosst_tokenizer=prosst_tokenizer,
         sequences=s_test,
-        structures=wt_structure_input_ids,  # still using same WT structure
+        structures=wt_structure_input_ids[0, 1:-1].tolist(),  # still using same WT structure
         device=device
     )
-    print("Test embeddings shape:", X_test.shape)
+    print("Test embeddings shape:", X_emb_test.shape)
+
+    x_test_2, prosst_attention_mask_2 = tokenize_sequences(s_test, prosst_tokenizer)
+    X_emb_test_2 = plm_inference(x_test_2, wt_input_ids, prosst_attention_mask, prosst_base_model, 
+                                  extract_emb=True, wt_structure_input_ids=wt_structure_input_ids)  #[0, 1:-1])
 
     # --- Step 5: Predict with Gaussian Process ---
-    if USE_SCIKIT_LEARN:
-        y_mean, y_std = gpr.predict(X_test, return_std=True)
-    else:
-        X_test_t = torch.tensor(X_test, dtype=torch.float32).to(device)
-        gp_model.eval()
-        likelihood.eval()
-        with torch.no_grad(), gpytorch.settings.fast_pred_var():
-            pred = likelihood(gp_model(X_test_t))
-            y_mean = pred.mean.cpu().numpy()
-            lower, upper = pred.confidence_region()  # optional 95% CI
-
-    print("Predicted fitness:", y_mean)
-    print("Spearman correlation:", spearmanr(y_test, y_mean))
+    for x_t in [torch.tensor(X_emb_test).to(device), X_emb_test_2]:
+        if USE_SCIKIT_LEARN:
+            y_mean, y_std = gpr.predict(x_t, return_std=True)
+        else:
+            #X_test_t = torch.tensor(x_t, dtype=torch.float32).to(device)
+            gp_model.eval()
+            likelihood.eval()
+            with torch.no_grad(), gpytorch.settings.fast_pred_var():
+                pred = likelihood(gp_model(x_t))
+                y_mean = pred.mean.cpu().numpy()
+                lower, upper = pred.confidence_region()  # optional 95% CI
+
+        print("Predicted fitness:", y_mean)
+        print("Spearman correlation:", spearmanr(y_test, y_mean))  #  SignificanceResult(statistic=0.8617991109937434, pvalue=0.0)
@@ -39,7 +39,7 @@
 from pypef.dca.gremlin_inference import GREMLIN, get_delta_e_statistical_model
 from pypef.plm.esm_lora_tune import get_esm_models
 from pypef.plm.prosst_lora_tune import get_prosst_models
-from pypef.plm.inference import esm_setup, prosst_setup, llm_tokenizer, inference
+from pypef.plm.inference import esm_setup, prosst_setup, tokenize_sequences, plm_inference
 from pypef.plm.utils import get_batches
 
 # sklearn/base.py:474: FutureWarning: `BaseEstimator._validate_data` is deprecated in 1.6 and 
@@ -90,6 +90,7 @@ def __init__(
                     self.x_train_llm = llm_model_input['esm1v']['x_llm']
                     self.wt_input_ids = llm_model_input['esm1v']['wt_input_ids']
                     self.llm_attention_mask = llm_model_input['esm1v']['llm_attention_mask']
+                    self.llm_tokenizer = llm_model_input['esm1v']['llm_tokenizer']
                 elif len(list(llm_model_input.keys())) == 1 and list(llm_model_input.keys())[0] == 'prosst':
                     self.llm_key = 'prosst'
                     self.llm_base_model = llm_model_input['prosst']['llm_base_model']
@@ -102,6 +103,7 @@ def __init__(
                     self.llm_attention_mask = llm_model_input['prosst']['llm_attention_mask']
                     self.wt_input_ids = llm_model_input['prosst']['wt_input_ids']
                     self.structure_input_ids = llm_model_input['prosst']['structure_input_ids']
+                    self.llm_tokenizer = llm_model_input['prosst']['llm_tokenizer']
                 else:
                     raise RuntimeError("LLM input model dictionary not supported. Currently supported "
                                       "models are 'esm1v' or 'prosst'")
@@ -661,7 +663,7 @@ def hybrid_prediction(
                     verbose=verbose,
                     device=self.device).detach().cpu().numpy()
             elif self.llm_key == 'esm1v':
-                x_llm_b = torch.from_numpy(get_batches(x_llm, batch_size=1, dtype=int))
+                #x_llm_b = torch.from_numpy(get_batches(x_llm, batch_size=1, dtype=int))
                 y_llm = self.llm_inference_function(
                     xs=x_llm, 
                     wt_input_ids=self.wt_input_ids,
@@ -1062,11 +1064,11 @@ def performance_ls_ts(
         if llm is not None:
             if llm.lower().startswith('esm'):
                 llm_dict = esm_setup(train_sequences)
-                x_llm_test = llm_tokenizer(llm_dict, test_sequences)
+                x_llm_test = tokenize_sequences(test_sequences, llm_dict['esm1v']['llm_tokenizer'])
             elif llm.lower() == 'prosst':
                 llm_dict = prosst_setup(
                     wt_seq, pdb_file, sequences=train_sequences)
-                x_llm_test = llm_tokenizer(llm_dict, test_sequences)
+                x_llm_test = tokenize_sequences(test_sequences, llm_dict['prosst']['llm_tokenizer'])
         else:
             llm_dict = None
             x_llm_test = None
@@ -1111,8 +1113,10 @@ def performance_ls_ts(
                 substitution_sep, threads, False
             )
             if model.llm_model_input is not None:
-                logger.info(f"Found hybrid model with LLM {list(model.llm_model_input.keys())[0]}...")
-                x_llm_test = llm_tokenizer(model.llm_model_input, test_sequences)
+                llm_ = list(model.llm_model_input.keys())[0]
+                tokenizer = model.llm_model_input[llm_]['llm_tokenizer']
+                logger.info(f"Found hybrid model with LLM {llm_}...")
+                x_llm_test = tokenize_sequences(test_sequences, tokenizer)
                 y_test_pred = model.hybrid_prediction(x_test, x_llm_test)
             else:
                 y_test_pred = model.hybrid_prediction(x_test)
@@ -1145,11 +1149,23 @@ def performance_ls_ts(
         else:
             model_type = 'LLM'
             if llm == 'esm':
+                llm_dict = esm_setup(test_sequences[0], test_sequences)  # TODO: Improve wt_seq input workaround
                 logger.info("Zero-shot LLM inference on test set using ESM1v...")
-                y_test_pred = inference(test_sequences, llm)
+                y_test_pred = plm_inference(
+                    xs = llm_dict['esm1v']['x_llm'],
+                    wt_input_ids=llm_dict['esm1v']['wt_input_ids'],
+                    model=llm_dict['esm1v']['llm_base_model']
+                )
             elif llm == 'prosst':
+                llm_dict = prosst_setup(test_sequences[0], test_sequences)  # TODO: Improve wt_seq input workaround
                 logger.info("Zero-shot LLM inference on test set using ProSST...")
-                y_test_pred = inference(test_sequences, llm, pdb_file=pdb_file, wt_seq=wt_seq)
+                y_test_pred = plm_inference(
+                    xs = llm_dict['prosst']['x_llm'],
+                    wt_input_ids=llm_dict['prosst']['wt_input_ids'],
+                    model=llm_dict['prosst']['llm_base_model'],
+                    wt_structure_input_ids=llm_dict['prosst']['wt_structure_input_ids']
+                    
+                )
             else:
                 raise RuntimeError("Unknown --llm flag option.")
     else:
@@ -1264,11 +1280,13 @@ def predict_ps(
                             variants, sequences, None, params_file,
                             threads=threads, verbose=False, substitution_sep=separator
                         )
-                        if model.llm_key is None:
+                        if model.llm_key is None:  # TODO: Check llm_key
                             ys_pred = model.hybrid_prediction(x_test)
                         else:
                             sequences = [str(seq) for seq in test_sequences]
-                            x_llm_test = llm_tokenizer(model.llm_model_input, sequences)
+                            llm_ = list(model.llm_model_input.keys())[0]
+                            tokenizer = model.llm_model_input[llm_]['llm_tokenizer']
+                            x_llm_test = tokenize_sequences(sequences, tokenizer)
                             ys_pred = model.hybrid_prediction(np.asarray(x_test), np.asarray(x_llm_test))
                     for k, y in enumerate(ys_pred):
                         all_y_v_pred.append((ys_pred[k], variants[k]))
@@ -1294,11 +1312,11 @@ def predict_ps(
             if llm == 'esm':
                 model_type = 'LLM_ESM1v'
                 logger.info("Zero-shot LLM inference on test set using ESM1v...")
-                ys_pred = inference(sequences, llm)
+                ys_pred = plm_inference(sequences, llm)  # TODO
             elif llm == 'prosst':
                 model_type = 'LLM_ProSST'
                 logger.info("Zero-shot LLM inference on test set using ProSST...")
-                ys_pred = inference(sequences, llm, pdb_file=pdb_file, wt_seq=wt_seq)
+                ys_pred = plm_inference(sequences, llm, pdb_file=pdb_file, wt_seq=wt_seq)  # TODO
         else:
             if not model_type.startswith('Hybrid'):  # statistical DCA model
                 xs, variants, _, _, x_wt, *_ = plmc_or_gremlin_encoding(
@@ -1315,7 +1333,9 @@ def predict_ps(
                     ys_pred = model.hybrid_prediction(xs)
                 else:
                     sequences = [str(seq) for seq in sequences]
-                    xs_llm = llm_tokenizer(model.llm_model_input, sequences)
+                    llm_ = list(model.llm_model_input.keys())[0]
+                    tokenizer = model.llm_model_input[llm_]['llm_tokenizer']
+                    xs_llm = tokenize_sequences(sequences, tokenizer)
                     ys_pred = model.hybrid_prediction(np.asarray(xs), np.asarray(xs_llm))
             assert len(xs) == len(variants) == len(ys_pred)
         y_v_pred = zip(ys_pred, variants)
@@ -1375,7 +1395,7 @@ def predict_directed_evolution(
             if model.llm_model_input is None:
                 y_pred = model.hybrid_prediction(xs)
             else:
-                x_llm = llm_tokenizer(model.llm_model_input, 
+                x_llm = tokenize_sequences(model.llm_model_input, 
                                      variant_sequence, verbose=False)
 
                 y_pred = model.hybrid_prediction(