Skip to content

Commit 7d6465c

Browse files
committed
Update gremlin (dev): implement msa_start & msa_end (II)
1 parent 03b88f8 commit 7d6465c

File tree

5 files changed

+91
-72
lines changed

5 files changed

+91
-72
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ jobs:
5050
sleep 3
5151
pip install .[gui]
5252
echo $(which pypef)
53-
python -m pytest ./tests/ -v -m "not main_script_specific" --capture=tee-sys --log-cli-level=INFO
53+
python -m pytest ./tests/ -v -m "not main_script_specific" --log-cli-level=INFO
5454
5555
windows:
5656
name: windows
@@ -88,5 +88,5 @@ jobs:
8888
$env:PYTHONPATH = ""
8989
pip install .[gui]
9090
echo (Get-Command pypef).Source
91-
python -m pytest .\tests -v -m "not main_script_specific" --capture=tee-sys --log-cli-level=INFO
91+
python -m pytest .\tests -v -m "not main_script_specific" --log-cli-level=INFO
9292

pypef/dca/gremlin_inference.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ def __init__(
145145
self.optimize = optimize
146146
if self.optimize:
147147
self.run_optimization()
148+
self.wt_score = self.get_wt_score()
148149
self.x_wt = self.collect_encoded_sequences(np.atleast_1d(self.wt_seq))
149150

150151
def get_sequences_from_msa(self, msa_file: str):
@@ -437,17 +438,17 @@ def get_scores(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.
437438
v_idx = self.v_idx
438439
seqs_int = self.seq2int(seqs)
439440
wt_seq_len = len(self.wt_seq)
440-
#if np.shape(seqs_int)[1] != wt_seq_len:
441-
# raise RuntimeError(
442-
# f"Input sequence shape (length: {np.shape(seqs_int)[1]}) does not match GREMLIN "
443-
# f"MSA shape (common sequence length: {wt_seq_len}) inferred from the MSA."
444-
# )
441+
if np.shape(seqs_int)[1] != wt_seq_len:
442+
raise RuntimeError(
443+
f"Input sequence shape (length: {np.shape(seqs_int)[1]}) does not match GREMLIN "
444+
f"MSA shape (common sequence length: {wt_seq_len}) inferred from the MSA."
445+
)
445446
# Check the number of mutations relative to the first MSA/WT sequence and give a warning if too divergent from the MSA sequence
446447
for i, seq in enumerate(seqs):
447448
n_mismatches, mismatches = get_mismatches(self.wt_seq, seq)
448449
if n_mismatches / wt_seq_len > 0.05:
449450
logger.warning(
450-
f"Sequence {mismatches} contains more than 5% sequence mismatches to the "
451+
f"Sequence {i + 1}: {mismatches} contains more than 5% sequence mismatches to the "
451452
f"first MSA/\"WT\" sequence. Effect predictions will likely be incorrect!"
452453
)
453454
try:
@@ -496,8 +497,8 @@ def get_scores(self, seqs, v=None, w=None, v_idx=None, encode=False, h_wt_seq=0.
496497
def get_wt_score(self, wt_seq=None, encode=False):
497498
if wt_seq is None:
498499
wt_seq = self.wt_seq
499-
wt_seq = np.array(wt_seq, dtype=str)
500-
return self.get_scores(wt_seq, encode=encode)
500+
wt_seq = np.atleast_1d(np.array(wt_seq, dtype=str))
501+
return self.get_scores(wt_seq, encode=encode)[0]
501502

502503
def collect_encoded_sequences(self, seqs, v=None, w=None, v_idx=None):
503504
"""

scripts/ProteinGym_runs/protgym_hybrid_perf_test_crossval.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -144,12 +144,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
144144

145145
print('GREMLIN-DCA: optimization...')
146146
gremlin = GREMLIN(alignment=msa_path, opt_iter=100, optimize=True)
147-
sequences_batched = get_batches(sequences, batch_size=1000,
148-
dtype=str, keep_remaining=True, verbose=True)
149-
x_dca = [] # required later on also
150-
for seq_b in tqdm(sequences_batched, desc="Getting GREMLIN sequence encodings", disable=True):
151-
for x in gremlin.collect_encoded_sequences(seq_b):
152-
x_dca.append(x)
147+
x_dca = gremlin.collect_encoded_sequences(sequences)
153148
x_wt = gremlin.x_wt
154149
y_pred_dca = get_delta_e_statistical_model(x_dca, x_wt)
155150
print(f'DCA (unsupervised performance): {spearmanr(fitnesses, y_pred_dca)[0]:.3f}')
@@ -453,7 +448,7 @@ def plot_csv_data(csv):
453448
if not JUST_PLOT_RESULTS:
454449
compute_performances(
455450
mut_data=combined_mut_data,
456-
start_i=start_i,
451+
start_i=5,#start_i,
457452
already_tested_is=already_tested_is
458453
)
459454

scripts/ProteinGym_runs/protgym_hybrid_perf_test_low_n.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@
2222
import sys # Use local directory PyPEF files
2323
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
2424
from pypef.dca.gremlin_inference import GREMLIN
25+
from pypef.llm.utils import get_batches
2526
from pypef.llm.esm_lora_tune import (
2627
get_esm_models, esm_tokenize_sequences,
27-
get_batches, esm_train, esm_infer, corr_loss
28+
esm_train, esm_infer, corr_loss
2829
)
2930
from pypef.llm.prosst_lora_tune import (
3031
get_logits_from_full_seqs, get_prosst_models, get_structure_quantizied,
@@ -117,7 +118,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
117118
f'{max_muts},Sequence too long ({len(wt_seq)} > {MAX_WT_SEQUENCE_LENGTH})\n'
118119
)
119120
continue
120-
ratio_input_vars_at_gaps = count_gap_variants / len(variants)
121+
_ratio_input_vars_at_gaps = count_gap_variants / len(variants)
121122
pdb_seq = str(list(SeqIO.parse(pdb, "pdb-atom"))[0].seq)
122123
try:
123124
assert wt_seq == pdb_seq # pdb_seq.startswith(wt_seq)
@@ -135,12 +136,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
135136

136137
print('GREMLIN-DCA: optimization...')
137138
gremlin = GREMLIN(alignment=msa_path, opt_iter=100, optimize=True)
138-
sequences_batched = get_batches(sequences, batch_size=1000,
139-
dtype=str, keep_remaining=True, verbose=True)
140-
x_dca = []
141-
for seq_b in tqdm(sequences_batched, desc="Getting GREMLIN sequence encodings"):
142-
for x in gremlin.collect_encoded_sequences(seq_b):
143-
x_dca.append(x)
139+
x_dca = gremlin.collect_encoded_sequences(sequences)
144140
x_wt = gremlin.x_wt
145141
y_pred_dca = get_delta_e_statistical_model(x_dca, x_wt)
146142
print(f'DCA (unsupervised performance): {spearmanr(fitnesses, y_pred_dca)[0]:.3f}')

tests/test_api_functions.py

Lines changed: 74 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
from pypef.hybrid.hybrid_model import DCALLMHybridModel
2121

2222

23+
torch.manual_seed(42)
24+
np.random.seed(42)
25+
2326
msa_file_avgfp = os.path.abspath(os.path.join(
2427
__file__, '../../datasets/AVGFP/uref100_avgfp_jhmmer_119.a2m'
2528
))
@@ -44,31 +47,44 @@
4447
os.path.join(__file__, '../../datasets/ANEH/TS_B.fasl'
4548
))
4649

47-
train_seqs, _train_vars, train_ys = get_sequences_from_file(ls_b)
48-
test_seqs, _test_vars, test_ys = get_sequences_from_file(ts_b)
49-
50-
torch.manual_seed(42)
51-
np.random.seed(42)
50+
train_seqs_aneh, _train_vars_aneh, train_ys_aneh = get_sequences_from_file(ls_b)
51+
test_seqs_aneh, _test_vars_aneh, test_ys_aneh = get_sequences_from_file(ts_b)
5252

5353

54-
def test_gremlin():
54+
def test_gremlin_aneh():
5555
g = GREMLIN(
56-
alignment=msa_file_avgfp,
56+
alignment=msa_file_aneh,
5757
char_alphabet="ARNDCQEGHILKMFPSTWYV-",
5858
wt_seq=None,
5959
optimize=True,
6060
gap_cutoff=0.5,
6161
eff_cutoff=0.8,
6262
opt_iter=100
6363
)
64-
wt_score = g.get_wt_score() # only 1 decimal place for Torch result
65-
np.testing.assert_almost_equal(wt_score, 952.1, decimal=1)
66-
y_pred = g.get_scores(np.append(train_seqs, test_seqs))
64+
wt_score = g.get_wt_score()
65+
np.testing.assert_almost_equal(wt_score, 1743.2087199198131, decimal=7)
66+
assert wt_score == g.wt_score == np.sum(g.x_wt)
67+
y_pred = g.get_scores(np.append(train_seqs_aneh, test_seqs_aneh))
6768
np.testing.assert_almost_equal(
68-
spearmanr(np.append(train_ys, test_ys), y_pred)[0],
69-
0.4516502675400598,
70-
decimal=3
69+
spearmanr(np.append(train_ys_aneh, test_ys_aneh), y_pred)[0],
70+
-0.5528510930046211,
71+
decimal=7
72+
)
73+
74+
75+
def test_gremlin_avgfp():
76+
g = GREMLIN(
77+
alignment=msa_file_avgfp,
78+
char_alphabet="ARNDCQEGHILKMFPSTWYV-",
79+
wt_seq=None,
80+
optimize=True,
81+
gap_cutoff=0.5,
82+
eff_cutoff=0.8,
83+
opt_iter=100
7184
)
85+
wt_score = g.get_wt_score()
86+
np.testing.assert_almost_equal(wt_score, 952.1102220697624, decimal=7)
87+
assert wt_score == g.wt_score == np.sum(g.x_wt)
7288

7389

7490
def test_hybrid_model_dca_llm():
@@ -81,43 +97,43 @@ def test_hybrid_model_dca_llm():
8197
eff_cutoff=0.8,
8298
opt_iter=100
8399
)
84-
x_dca_train = g.get_scores(train_seqs, encode=True)
100+
x_dca_train = g.get_scores(train_seqs_aneh, encode=True)
85101
np.testing.assert_almost_equal(
86-
spearmanr(train_ys, np.sum(x_dca_train, axis=1))[0],
102+
spearmanr(train_ys_aneh, np.sum(x_dca_train, axis=1))[0],
87103
-0.5556053466180598,
88-
decimal=6
104+
decimal=7
89105
)
90-
assert len(train_seqs[0]) == len(g.wt_seq)
106+
assert len(train_seqs_aneh[0]) == len(g.wt_seq)
91107

92-
y_pred_esm = inference(train_seqs, 'esm')
108+
y_pred_esm = inference(train_seqs_aneh, 'esm')
93109
np.testing.assert_almost_equal(
94-
spearmanr(train_ys, y_pred_esm)[0],
110+
spearmanr(train_ys_aneh, y_pred_esm)[0],
95111
-0.21073416060442696,
96-
decimal=6
112+
decimal=7
97113
)
98114
aneh_wt_seq = get_wt_sequence(wt_seq_file_aneh)
99115
y_pred_prosst = inference(
100-
train_seqs, 'prosst',
116+
train_seqs_aneh, 'prosst',
101117
pdb_file=pdb_file_aneh, wt_seq=aneh_wt_seq
102118
)
103119
np.testing.assert_almost_equal(
104-
spearmanr(train_ys, y_pred_prosst)[0],
120+
spearmanr(train_ys_aneh, y_pred_prosst)[0],
105121
-0.7425657069861902,
106-
decimal=6
122+
decimal=7
107123
)
108124

109-
x_dca_test = g.get_scores(test_seqs, encode=True)
125+
x_dca_test = g.get_scores(test_seqs_aneh, encode=True)
110126
for i, setup in enumerate([esm_setup, prosst_setup]):
111127
print(['~~~ ESM ~~~', '~~~ ProSST ~~~'][i])
112128
if setup == esm_setup:
113-
llm_dict = setup(sequences=train_seqs)
129+
llm_dict = setup(sequences=train_seqs_aneh)
114130
else: # elif setup == prosst_setup:
115131
llm_dict = setup(
116-
aneh_wt_seq, pdb_file_aneh, sequences=train_seqs)
117-
x_llm_test = llm_embedder(llm_dict, test_seqs)
132+
aneh_wt_seq, pdb_file_aneh, sequences=train_seqs_aneh)
133+
x_llm_test = llm_embedder(llm_dict, test_seqs_aneh)
118134
hm = DCALLMHybridModel(
119135
x_train_dca=np.array(x_dca_train),
120-
y_train=train_ys,
136+
y_train=train_ys_aneh,
121137
llm_model_input=llm_dict,
122138
x_wt=g.x_wt,
123139
seed=42
@@ -129,56 +145,66 @@ def test_hybrid_model_dca_llm():
129145
print('hm.y_dca_ridge_ttest:', spearmanr(hm.y_ttest, hm.y_dca_ridge_ttest), len(hm.y_ttest))
130146
print('hm.y_llm_ttest:', spearmanr(hm.y_ttest, hm.y_llm_ttest), len(hm.y_ttest))
131147
print('hm.y_llm_lora_ttest:', spearmanr(hm.y_ttest, hm.y_llm_lora_ttest), len(hm.y_ttest))
132-
print('Hybrid prediction:', spearmanr(test_ys, y_pred_test), len(test_ys))
148+
print('Hybrid prediction:', spearmanr(test_ys_aneh, y_pred_test), len(test_ys_aneh))
133149
np.testing.assert_almost_equal(
134150
spearmanr(hm.y_ttest, hm.y_dca_ttest)[0], -0.5342743713116743,
135-
decimal=5
151+
decimal=7
136152
)
137153
np.testing.assert_almost_equal(
138154
spearmanr(hm.y_ttest, hm.y_dca_ridge_ttest)[0], 0.717333573331078,
139-
decimal=5
155+
decimal=7
140156
)
141157
np.testing.assert_almost_equal(
142158
spearmanr(hm.y_ttest, hm.y_llm_ttest)[0],
143159
[-0.21761360470606333, -0.8330644449247571][i],
144-
decimal=5
160+
decimal=7
145161
)
146162
# Nondeterministic behavior (without setting seed), should be about ~0.7 to ~0.9,
147163
# but as sample size is so low the following is only checking if not NaN / >=-1.0 and <=1.0,
148164
# Torch reproducibility documentation: https://pytorch.org/docs/stable/notes/randomness.html
149165
assert -1.0 <= spearmanr(hm.y_ttest, hm.y_llm_lora_ttest)[0] <= 1.0
150-
assert -1.0 <= spearmanr(test_ys, y_pred_test)[0] <= 1.0
166+
assert -1.0 <= spearmanr(test_ys_aneh, y_pred_test)[0] <= 1.0
151167
# With seed 42 for numpy and torch for implemented LLM's:
152168
if setup == esm_setup:
153169
np.testing.assert_almost_equal(
154-
spearmanr(hm.y_ttest, hm.y_llm_lora_ttest)[0], 0.7772102863835341, decimal=5
170+
spearmanr(hm.y_ttest, hm.y_llm_lora_ttest)[0], 0.7772102863835341, decimal=7
155171
)
156172
np.testing.assert_almost_equal(
157-
spearmanr(test_ys, y_pred_test)[0], 0.8004896406836318, decimal=5
173+
spearmanr(test_ys_aneh, y_pred_test)[0], 0.8004896406836318, decimal=7
158174
)
159175
elif setup == prosst_setup:
176+
try:
177+
np.testing.assert_almost_equal(
178+
spearmanr(hm.y_ttest, hm.y_llm_lora_ttest)[0], 0.7770124558338013, decimal=7
179+
)
180+
except AssertionError as ae1:
181+
try:
182+
np.testing.assert_almost_equal( # Different values on different machines
183+
spearmanr(hm.y_ttest, hm.y_llm_lora_ttest)[0], 0.7239938685054149, decimal=7
184+
) # (TODO) has to be investigated
185+
except AssertionError as ae2:
186+
raise AssertionError(
187+
f"Neither condition passed:\nFirst comparison failed:\n{ae1}\n"
188+
f"Second comparison failed:\n{ae2}"
189+
)
160190
np.testing.assert_almost_equal(
161-
spearmanr(hm.y_ttest, hm.y_llm_lora_ttest)[0], 0.7770124558338013, decimal=5
191+
spearmanr(test_ys_aneh, y_pred_test)[0], 0.8291977762544377, decimal=7
162192
)
163-
np.testing.assert_almost_equal(
164-
spearmanr(test_ys, y_pred_test)[0], 0.8291977762544377, decimal=5
165-
)
166-
167193

168194

169195
def test_dataset_b_results():
170196
aaindex = "WOLR810101.txt"
171197
x_fft_train, _ = AAIndexEncoding(
172-
full_aaidx_txt_path(aaindex), train_seqs
198+
full_aaidx_txt_path(aaindex), train_seqs_aneh
173199
).collect_encoded_sequences()
174200
x_fft_test, _ = AAIndexEncoding(
175-
full_aaidx_txt_path(aaindex), test_seqs
201+
full_aaidx_txt_path(aaindex), test_seqs_aneh
176202
).collect_encoded_sequences()
177203
performances = get_regressor_performances(
178-
x_learn=x_fft_train,
179-
x_test=x_fft_test,
180-
y_learn=train_ys,
181-
y_test=test_ys,
204+
x_learn=x_fft_train,
205+
x_test=x_fft_test,
206+
y_learn=train_ys_aneh,
207+
y_test=test_ys_aneh,
182208
regressor='pls_loocv'
183209
)
184210
# Dataset B PLS_LOOCV results: R², RMSE, NRMSE, Pearson's r, Spearman's rho
@@ -191,7 +217,8 @@ def test_dataset_b_results():
191217

192218

193219
if __name__ == "__main__":
194-
test_gremlin()
220+
test_gremlin_aneh()
221+
test_gremlin_avgfp()
195222
test_hybrid_model_dca_llm()
196223
test_dataset_b_results()
197224

0 commit comments

Comments (0)