Update low-N AVGFP test script

niklases · niklases · commit 98c40b76dec2 · 2025-05-04T08:16:22.000+02:00
diff --git a/pypef/hybrid/hybrid_model.py b/pypef/hybrid/hybrid_model.py
@@ -48,7 +48,7 @@
 from pypef.utils.plot import plot_y_true_vs_y_pred
 import pypef.dca.gremlin_inference
 from pypef.dca.gremlin_inference import GREMLIN, get_delta_e_statistical_model
-from pypef.llm.esm_lora_tune import esm_tokenize_sequences, get_batches, esm_setup, get_esm_models
+from pypef.llm.esm_lora_tune import esm_tokenize_sequences, get_batches, esm_setup, get_esm_models, get_device
 from pypef.llm.prosst_lora_tune import get_prosst_models, prosst_setup, prosst_tokenize_sequences
 
 # sklearn/base.py:474: FutureWarning: `BaseEstimator._validate_data` is deprecated in 1.6 and 
@@ -131,7 +131,10 @@ def __init__(
         self.x_train_dca = x_train_dca
         self.y_train = y_train
         self.x_wild_type = x_wt
+        if device is None:
+            device = get_device()
         self.device = device
+        print(f'Using device {device.upper()} for hybrid modeling...')
         self.seed = seed
         if batch_size is None:
             batch_size = 5
@@ -555,7 +558,6 @@ def train_and_optimize(self) -> tuple:
             )
             return self.beta1, self.beta2, self.ridge_opt
 
-
     def hybrid_prediction(
             self,
             x_dca: np.ndarray,
diff --git a/scripts/Encoding_low_N/api_encoding_train_test.ipynb b/scripts/Encoding_low_N/api_encoding_train_test.ipynb
@@ -231,7 +231,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "e9f8f301",
    "metadata": {},
    "outputs": [
@@ -275,9 +275,8 @@
     "    print(f'Split {i + 1}/{len(train_val_splits_indices)}:\\nSpearmans rho (ML) = {performances[4]:.3f}')\n",
     "    # B. Hybrid modeling\n",
     "    # -------------------------------------------------------------------------------\n",
-    "    hybrid_model = DCAHybridModel(x_train=x_train_val, y_train=y_train_val, x_wt=x_wt)\n",
-    "    beta_1, beta_2, regressor = hybrid_model.settings(x_train=x_train_val, y_train=y_train_val)\n",
-    "    y_test_pred = hybrid_model.hybrid_prediction(x=x_test, reg=regressor, beta_1=beta_1, beta_2=beta_2)\n",
+    "    hybrid_model = DCALLMHybridModel(x_train_dca=x_train_val, y_train=y_train_val, x_wt=x_wt)\n",
+    "    y_test_pred = hybrid_model.hybrid_prediction(x_dca=x_test)\n",
     "    ten_split_performance_hybrid.append(spearmanr(y_test, y_test_pred)[0])\n",
     "    print(f'Spearmans rho (Hybrid) = {spearmanr(y_test, y_test_pred)[0]:.3f}')\n",
     "print('-'*80 + f'\\n{n_splits}-fold mean Spearmans rho (ML) = {np.mean(ten_split_performance_ml):.3f} '\n",
@@ -472,7 +471,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "id": "d9e26b1d",
    "metadata": {},
    "outputs": [
@@ -511,9 +510,8 @@
     "        performances_dca_ml.append(get_regressor_performances(\n",
     "            x_dca_train, x_dca_test, y_train, y_test, regressor='ridge')[4])  # [4] defines spearmanr correlation\n",
     "\n",
-    "        hybrid_model = DCAHybridModel(x_train=x_dca_train, y_train=y_train, x_wt=x_wt)\n",
-    "        beta_1, beta_2, hybrid_regressor = hybrid_model.settings(x_train=x_dca_train, y_train=y_train)\n",
-    "        y_hybrid_pred = hybrid_model.hybrid_prediction(x=x_dca_test, reg=hybrid_regressor, beta_1=beta_1, beta_2=beta_2)\n",
+    "        hybrid_model = DCALLMHybridModel(x_train_dca=x_dca_train, y_train=y_train, x_wt=x_wt)\n",
+    "        y_hybrid_pred = hybrid_model.hybrid_prediction(x_dca=x_dca_test)\n",
     "        performances_hybrid.append(spearmanr(y_test, y_hybrid_pred)[0])\n",
     "\n",
     "        x_aaidx_train, x_aaidx_test, y_train, y_test = train_test_split(\n",
diff --git a/scripts/Encoding_low_N/api_encoding_train_test.py b/scripts/Encoding_low_N/api_encoding_train_test.py
@@ -44,7 +44,7 @@
 # the test performance with the other encoding techniques to be tested, we start with
 # DCA and split the variant fitness data so that sizes of the data sets are same for
 # all encoding techniques tested.
-print(f'\nRunning script... which takes ~ 2 h (GREMLIN) - 4 h in total (PLMC) when using all 566 AAindex-indices '
+print(f'\nRunning script... which takes ~ 5 h (GREMLIN) - 5+ h in total (PLMC) when using all 566 AAindex-indices '
       f'for encoding and model testing (only {n_aaindices_to_test} AAindinces will be tested)...'
       f'\n\n(1/4) Testing DCA-based sequence encoding...\n' + "=" * 50)
 if not use_gremlin:  # PLMC params file-based encoding
@@ -122,7 +122,8 @@
     x_dca = dca_encoder.collect_encoded_sequences(variants)
     # removing not DCA-encodable positions (and also fitnesses, variants, and sequences)
     # from the 2000 initial variants, 1427 remain
-    x_dca, fitnesses, variants, sequences = remove_nan_encoded_positions(x_dca, fitnesses, variants, sequences)
+    x_dca, fitnesses, variants, sequences = remove_nan_encoded_positions(
+        x_dca, fitnesses, variants, sequences)
     x_wt = dca_encoder.x_wt
 
 # Statistical model performance
@@ -154,9 +155,8 @@
     print(f'Split {i + 1}/{len(train_val_splits_indices)}:\nSpearmans rho (ML) = {performances[4]:.3f}')
     # B. Hybrid modeling
     # -------------------------------------------------------------------------------
-    hybrid_model = DCALLMHybridModel(x_train=x_train_val, y_train=y_train_val, x_wt=x_wt)
-    beta_1, beta_2, regressor = hybrid_model.settings(x_train=x_train_val, y_train=y_train_val)
-    y_test_pred = hybrid_model.hybrid_prediction(x=x_test, reg=regressor, beta_1=beta_1, beta_2=beta_2)
+    hybrid_model = DCALLMHybridModel(x_train_dca=x_train_val, y_train=y_train_val, x_wt=x_wt)
+    y_test_pred = hybrid_model.hybrid_prediction(x_dca=x_test)
     ten_split_performance_hybrid.append(spearmanr(y_test, y_test_pred)[0])
     print(f'Spearmans rho (Hybrid) = {spearmanr(y_test, y_test_pred)[0]:.3f}')
 print('-'*80 + f'\n{n_splits}-fold mean Spearmans rho (ML)= {np.mean(ten_split_performance_ml):.3f} '
@@ -227,9 +227,7 @@
 # 4th example: Low-N and plotting using all encoding techniques and all data
 # -------------------------------------------------------------------------------
 print('Lastly, encoding all variants and performing "low-N" protein engineering task.\n'
-      'This could require some time... < 1 (GREMLIN DCA encoding) to ~ 2 hours (PLMC '
-      'single core DCA encoding) left...')
-
+      'This could require some time...')
 variants = variant_fitness_data.iloc[:, 0]
 fitnesses = variant_fitness_data.iloc[:, 1].tolist()
 variants_split = []
@@ -274,9 +272,8 @@
         performances_dca_ml.append(get_regressor_performances(
             x_dca_train, x_dca_test, y_train, y_test, regressor='ridge')[4])  # [4] defines spearmanr correlation
 
-        hybrid_model = DCALLMHybridModel(x_train=x_dca_train, y_train=y_train, x_wt=x_wt)
-        beta_1, beta_2, hybrid_regressor = hybrid_model.settings(x_train=x_dca_train, y_train=y_train)
-        y_hybrid_pred = hybrid_model.hybrid_prediction(x=x_dca_test, reg=hybrid_regressor, beta_1=beta_1, beta_2=beta_2)
+        hybrid_model = DCALLMHybridModel(x_train_dca=x_dca_train, y_train=y_train, x_wt=x_wt)
+        y_hybrid_pred = hybrid_model.hybrid_prediction(x_dca=x_dca_test)
         performances_hybrid.append(spearmanr(y_test, y_hybrid_pred)[0])
 
         x_aaidx_train, x_aaidx_test, y_train, y_test = train_test_split(
diff --git a/scripts/ProteinGym_runs/run_performance_tests_proteingym_hybrid_dca_llm.py b/scripts/ProteinGym_runs/run_performance_tests_proteingym_hybrid_dca_llm.py
@@ -437,7 +437,7 @@ def plot_csv_data(csv, plot_name):
     plt.grid(zorder=-1)
     plt.xticks(
         range(len(tested_dsets)), 
-        ['(' + str(n) + name  + ') ' for (n, name) in zip(tested_dsets, df['Dataset'].to_list())], 
+        ['(' + str(n) + ') ' + name  for (n, name) in zip(tested_dsets, df['Dataset'].to_list())], 
         rotation=45, ha='right'
     )
     plt.margins(0.01)

Original file line number	Diff line number	Diff line change
`@@ -437,7 +437,7 @@ def plot_csv_data(csv, plot_name):`
`437`	`437`	`plt.grid(zorder=-1)`
`438`	`438`	`plt.xticks(`
`439`	`439`	`range(len(tested_dsets)),`
`440`		`- ['(' + str(n) + name + ') ' for (n, name) in zip(tested_dsets, df['Dataset'].to_list())],`
	`440`	`+ ['(' + str(n) + ') ' + name for (n, name) in zip(tested_dsets, df['Dataset'].to_list())],`
`441`	`441`	`rotation=45, ha='right'`
`442`	`442`	`)`
`443`	`443`	`plt.margins(0.01)`