Skip to content

Commit 98c40b7

Browse files
committed
Update low N AVGFP test script
1 parent 9c32f74 commit 98c40b7

File tree

4 files changed

+19
-22
lines changed

4 files changed

+19
-22
lines changed

pypef/hybrid/hybrid_model.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@
4848
from pypef.utils.plot import plot_y_true_vs_y_pred
4949
import pypef.dca.gremlin_inference
5050
from pypef.dca.gremlin_inference import GREMLIN, get_delta_e_statistical_model
51-
from pypef.llm.esm_lora_tune import esm_tokenize_sequences, get_batches, esm_setup, get_esm_models
51+
from pypef.llm.esm_lora_tune import esm_tokenize_sequences, get_batches, esm_setup, get_esm_models, get_device
5252
from pypef.llm.prosst_lora_tune import get_prosst_models, prosst_setup, prosst_tokenize_sequences
5353

5454
# sklearn/base.py:474: FutureWarning: `BaseEstimator._validate_data` is deprecated in 1.6 and
@@ -131,7 +131,10 @@ def __init__(
131131
self.x_train_dca = x_train_dca
132132
self.y_train = y_train
133133
self.x_wild_type = x_wt
134+
if device is None:
135+
device = get_device()
134136
self.device = device
137+
print(f'Using device {device.upper()} for hybrid modeling...')
135138
self.seed = seed
136139
if batch_size is None:
137140
batch_size = 5
@@ -555,7 +558,6 @@ def train_and_optimize(self) -> tuple:
555558
)
556559
return self.beta1, self.beta2, self.ridge_opt
557560

558-
559561
def hybrid_prediction(
560562
self,
561563
x_dca: np.ndarray,

scripts/Encoding_low_N/api_encoding_train_test.ipynb

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -231,7 +231,7 @@
231231
},
232232
{
233233
"cell_type": "code",
234-
"execution_count": 11,
234+
"execution_count": null,
235235
"id": "e9f8f301",
236236
"metadata": {},
237237
"outputs": [
@@ -275,9 +275,8 @@
275275
" print(f'Split {i + 1}/{len(train_val_splits_indices)}:\\nSpearmans rho (ML) = {performances[4]:.3f}')\n",
276276
" # B. Hybrid modeling\n",
277277
" # -------------------------------------------------------------------------------\n",
278-
" hybrid_model = DCAHybridModel(x_train=x_train_val, y_train=y_train_val, x_wt=x_wt)\n",
279-
" beta_1, beta_2, regressor = hybrid_model.settings(x_train=x_train_val, y_train=y_train_val)\n",
280-
" y_test_pred = hybrid_model.hybrid_prediction(x=x_test, reg=regressor, beta_1=beta_1, beta_2=beta_2)\n",
278+
" hybrid_model = DCALLMHybridModel(x_train_dca=x_train_val, y_train=y_train_val, x_wt=x_wt)\n",
279+
" y_test_pred = hybrid_model.hybrid_prediction(x_dca=x_test)\n",
281280
" ten_split_performance_hybrid.append(spearmanr(y_test, y_test_pred)[0])\n",
282281
" print(f'Spearmans rho (Hybrid) = {spearmanr(y_test, y_test_pred)[0]:.3f}')\n",
283282
"print('-'*80 + f'\\n{n_splits}-fold mean Spearmans rho (ML) = {np.mean(ten_split_performance_ml):.3f} '\n",
@@ -472,7 +471,7 @@
472471
},
473472
{
474473
"cell_type": "code",
475-
"execution_count": 15,
474+
"execution_count": null,
476475
"id": "d9e26b1d",
477476
"metadata": {},
478477
"outputs": [
@@ -511,9 +510,8 @@
511510
" performances_dca_ml.append(get_regressor_performances(\n",
512511
" x_dca_train, x_dca_test, y_train, y_test, regressor='ridge')[4]) # [4] defines spearmanr correlation\n",
513512
"\n",
514-
" hybrid_model = DCAHybridModel(x_train=x_dca_train, y_train=y_train, x_wt=x_wt)\n",
515-
" beta_1, beta_2, hybrid_regressor = hybrid_model.settings(x_train=x_dca_train, y_train=y_train)\n",
516-
" y_hybrid_pred = hybrid_model.hybrid_prediction(x=x_dca_test, reg=hybrid_regressor, beta_1=beta_1, beta_2=beta_2)\n",
513+
" hybrid_model = DCALLMHybridModel(x_train_dca=x_dca_train, y_train=y_train, x_wt=x_wt)\n",
514+
" y_hybrid_pred = hybrid_model.hybrid_prediction(x_dca=x_dca_test)\n",
517515
" performances_hybrid.append(spearmanr(y_test, y_hybrid_pred)[0])\n",
518516
"\n",
519517
" x_aaidx_train, x_aaidx_test, y_train, y_test = train_test_split(\n",

scripts/Encoding_low_N/api_encoding_train_test.py

Lines changed: 8 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@
4444
# the test performance with the other encoding techniques to be tested, we start with
4545
# DCA and split the variant fitness data so that sizes of the data sets are same for
4646
# all encoding techniques tested.
47-
print(f'\nRunning script... which takes ~ 2 h (GREMLIN) - 4 h in total (PLMC) when using all 566 AAindex-indices '
47+
print(f'\nRunning script... which takes ~ 5 h (GREMLIN) - 5+ h in total (PLMC) when using all 566 AAindex-indices '
4848
f'for encoding and model testing (only {n_aaindices_to_test} AAindinces will be tested)...'
4949
f'\n\n(1/4) Testing DCA-based sequence encoding...\n' + "=" * 50)
5050
if not use_gremlin: # PLMC params file-based encoding
@@ -122,7 +122,8 @@
122122
x_dca = dca_encoder.collect_encoded_sequences(variants)
123123
# removing not DCA-encodable positions (and also fitnesses, variants, and sequences)
124124
# from the 2000 initial variants, 1427 remain
125-
x_dca, fitnesses, variants, sequences = remove_nan_encoded_positions(x_dca, fitnesses, variants, sequences)
125+
x_dca, fitnesses, variants, sequences = remove_nan_encoded_positions(
126+
x_dca, fitnesses, variants, sequences)
126127
x_wt = dca_encoder.x_wt
127128

128129
# Statistical model performance
@@ -154,9 +155,8 @@
154155
print(f'Split {i + 1}/{len(train_val_splits_indices)}:\nSpearmans rho (ML) = {performances[4]:.3f}')
155156
# B. Hybrid modeling
156157
# -------------------------------------------------------------------------------
157-
hybrid_model = DCALLMHybridModel(x_train=x_train_val, y_train=y_train_val, x_wt=x_wt)
158-
beta_1, beta_2, regressor = hybrid_model.settings(x_train=x_train_val, y_train=y_train_val)
159-
y_test_pred = hybrid_model.hybrid_prediction(x=x_test, reg=regressor, beta_1=beta_1, beta_2=beta_2)
158+
hybrid_model = DCALLMHybridModel(x_train_dca=x_train_val, y_train=y_train_val, x_wt=x_wt)
159+
y_test_pred = hybrid_model.hybrid_prediction(x_dca=x_test)
160160
ten_split_performance_hybrid.append(spearmanr(y_test, y_test_pred)[0])
161161
print(f'Spearmans rho (Hybrid) = {spearmanr(y_test, y_test_pred)[0]:.3f}')
162162
print('-'*80 + f'\n{n_splits}-fold mean Spearmans rho (ML)= {np.mean(ten_split_performance_ml):.3f} '
@@ -227,9 +227,7 @@
227227
# 4th example: Low-N and plotting using all encoding techniques and all data
228228
# -------------------------------------------------------------------------------
229229
print('Lastly, encoding all variants and performing "low-N" protein engineering task.\n'
230-
'This could require some time... < 1 (GREMLIN DCA encoding) to ~ 2 hours (PLMC '
231-
'single core DCA encoding) left...')
232-
230+
'This could require some time...')
233231
variants = variant_fitness_data.iloc[:, 0]
234232
fitnesses = variant_fitness_data.iloc[:, 1].tolist()
235233
variants_split = []
@@ -274,9 +272,8 @@
274272
performances_dca_ml.append(get_regressor_performances(
275273
x_dca_train, x_dca_test, y_train, y_test, regressor='ridge')[4]) # [4] defines spearmanr correlation
276274

277-
hybrid_model = DCALLMHybridModel(x_train=x_dca_train, y_train=y_train, x_wt=x_wt)
278-
beta_1, beta_2, hybrid_regressor = hybrid_model.settings(x_train=x_dca_train, y_train=y_train)
279-
y_hybrid_pred = hybrid_model.hybrid_prediction(x=x_dca_test, reg=hybrid_regressor, beta_1=beta_1, beta_2=beta_2)
275+
hybrid_model = DCALLMHybridModel(x_train_dca=x_dca_train, y_train=y_train, x_wt=x_wt)
276+
y_hybrid_pred = hybrid_model.hybrid_prediction(x_dca=x_dca_test)
280277
performances_hybrid.append(spearmanr(y_test, y_hybrid_pred)[0])
281278

282279
x_aaidx_train, x_aaidx_test, y_train, y_test = train_test_split(

scripts/ProteinGym_runs/run_performance_tests_proteingym_hybrid_dca_llm.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -437,7 +437,7 @@ def plot_csv_data(csv, plot_name):
437437
plt.grid(zorder=-1)
438438
plt.xticks(
439439
range(len(tested_dsets)),
440-
['(' + str(n) + name + ') ' for (n, name) in zip(tested_dsets, df['Dataset'].to_list())],
440+
['(' + str(n) + ') ' + name for (n, name) in zip(tested_dsets, df['Dataset'].to_list())],
441441
rotation=45, ha='right'
442442
)
443443
plt.margins(0.01)

0 commit comments

Comments
 (0)