Skip to content

Commit de99087

Browse files
committed
Slightly reduced and clearer prints
1 parent 7ebb0a2 commit de99087

File tree

2 files changed

+6
-10
lines changed

2 files changed

+6
-10
lines changed

pypef/llm/prosst_lora_tune.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def get_logits_from_full_seqs(
7171
)
7272

7373
logits = torch.log_softmax(outputs.logits[:, 1:-1], dim=-1).squeeze()
74-
for i_s, sequence in enumerate(tqdm(xs, disable=not verbose, desc='Getting sequence logits')):
74+
for i_s, sequence in enumerate(tqdm(xs, disable=not verbose, desc='Getting ProSST sequence logits')):
7575
for i_aa, x_aa in enumerate(sequence):
7676
if i_aa == 0:
7777
seq_log_probs = logits[i_aa, x_aa].reshape(1)
@@ -104,7 +104,7 @@ def prosst_train(
104104
torch.manual_seed(seed)
105105
if device is None:
106106
device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
107-
print(f'Training using {device.upper()} device (N_Train={len(torch.flatten(score_batches))})...')
107+
print(f'ProSST training using {device.upper()} device (N_Train={len(torch.flatten(score_batches))})...')
108108
#structure_sequence = PdbQuantizer()(pdb_file=pdb_path)
109109
#structure_sequence_offset = [i + 3 for i in structure_sequence]
110110
#tokenized_res = tokenizer([wt_seq], return_tensors='pt')

scripts/ProteinGym_runs/run_performance_tests_proteingym_hybrid_dca_llm.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -128,13 +128,13 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
128128
x_dca = gremlin.collect_encoded_sequences(sequences)
129129
x_wt = gremlin.x_wt
130130
y_pred_dca = get_delta_e_statistical_model(x_dca, x_wt)
131-
print('DCA:', spearmanr(fitnesses, y_pred_dca), len(fitnesses))
131+
print(f'DCA (unsupervised performance): {spearmanr(fitnesses, y_pred_dca)[0]:.3f}')
132132
dca_unopt_perf = spearmanr(fitnesses, y_pred_dca)[0]
133133

134134
try:
135135
x_esm, esm_attention_mask = esm_tokenize_sequences(sequences, esm_tokenizer, max_length=1000)#len(wt_seq))
136136
y_esm = esm_infer(get_batches(x_esm, dtype=float, batch_size=1), esm_attention_mask, esm_base_model)
137-
print('ESM1v:', spearmanr(fitnesses, y_esm.cpu()))
137+
print(f'ESM1v (unsupervised performance): {spearmanr(fitnesses, y_esm.cpu())[0]:.3f}')
138138
esm_unopt_perf = spearmanr(fitnesses, y_esm.cpu())[0]
139139
except RuntimeError:
140140
esm_unopt_perf = np.nan
@@ -144,7 +144,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
144144
x_prosst = prosst_tokenize_sequences(sequences=sequences, vocab=prosst_vocab)
145145
y_prosst = get_logits_from_full_seqs(
146146
x_prosst, prosst_base_model, input_ids, prosst_attention_mask, structure_input_ids, train=False)
147-
print('ProSST:', spearmanr(fitnesses, y_prosst.cpu()))
147+
print(f'ProSST (unsupervised performance): {spearmanr(fitnesses, y_prosst.cpu())[0]:.3f}')
148148
prosst_unopt_perf = spearmanr(fitnesses, y_prosst.cpu())[0]
149149
except RuntimeError:
150150
prosst_unopt_perf = np.nan
@@ -239,7 +239,7 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
239239
np.asarray(x_llm_test_prosst)
240240
][i_m]
241241
)
242-
print(f'Hybrid perf.: {spearmanr(y_test, y_test_pred)[0]}')
242+
print(f'Hybrid performance: {spearmanr(y_test, y_test_pred)[0]:.3f}')
243243
hybrid_perfs.append(spearmanr(y_test, y_test_pred)[0])
244244
except RuntimeError: # modeling_prosst.py, line 920, in forward
245245
# or UnboundLocalError in prosst_lora_tune.py, line 167
@@ -263,8 +263,6 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
263263
dset_ns_y_test_i = ''
264264
for ns_y_t in ns_y_test:
265265
dset_ns_y_test_i += f'{ns_y_t},'
266-
print(ns_y_test)
267-
print('\nREADME:\n', dset_hybrid_perfs_i, '\n', dset_ns_y_test_i, '\n')
268266
with open(out_results_csv, 'a') as fh:
269267
fh.write(
270268
f'{numbers_of_datasets[i]},{dset_key},{len(variants_orig)},{max_muts},{dca_unopt_perf},'
@@ -274,7 +272,6 @@ def compute_performances(mut_data, mut_sep=':', start_i: int = 0, already_tested
274272
def plot_csv_data(csv, plot_name):
275273
train_test_size_texts = []
276274
df = pd.read_csv(csv, sep=',')
277-
# No.,Dataset,N_Variants,N_Max_Muts,Untrained_Performance_DCA,Untrained_Performance_LLM,Hybrid_Trained_Performance_100,Hybrid_Trained_Performance_200,Hybrid_Trained_Performance_1000
278275
tested_dsets = df['No.']
279276
dset_dca_perfs = df['Untrained_Performance_DCA']
280277
dset_esm_perfs = df['Untrained_Performance_ESM1v']
@@ -290,7 +287,6 @@ def plot_csv_data(csv, plot_name):
290287
dset_hybrid_perfs_dca_prosst_1000 = df['Hybrid_DCA_ProSST_Trained_Performance_1000']
291288

292289
plt.figure(figsize=(80, 12))
293-
#import gc;gc.collect() # Potentially GC is needed to free some RAM (deallocated VRAM -> partly stored in RAM?) after each run
294290
plt.plot(range(len(tested_dsets)), dset_dca_perfs, 'o--', markersize=8, color='tab:blue', label='DCA (0)')
295291
plt.plot(range(len(tested_dsets) + 1), np.full(len(tested_dsets) + 1, np.nanmean(dset_dca_perfs)), color='tab:blue', linestyle='--')
296292
for i, (p, n_test) in enumerate(zip(dset_dca_perfs, df['N_Y_test'].astype('Int64').to_list())):

0 commit comments

Comments
 (0)