|
44 | 44 | # the test performance with the other encoding techniques to be tested, we start with |
45 | 45 | # DCA and split the variant fitness data so that the sizes of the data sets are the same for |
46 | 46 | # all encoding techniques tested. |
47 | | -print(f'\nRunning script... which takes ~ 2 h (GREMLIN) - 4 h in total (PLMC) when using all 566 AAindex-indices ' |
| 47 | +print(f'\nRunning script... which takes ~ 5 h (GREMLIN) - 5+ h in total (PLMC) when using all 566 AAindex-indices ' |
48 | 48 | f'for encoding and model testing (only {n_aaindices_to_test} AAindinces will be tested)...' |
49 | 49 | f'\n\n(1/4) Testing DCA-based sequence encoding...\n' + "=" * 50) |
50 | 50 | if not use_gremlin: # PLMC params file-based encoding |
|
122 | 122 | x_dca = dca_encoder.collect_encoded_sequences(variants) |
123 | 123 | # removing positions that are not DCA-encodable (and also fitnesses, variants, and sequences) |
124 | 124 | # from the 2000 initial variants, 1427 remain |
125 | | - x_dca, fitnesses, variants, sequences = remove_nan_encoded_positions(x_dca, fitnesses, variants, sequences) |
| 125 | + x_dca, fitnesses, variants, sequences = remove_nan_encoded_positions( |
| 126 | + x_dca, fitnesses, variants, sequences) |
126 | 127 | x_wt = dca_encoder.x_wt |
127 | 128 |
|
128 | 129 | # Statistical model performance |
|
154 | 155 | print(f'Split {i + 1}/{len(train_val_splits_indices)}:\nSpearmans rho (ML) = {performances[4]:.3f}') |
155 | 156 | # B. Hybrid modeling |
156 | 157 | # ------------------------------------------------------------------------------- |
157 | | - hybrid_model = DCALLMHybridModel(x_train=x_train_val, y_train=y_train_val, x_wt=x_wt) |
158 | | - beta_1, beta_2, regressor = hybrid_model.settings(x_train=x_train_val, y_train=y_train_val) |
159 | | - y_test_pred = hybrid_model.hybrid_prediction(x=x_test, reg=regressor, beta_1=beta_1, beta_2=beta_2) |
| 158 | + hybrid_model = DCALLMHybridModel(x_train_dca=x_train_val, y_train=y_train_val, x_wt=x_wt) |
| 159 | + y_test_pred = hybrid_model.hybrid_prediction(x_dca=x_test) |
160 | 160 | ten_split_performance_hybrid.append(spearmanr(y_test, y_test_pred)[0]) |
161 | 161 | print(f'Spearmans rho (Hybrid) = {spearmanr(y_test, y_test_pred)[0]:.3f}') |
162 | 162 | print('-'*80 + f'\n{n_splits}-fold mean Spearmans rho (ML)= {np.mean(ten_split_performance_ml):.3f} ' |
|
227 | 227 | # 4th example: Low-N and plotting using all encoding techniques and all data |
228 | 228 | # ------------------------------------------------------------------------------- |
229 | 229 | print('Lastly, encoding all variants and performing "low-N" protein engineering task.\n' |
230 | | - 'This could require some time... < 1 (GREMLIN DCA encoding) to ~ 2 hours (PLMC ' |
231 | | - 'single core DCA encoding) left...') |
232 | | - |
| 230 | + 'This could require some time...') |
233 | 231 | variants = variant_fitness_data.iloc[:, 0] |
234 | 232 | fitnesses = variant_fitness_data.iloc[:, 1].tolist() |
235 | 233 | variants_split = [] |
|
274 | 272 | performances_dca_ml.append(get_regressor_performances( |
275 | 273 | x_dca_train, x_dca_test, y_train, y_test, regressor='ridge')[4]) # [4] defines spearmanr correlation |
276 | 274 |
|
277 | | - hybrid_model = DCALLMHybridModel(x_train=x_dca_train, y_train=y_train, x_wt=x_wt) |
278 | | - beta_1, beta_2, hybrid_regressor = hybrid_model.settings(x_train=x_dca_train, y_train=y_train) |
279 | | - y_hybrid_pred = hybrid_model.hybrid_prediction(x=x_dca_test, reg=hybrid_regressor, beta_1=beta_1, beta_2=beta_2) |
| 275 | + hybrid_model = DCALLMHybridModel(x_train_dca=x_dca_train, y_train=y_train, x_wt=x_wt) |
| 276 | + y_hybrid_pred = hybrid_model.hybrid_prediction(x_dca=x_dca_test) |
280 | 277 | performances_hybrid.append(spearmanr(y_test, y_hybrid_pred)[0]) |
281 | 278 |
|
282 | 279 | x_aaidx_train, x_aaidx_test, y_train, y_test = train_test_split( |
|
0 commit comments