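Update DirectedEvolution class: log the WT prediction once at step 0, label it with a random WT self-substitution, and format logged fitness values to three decimals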

niklases · niklases · commit 778e76147daa · 2025-04-23T20:30:14.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -420,3 +420,4 @@ datasets/AVGFP/model_saves/*
 datasets/AVGFP/Pickles/*
 datasets/AVGFP/DCA_Hybrid_Model_Performance_ESM1v_no_ML.png
 datasets/AVGFP/DCA_Hybrid_Model_Performance_ProSST_no_ML.png
+datasets/AVGFP/HYBRIDgremlinesm_DE_trajectories.png
diff --git a/pypef/hybrid/hybrid_model.py b/pypef/hybrid/hybrid_model.py
@@ -1210,7 +1210,7 @@ def performance_ls_ts(
         )
         model_name = f'HYBRID{model_type.lower()}{llm.lower()}'
         y_test_pred = hybrid_model.hybrid_prediction(np.array(x_test), x_llm_test)
-        print(f'Hybrid performance: {spearmanr(y_test, y_test_pred)}')
+        print(f'Hybrid performance: {spearmanr(y_test, y_test_pred)[0]:.3f} N={len(y_test)}')
         save_model_to_dict_pickle(hybrid_model, model_name)
 
     elif (
diff --git a/pypef/utils/directed_evolution.py b/pypef/utils/directed_evolution.py
@@ -240,22 +240,24 @@ def in_silico_de(self):
                 if wt_prediction is None or wt_prediction == 'skip':
                     wt_prediction = 'skip'
                     while wt_prediction == 'skip':
+                        rand_pos = random.randint(0, len(self.s_wt))
+                        wt_mut = self.s_wt[rand_pos] + str(rand_pos) + self.s_wt[rand_pos]
                         wt_prediction = predict(  # AAidx, OneHot, or DCA-based pure ML prediction
                             path=self.path,
                             model=self.model,
                             encoding=self.encoding,
-                            variants=np.atleast_1d(self.s_wt[int(new_variant[:-1]) - 1] + new_variant[:-1] + 
-                                self.s_wt[int(new_variant[:-1]) - 1]),
+                            variants=np.atleast_1d(wt_mut),
                             sequences=np.atleast_1d(self.s_wt),
                             no_fft=self.no_fft,
                             couplings_file=self.dca_encoder
                         )
-                        logger.info(
-                            f"Step {self.de_step_counter}: "
-                            f"{self.s_wt[int(new_variant[:-1]) - 1] + new_variant[:-1] + self.s_wt[int(new_variant[:-1]) - 1]} --> "
-                            f"{wt_prediction[0][0]} WT relative fitness: {wt_prediction[0][0] - wt_prediction[0][0] + add_epsilon:.3f}"
-                        )
-                y_traj[0] = wt_prediction[0][0]
+                if self.de_step_counter == 0:
+                    logger.info(
+                        f"Step {self.de_step_counter}: "
+                        f"WT ({wt_mut}) --> {wt_prediction[0][0]:.3f} WT relative fitness: "
+                        f"{wt_prediction[0][0] - wt_prediction[0][0] + add_epsilon:.3f}"
+                    )
+                    y_traj[0] = wt_prediction[0][0]
                 predictions = predict(  # AAidx, OneHot, or DCA-based pure ML prediction
                     path=self.path,
                     model=self.model,
@@ -270,32 +272,33 @@ def in_silico_de(self):
                 if wt_prediction is None or wt_prediction == 'skip':
                     wt_prediction = 'skip'
                     while wt_prediction == 'skip':
+                        rand_pos = random.randint(0, len(self.s_wt))
+                        wt_mut = self.s_wt[rand_pos] + str(rand_pos) + self.s_wt[rand_pos]
                         wt_prediction = predict_directed_evolution(
                             encoder=self.dca_encoder,
                             variant=self.s_wt[int(new_variant[:-1]) - 1] + new_variant[:-1] + 
                                 self.s_wt[int(new_variant[:-1]) - 1],  # WT, e.g. F17F
                             variant_sequence=self.s_wt,
                             hybrid_model_data_pkl=self.model
                         )
-                        logger.info(
-                            f"Step {self.de_step_counter}: "
-                            f"WT ({self.s_wt[int(new_variant[:-1]) - 1] + new_variant[:-1] + self.s_wt[int(new_variant[:-1]) - 1]}) --> "
-                            f"{wt_prediction[0][0]} WT relative fitness: {wt_prediction[0][0] - wt_prediction[0][0] + add_epsilon:.3f}"
-                        )
+                if self.de_step_counter == 0:
+                    logger.info(
+                        f"Step {self.de_step_counter}: "
+                        f"WT ({wt_mut}) --> {wt_prediction[0][0]:.3f} WT relative fitness: "
+                        f"{wt_prediction[0][0] - wt_prediction[0][0] + add_epsilon:.3f}"
+                    )
+                    # add_epsilon = 0.01 * abs(wt_prediction[0][0]) # Adding 1% to prediction for hybrid modeling!
                     y_traj[0] = wt_prediction[0][0] - wt_prediction[0][0]
-                    #add_epsilon = 0.01 * abs(wt_prediction[0][0]) # Adding 1% to prediction for hybrid modeling!
-                y_traj[0] = wt_prediction[0][0] - wt_prediction[0][0]
                 predictions = predict_directed_evolution(
                     encoder=self.dca_encoder,
                     variant=self.s_wt[int(new_variant[:-1]) - 1] + new_variant,
                     variant_sequence=new_sequence,
                     hybrid_model_data_pkl=self.model
                 )
-                print('PREDICTIONS:', predictions)
             if predictions != 'skip':
                 logger.info(f"Step {self.de_step_counter + 1}: "
                             f"{self.s_wt[int(new_variant[:-1]) - 1]}{new_variant} --> "
-                            f"{predictions[0][0]} WT relative fitness: {predictions[0][0] - wt_prediction[0][0] + add_epsilon:.3f}")
+                            f"{predictions[0][0]:.3f} WT relative fitness: {predictions[0][0] - wt_prediction[0][0] + add_epsilon:.3f}")
             else:  # skip if variant cannot be encoded by DCA-based encoding technique
                 logger.info(f"Step {self.de_step_counter + 1}: "
                             f"{self.s_wt[int(new_variant[:-1]) - 1]}{new_variant} --> {predictions}")
@@ -319,9 +322,9 @@ def in_silico_de(self):
                 y_traj.append(new_y)         # update the fitness trajectory records
                 s_traj.append(new_sequence)  # update the sequence trajectory records
                 accepted += 1
-                logger.info(f'Accepted variant {new_var} [current evolutionary trajectory: {v_traj}]')
+                logger.info(f'Accepted variant {new_var} (current evolutionary trajectory: {v_traj})')
             else: 
-                logger.info(f'Rejected variant {new_var} [current evolutionary trajectory: {v_traj}]')
+                logger.info(f'Rejected variant {new_var} (current evolutionary trajectory: {v_traj})')
 
         self.assert_trajectory_sequences(v_traj, s_traj)
 

Original file line number	Diff line number	Diff line change
`@@ -1210,7 +1210,7 @@ def performance_ls_ts(`
`1210`	`1210`	`)`
`1211`	`1211`	`model_name = f'HYBRID{model_type.lower()}{llm.lower()}'`
`1212`	`1212`	`y_test_pred = hybrid_model.hybrid_prediction(np.array(x_test), x_llm_test)`
`1213`		`- print(f'Hybrid performance: {spearmanr(y_test, y_test_pred)}')`
	`1213`	`+ print(f'Hybrid performance: {spearmanr(y_test, y_test_pred)[0]:.3f} N={len(y_test)}')`
`1214`	`1214`	`save_model_to_dict_pickle(hybrid_model, model_name)`
`1215`	`1215`
`1216`	`1216`	`elif (`