Fix: DCA-only hybrid prediction added a bare self.beta2 instead of self.beta2 * y_ridge

niklases · niklases · commit 7ebb0a2c7a81 · 2025-04-12T11:40:50.000+02:00
diff --git a/pypef/hybrid/hybrid_model.py b/pypef/hybrid/hybrid_model.py
@@ -575,10 +575,10 @@ def hybrid_prediction(
             if self.llm_attention_mask is not None:
                 print('No LLM input for hybrid prediction but the model '
                       'has been trained using an LLM model input.. '
-                      'Using only DCA for hybridprediction.. This can lead '
+                      'Using only DCA for hybrid prediction.. This can lead '
                       'to unwanted prediction behavior if the hybrid model '
                       'is trained including an LLM...')
-            return self.beta1 * y_dca + self.beta2
+            return self.beta1 * y_dca + self.beta2 * y_ridge
         
         else:
             if self.llm_key == 'prosst':
@@ -615,13 +615,6 @@ def hybrid_prediction(
                     #desc='Infering LoRA-tuned model', 
                     device=self.device).detach().cpu().numpy()
             
-
-            #y_dca, y_ridge, y_llm, y_llm_lora = (
-            #    reduce_by_batch_modulo(y_dca, batch_size=self.batch_size), 
-            #    reduce_by_batch_modulo(y_ridge, batch_size=self.batch_size), 
-            #    reduce_by_batch_modulo(y_llm, batch_size=self.batch_size), 
-            #    reduce_by_batch_modulo(y_llm_lora, batch_size=self.batch_size)
-            #)
             return self.beta1 * y_dca + self.beta2 * y_ridge + self.beta3 * y_llm + self.beta4 * y_llm_lora
 
     def split_performance(
diff --git a/scripts/ProteinGym_runs/results/dca_esm_and_hybrid_opt_results.csv b/scripts/ProteinGym_runs/results/dca_esm_and_hybrid_opt_results.csv
diff --git a/scripts/ProteinGym_runs/run_performance_tests_proteingym_hybrid_dca_llm.py b/scripts/ProteinGym_runs/run_performance_tests_proteingym_hybrid_dca_llm.py
@@ -505,15 +505,28 @@ def plot_csv_data(csv, plot_name):
 
     compute_performances(
         mut_data=combined_mut_data, 
-        start_i=0,#start_i, 
+        start_i=start_i, 
         already_tested_is=already_tested_is
     )
 
 
     with open(out_results_csv, 'r') as fh:
-        with open(os.path.join(os.path.dirname(__file__), 'results/dca_esm_and_hybrid_opt_results_clean.csv'), 'w') as fh2:
-            for line in fh:
-                if not line.split(',')[1].startswith('OOM') and not line.split(',')[1].startswith('X'):
-                    fh2.write(line)
+        lines = fh.readlines()
+    clean_out_results_csv = os.path.join(
+        os.path.dirname(__file__), 
+        'results/dca_esm_and_hybrid_opt_results_clean.csv'
+    )
+    with open(clean_out_results_csv, 'w') as fh2:
+        header = lines[0]
+        content = lines[1:]
+        sort_keys = []
+        for line in content:
+                sort_keys.append(int(line.split(',')[0]))
+        content_sorted, sort_keys_sorted = [l for l in zip(*sorted(
+            zip(content, sort_keys), key=lambda x: x[1]))]
+        fh2.write(header)
+        for line in content_sorted:
+            if not line.split(',')[1].startswith('OOM') and not line.split(',')[1].startswith('X'):
+                fh2.write(line)
     
-    plot_csv_data(csv=os.path.join(os.path.dirname(__file__), 'results/dca_esm_and_hybrid_opt_results_clean.csv'), plot_name='mut_performance')
+    plot_csv_data(csv=clean_out_results_csv, plot_name='mut_performance')