Merge pull request #26 from ChEB-AI/fix-pandas-serialisation

sfluegel05 · web-flow · commit 648c675b5fda · 2024-05-08T09:28:01.000+02:00
Replace pickle.load/pickle.dump with pd.read_pickle/pd.to_pickle for raw files
diff --git a/chebai/loss/bce_weighted.py b/chebai/loss/bce_weighted.py
@@ -35,7 +35,7 @@ def set_pos_weight(self, input):
         ):
             complete_data = pd.concat(
                 [
-                    pickle.load(
+                    pd.read_pickle(
                         open(
                             os.path.join(
                                 self.data_extractor.raw_dir,
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
@@ -192,7 +192,7 @@ def graph_to_raw_dataset(self, g, split_name=None):
         return data
 
     def save_raw(self, data: pd.DataFrame, filename: str):
-        pickle.dump(data, open(os.path.join(self.raw_dir, filename), "wb"))
+        pd.to_pickle(data, open(os.path.join(self.raw_dir, filename), "wb"))
 
     def _load_dict(self, input_file_path):
         """
@@ -205,7 +205,7 @@ def _load_dict(self, input_file_path):
             dict: The dictionary, keys are `features`, `labels` and `ident`.
         """
         with open(input_file_path, "rb") as input_file:
-            df = pickle.load(input_file)
+            df = pd.read_pickle(input_file)
             if self.single_class is not None:
                 single_cls_index = list(df.columns).index(int(self.single_class))
             for row in df.values:
@@ -218,7 +218,7 @@ def _load_dict(self, input_file_path):
     @staticmethod
     def _get_data_size(input_file_path):
         with open(input_file_path, "rb") as f:
-            return len(pickle.load(f))
+            return len(pd.read_pickle(f))
 
     def _setup_pruned_test_set(self):
         """Create test set with same leaf nodes, but use classes that appear in train set"""
@@ -468,7 +468,7 @@ def prepare_data(self, *args, **kwargs):
                 with open(
                     os.path.join(self.raw_dir, self.raw_file_names_dict["test"]), "rb"
                 ) as input_file:
-                    test_df = pickle.load(input_file)
+                    test_df = pd.read_pickle(input_file)
             # create train/val split based on test set
             chebi_path = self._load_chebi(
                 self.chebi_version_train

Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ def set_pos_weight(self, input):`
`35`	`35`	`):`
`36`	`36`	`complete_data = pd.concat(`
`37`	`37`	`[`
`38`		`- pickle.load(`
	`38`	`+ pd.read_pickle(`
`39`	`39`	`open(`
`40`	`40`	`os.path.join(`
`41`	`41`	`self.data_extractor.raw_dir,`