Merge branch 'dev' into protein_prediction

aditya0by0 · aditya0by0 · commit 25177b327222 · 2024-10-11T12:55:49.000+02:00
diff --git a/chebai/models/electra.py b/chebai/models/electra.py
@@ -256,7 +256,9 @@ def __init__(
         # Load pretrained checkpoint if provided
         if pretrained_checkpoint:
             with open(pretrained_checkpoint, "rb") as fin:
-                model_dict = torch.load(fin, map_location=self.device)
+                model_dict = torch.load(
+                    fin, map_location=self.device, weights_only=False
+                )
                 if load_prefix:
                     state_dict = filter_dict(model_dict["state_dict"], load_prefix)
                 else:
@@ -414,7 +416,9 @@ def __init__(self, cone_dimensions=20, **kwargs):
         model_prefix = kwargs.get("load_prefix", None)
         if pretrained_checkpoint:
             with open(pretrained_checkpoint, "rb") as fin:
-                model_dict = torch.load(fin, map_location=self.device)
+                model_dict = torch.load(
+                    fin, map_location=self.device, weights_only=False
+                )
                 if model_prefix:
                     state_dict = {
                         str(k)[len(model_prefix) :]: v
diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py
@@ -200,7 +200,9 @@ def load_processed_data(
                     filename = self.processed_file_names_dict[kind]
             except NotImplementedError:
                 filename = f"{kind}.pt"
-        return torch.load(os.path.join(self.processed_dir, filename))
+        return torch.load(
+            os.path.join(self.processed_dir, filename), weights_only=False
+        )
 
     def dataloader(self, kind: str, **kwargs) -> DataLoader:
         """
@@ -519,7 +521,7 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader:
             DataLoader: DataLoader object for the specified subset.
         """
         subdatasets = [
-            torch.load(os.path.join(s.processed_dir, f"{kind}.pt"))
+            torch.load(os.path.join(s.processed_dir, f"{kind}.pt"), weights_only=False)
             for s in self.subsets
         ]
         dataset = [
@@ -1022,7 +1024,9 @@ def _retrieve_splits_from_csv(self) -> None:
         splits_df = pd.read_csv(self.splits_file_path)
 
         filename = self.processed_file_names_dict["data"]
-        data = torch.load(os.path.join(self.processed_dir, filename))
+        data = torch.load(
+            os.path.join(self.processed_dir, filename), weights_only=False
+        )
         df_data = pd.DataFrame(data)
 
         train_ids = splits_df[splits_df["split"] == "train"]["id"]
@@ -1081,7 +1085,9 @@ def load_processed_data(
 
         # If filename is provided
         try:
-            return torch.load(os.path.join(self.processed_dir, filename))
+            return torch.load(
+                os.path.join(self.processed_dir, filename), weights_only=False
+            )
         except FileNotFoundError:
             raise FileNotFoundError(f"File {filename} doesn't exist")
 
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
@@ -13,7 +13,7 @@
 import pickle
 from abc import ABC
 from collections import OrderedDict
-from typing import Any, Dict, Generator, List, Optional, Tuple
+from typing import Any, Dict, Generator, List, Optional, Tuple, Union
 
 import fastobo
 import networkx as nx
@@ -244,16 +244,26 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
         with open(data_path, encoding="utf-8") as chebi:
             chebi = "\n".join(l for l in chebi if not l.startswith("xref:"))
 
-        elements = [
-            term_callback(clause)
-            for clause in fastobo.loads(chebi)
-            if clause and ":" in str(clause.id)
-        ]
+        elements = []
+        for term_doc in fastobo.loads(chebi):
+            if (
+                term_doc
+                and isinstance(term_doc.id, fastobo.id.PrefixedIdent)
+                and term_doc.id.prefix == "CHEBI"
+            ):
+                term_dict = term_callback(term_doc)
+                if term_dict:
+                    elements.append(term_dict)
 
         g = nx.DiGraph()
         for n in elements:
             g.add_node(n["id"], **n)
-        g.add_edges_from([(p, q["id"]) for q in elements for p in q["parents"]])
+
+        # Only add edges whose parent node already exists in the graph, so that add_edges_from does not implicitly create nodes for obsolete terms
+        # https://github.com/ChEB-AI/python-chebai/pull/55#issuecomment-2386654142
+        g.add_edges_from(
+            [(p, q["id"]) for q in elements for p in q["parents"] if g.has_node(p)]
+        )
 
         print("Compute transitive closure")
         return nx.transitive_closure_dag(g)
@@ -397,7 +407,9 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         """
         try:
             filename = self.processed_file_names_dict["data"]
-            data_chebi_version = torch.load(os.path.join(self.processed_dir, filename))
+            data_chebi_version = torch.load(
+                os.path.join(self.processed_dir, filename), weights_only=False
+            )
         except FileNotFoundError:
             raise FileNotFoundError(
                 f"File data.pt doesn't exists. "
@@ -418,7 +430,8 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
                 data_chebi_train_version = torch.load(
                     os.path.join(
                         self._chebi_version_train_obj.processed_dir, filename_train
-                    )
+                    ),
+                    weights_only=False,
                 )
             except FileNotFoundError:
                 raise FileNotFoundError(
@@ -812,7 +825,7 @@ def chebi_to_int(s: str) -> int:
     return int(s[s.index(":") + 1 :])
 
 
-def term_callback(doc) -> dict:
+def term_callback(doc: fastobo.term.TermFrame) -> Union[Dict, bool]:
     """
     Extracts information from a ChEBI term document.
     This function takes a ChEBI term document as input and extracts relevant information such as the term ID, parents,
@@ -852,6 +865,12 @@ def term_callback(doc) -> dict:
             parents.append(chebi_to_int(str(clause.term)))
         elif isinstance(clause, fastobo.term.NameClause):
             name = str(clause.name)
+
+        if isinstance(clause, fastobo.term.IsObsoleteClause):
+            if clause.obsolete:
+                # If the term document has an is-obsolete clause set to true, skip this document.
+                return False
+
     return {
         "id": chebi_to_int(str(doc.id)),
         "parents": parents,
diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
@@ -514,7 +514,9 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         """
         try:
             filename = self.processed_file_names_dict["data"]
-            data_go = torch.load(os.path.join(self.processed_dir, filename))
+            data_go = torch.load(
+                os.path.join(self.processed_dir, filename), weights_only=False
+            )
         except FileNotFoundError:
             raise FileNotFoundError(
                 f"File data.pt doesn't exists. "
diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
@@ -891,10 +891,10 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader:
             DataLoader: DataLoader instance.
         """
         labeled_data = torch.load(
-            os.path.join(self.labeled.processed_dir, f"{kind}.pt")
+            os.path.join(self.labeled.processed_dir, f"{kind}.pt"), weights_only=False
         )
         unlabeled_data = torch.load(
-            os.path.join(self.unlabeled.processed_dir, f"{kind}.pt")
+            os.path.join(self.unlabeled.processed_dir, f"{kind}.pt"), weights_only=False
         )
         if self.data_limit is not None:
             labeled_data = labeled_data[: self.data_limit]
diff --git a/chebai/preprocessing/migration/chebi_data_migration.py b/chebai/preprocessing/migration/chebi_data_migration.py
@@ -168,7 +168,7 @@ def _combine_pt_splits(
         df_list: List[pd.DataFrame] = []
         for split, file_name in old_splits_file_names.items():
             file_path = os.path.join(old_dir, file_name)
-            file_df = pd.DataFrame(torch.load(file_path))
+            file_df = pd.DataFrame(torch.load(file_path, weights_only=False))
             df_list.append(file_df)
 
         return pd.concat(df_list, ignore_index=True)
diff --git a/chebai/result/analyse_sem.py b/chebai/result/analyse_sem.py
@@ -427,7 +427,9 @@ def run_all(
                     os.path.join(buffer_dir_smoothed, "preds000.pt")
                 ):
                     preds = torch.load(
-                        os.path.join(buffer_dir_smoothed, "preds000.pt"), DEVICE
+                        os.path.join(buffer_dir_smoothed, "preds000.pt"),
+                        DEVICE,
+                        weights_only=False,
                     )
                     labels = None
                 else:
diff --git a/chebai/result/base.py b/chebai/result/base.py
@@ -54,7 +54,7 @@ def _generate_predictions(self, data_path, raw=False, **kwargs):
         else:
             data_tuples = [
                 (x.get("raw_features", x["ident"]), x["ident"], x)
-                for x in torch.load(data_path)
+                for x in torch.load(data_path, weights_only=False)
             ]
 
         for raw_features, ident, row in tqdm.tqdm(data_tuples):
diff --git a/chebai/result/pretraining.py b/chebai/result/pretraining.py
@@ -34,7 +34,7 @@ def evaluate_model(logs_base_path, model_filename, data_module):
     collate = data_module.reader.COLLATOR()
     test_file = "test.pt"
     data_path = os.path.join(data_module.processed_dir, test_file)
-    data_list = torch.load(data_path)
+    data_list = torch.load(data_path, weights_only=False)
     preds_list = []
     labels_list = []
 
diff --git a/chebai/result/utils.py b/chebai/result/utils.py
@@ -182,6 +182,7 @@ def load_results_from_buffer(
             torch.load(
                 os.path.join(buffer_dir, filename),
                 map_location=torch.device(device),
+                weights_only=False,
             )
         )
         i += 1
@@ -194,6 +195,7 @@ def load_results_from_buffer(
             torch.load(
                 os.path.join(buffer_dir, filename),
                 map_location=torch.device(device),
+                weights_only=False,
             )
         )
         i += 1
diff --git a/tests/testCustomBalancedAccuracyMetric.py b/tests/testCustomBalancedAccuracyMetric.py
@@ -49,7 +49,9 @@ def test_metric_against_realistic_data(self) -> None:
 
         # load single file to get the num of labels for metric class instantiation
         labels = torch.load(
-            f"{directory_path}/labels{0:03d}.pt", map_location=torch.device(self.device)
+            f"{directory_path}/labels{0:03d}.pt",
+            map_location=torch.device(self.device),
+            weights_only=False,
         )
         num_labels = labels.shape[1]
         balanced_acc_custom = BalancedAccuracy(num_labels=num_labels)
@@ -58,10 +60,12 @@ def test_metric_against_realistic_data(self) -> None:
             labels = torch.load(
                 f"{directory_path}/labels{i:03d}.pt",
                 map_location=torch.device(self.device),
+                weights_only=False,
             )
             preds = torch.load(
                 f"{directory_path}/preds{i:03d}.pt",
                 map_location=torch.device(self.device),
+                weights_only=False,
             )
             balanced_acc_custom.update(preds, labels)
 
diff --git a/tests/testCustomMacroF1Metric.py b/tests/testCustomMacroF1Metric.py
@@ -119,7 +119,9 @@ def test_metric_against_realistic_data(self) -> None:
 
         # Load single file to get the number of labels for metric class instantiation
         labels = torch.load(
-            f"{directory_path}/labels{0:03d}.pt", map_location=torch.device(self.device)
+            f"{directory_path}/labels{0:03d}.pt",
+            map_location=torch.device(self.device),
+            weights_only=False,
         )
         num_labels = labels.shape[1]
         macro_f1_custom = MacroF1(num_labels=num_labels)
@@ -130,10 +132,12 @@ def test_metric_against_realistic_data(self) -> None:
             labels = torch.load(
                 f"{directory_path}/labels{i:03d}.pt",
                 map_location=torch.device(self.device),
+                weights_only=False,
             )
             preds = torch.load(
                 f"{directory_path}/preds{i:03d}.pt",
                 map_location=torch.device(self.device),
+                weights_only=False,
             )
             macro_f1_standard.update(preds, labels)
             macro_f1_custom.update(preds, labels)
diff --git a/tests/testPubChemData.py b/tests/testPubChemData.py
@@ -37,9 +37,15 @@ def getDataSplitsOverlaps(cls) -> None:
         processed_path = os.path.join(os.getcwd(), cls.pubChem.processed_dir)
         print(f"Checking Data from - {processed_path}")
 
-        train_set = torch.load(os.path.join(processed_path, "train.pt"))
-        val_set = torch.load(os.path.join(processed_path, "validation.pt"))
-        test_set = torch.load(os.path.join(processed_path, "test.pt"))
+        train_set = torch.load(
+            os.path.join(processed_path, "train.pt"), weights_only=False
+        )
+        val_set = torch.load(
+            os.path.join(processed_path, "validation.pt"), weights_only=False
+        )
+        test_set = torch.load(
+            os.path.join(processed_path, "test.pt"), weights_only=False
+        )
 
         train_smiles, train_smiles_ids = cls.get_features_ids(train_set)
         val_smiles, val_smiles_ids = cls.get_features_ids(val_set)
diff --git a/tests/testTox21MolNetData.py b/tests/testTox21MolNetData.py
@@ -37,9 +37,15 @@ def getDataSplitsOverlaps(cls) -> None:
         processed_path = os.path.join(os.getcwd(), cls.tox21.processed_dir)
         print(f"Checking Data from - {processed_path}")
 
-        train_set = torch.load(os.path.join(processed_path, "train.pt"))
-        val_set = torch.load(os.path.join(processed_path, "validation.pt"))
-        test_set = torch.load(os.path.join(processed_path, "test.pt"))
+        train_set = torch.load(
+            os.path.join(processed_path, "train.pt"), weights_only=False
+        )
+        val_set = torch.load(
+            os.path.join(processed_path, "validation.pt"), weights_only=False
+        )
+        test_set = torch.load(
+            os.path.join(processed_path, "test.pt"), weights_only=False
+        )
 
         train_smiles, train_smiles_ids = cls.get_features_ids(train_set)
         val_smiles, val_smiles_ids = cls.get_features_ids(val_set)
diff --git a/tutorials/demo_process_results.ipynb b/tutorials/demo_process_results.ipynb
@@ -248,9 +248,9 @@
     "# check if pretraining datasets overlap\n",
     "dm = PubChemDeepSMILES()\n",
     "processed_path = dm.processed_dir\n",
-    "test_set = torch.load(os.path.join(processed_path, \"test.pt\"))\n",
-    "val_set = torch.load(os.path.join(processed_path, \"validation.pt\"))\n",
-    "train_set = torch.load(os.path.join(processed_path, \"train.pt\"))\n",
+    "test_set = torch.load(os.path.join(processed_path, \"test.pt\"), weights_only=False)\n",
+    "val_set = torch.load(os.path.join(processed_path, \"validation.pt\"), weights_only=False)\n",
+    "train_set = torch.load(os.path.join(processed_path, \"train.pt\"), weights_only=False)\n",
     "print(processed_path)\n",
     "test_smiles = [entry[\"features\"] for entry in test_set]\n",
     "val_smiles = [entry[\"features\"] for entry in val_set]\n",
@@ -320,7 +320,7 @@
     "data_module_v200 = ChEBIOver100()\n",
     "data_module_v148 = ChEBIOver100(chebi_version_train=148)\n",
     "data_module_v227 = ChEBIOver100(chebi_version_train=227)\n",
-    "# dataset = torch.load(data_path)\n",
+    "# dataset = torch.load(data_path, weights_only=False)\n",
     "# processors = [CustomResultsProcessor()]\n",
     "# factory = ResultFactory(model, data_module, processors)\n",
     "# factory.execute(data_path)"
@@ -653,7 +653,7 @@
     "    if test_file is None:\n",
     "        test_file = data_module.processed_file_names_dict[\"test\"]\n",
     "    data_path = os.path.join(data_module.processed_dir, test_file)\n",
-    "    data_list = torch.load(data_path)\n",
+    "    data_list = torch.load(data_path, weights_only=False)\n",
     "    preds_list = []\n",
     "    labels_list = []\n",
     "    # if common_classes_mask is not N\n",
diff --git a/tutorials/process_results_old_chebi.ipynb b/tutorials/process_results_old_chebi.ipynb
@@ -167,7 +167,7 @@
     "    if test_file is None:\n",
     "        test_file = data_module.processed_file_names_dict[\"test\"]\n",
     "    data_path = os.path.join(data_module.processed_dir, test_file)\n",
-    "    data_list = torch.load(data_path)\n",
+    "    data_list = torch.load(data_path, weights_only=False)\n",
     "    preds_list = []\n",
     "    labels_list = []\n",
     "\n",

Original file line number	Diff line number	Diff line change
`@@ -891,10 +891,10 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader:`
`891`	`891`	`DataLoader: DataLoader instance.`
`892`	`892`	`"""`
`893`	`893`	`labeled_data = torch.load(`
`894`		`- os.path.join(self.labeled.processed_dir, f"{kind}.pt")`
	`894`	`+ os.path.join(self.labeled.processed_dir, f"{kind}.pt"), weights_only=False`
`895`	`895`	`)`
`896`	`896`	`unlabeled_data = torch.load(`
`897`		`- os.path.join(self.unlabeled.processed_dir, f"{kind}.pt")`
	`897`	`+ os.path.join(self.unlabeled.processed_dir, f"{kind}.pt"), weights_only=False`
`898`	`898`	`)`
`899`	`899`	`if self.data_limit is not None:`
`900`	`900`	`labeled_data = labeled_data[: self.data_limit]`
Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,7 @@ def _generate_predictions(self, data_path, raw=False, **kwargs):`
`54`	`54`	`else:`
`55`	`55`	`data_tuples = [`
`56`	`56`	`(x.get("raw_features", x["ident"]), x["ident"], x)`
`57`		`- for x in torch.load(data_path)`
	`57`	`+ for x in torch.load(data_path, weights_only=False)`
`58`	`58`	`]`
`59`	`59`
`60`	`60`	`for raw_features, ident, row in tqdm.tqdm(data_tuples):`
Original file line number	Diff line number	Diff line change
`@@ -182,6 +182,7 @@ def load_results_from_buffer(`
`182`	`182`	`torch.load(`
`183`	`183`	`os.path.join(buffer_dir, filename),`
`184`	`184`	`map_location=torch.device(device),`
	`185`	`+ weights_only=False,`
`185`	`186`	`)`
`186`	`187`	`)`
`187`	`188`	`i += 1`
`@@ -194,6 +195,7 @@ def load_results_from_buffer(`
`194`	`195`	`torch.load(`
`195`	`196`	`os.path.join(buffer_dir, filename),`
`196`	`197`	`map_location=torch.device(device),`
	`198`	`+ weights_only=False,`
`197`	`199`	`)`
`198`	`200`	`)`
`199`	`201`	`i += 1`