migration script update

aditya0by0 · aditya0by0 · commit 093be281a378 · 2024-11-13T22:56:44.000+01:00
diff --git a/chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py b/chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py
@@ -14,7 +14,7 @@ class DeepGo1DataMigration:
     A class to handle data migration and processing for the DeepGO project.
     It migrates the DeepGO data to our data structure followed for GO-UniProt data.
 
-    This class handles data from the DeepGO model as described in:
+    This class handles migration of data from the DeepGO paper below:
         Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf,
         DeepGO: predicting protein functions from sequence and interactions using a deep ontology-aware classifier,
         Bioinformatics, Volume 34, Issue 4, February 2018, Pages 660–668
diff --git a/chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py b/chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py
@@ -5,61 +5,34 @@
 import pandas as pd
 from jsonargparse import CLI
 
-from chebai.preprocessing.datasets.go_uniprot import (
-    GOUniProtOver50,
-    GOUniProtOver250,
-    _GOUniProtDataExtractor,
-)
+from chebai.preprocessing.datasets.go_uniprot import DeepGO2MigratedData
 
 
 class DeepGo2DataMigration:
     """
     A class to handle data migration and processing for the DeepGO project. It migrates the data from the DeepGO-SE
     data structure to our data structure followed for GO-UniProt data.
 
-    It migrates the data of DeepGO model of the below research paper:
+    This class handles migration of data from the DeepGO paper below:
         Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf,
         DeepGO: predicting protein functions from sequence and interactions using a deep ontology-aware classifier,
         Bioinformatics, Volume 34, Issue 4, February 2018, Pages 660–668
-        (https://doi.org/10.1093/bioinformatics/btx624),
-
-    Attributes:
-        _CORRESPONDING_GO_CLASSES (dict): Mapping of GO branches to specific data extractor classes.
-        _MAXLEN (int): Maximum sequence length for sequences.
-        _LABELS_START_IDX (int): Starting index for labels in the dataset.
-
-    Methods:
-        __init__(data_dir, go_branch): Initializes the data directory and GO branch.
-        _load_data(): Loads train, validation, test, and terms data from the specified directory.
-        _record_splits(): Creates a DataFrame with IDs and their corresponding split.
-        migrate(): Executes the migration process including data loading, processing, and saving.
-        _extract_required_data_from_splits(): Extracts required columns from the splits data.
-        _generate_labels(data_df): Generates label columns for the data based on GO terms.
-        extract_go_id(go_list): Extracts GO IDs from a list.
-        save_migrated_data(data_df, splits_df): Saves the processed data and splits.
+        (https://doi.org/10.1093/bioinformatics/btx624)
     """
 
-    # Link for the namespaces convention used for GO branch
-    # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/utils.py#L18-L22
-    _CORRESPONDING_GO_CLASSES = {
-        "cc": GOUniProtOver50,
-        "mf": GOUniProtOver50,
-        "bp": GOUniProtOver250,
-    }
-
     # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
     _MAXLEN = 1000
-    _LABELS_START_IDX = _GOUniProtDataExtractor._LABELS_START_IDX
+    _LABELS_START_IDX = DeepGO2MigratedData._LABELS_START_IDX
 
     def __init__(self, data_dir: str, go_branch: Literal["cc", "mf", "bp"]):
         """
         Initializes the data migration object with a data directory and GO branch.
 
         Args:
             data_dir (str): Directory containing the data files.
-            go_branch (Literal["cc", "mf", "bp"]): GO branch to use (cellular_component, molecular_function, or biological_process).
+            go_branch (Literal["cc", "mf", "bp"]): GO branch to use.
         """
-        valid_go_branches = list(self._CORRESPONDING_GO_CLASSES.keys())
+        valid_go_branches = list(DeepGO2MigratedData.GO_BRANCH_MAPPING.keys())
         if go_branch not in valid_go_branches:
             raise ValueError(f"go_branch must be one of {valid_go_branches}")
         self._go_branch = go_branch
@@ -71,13 +44,45 @@ def __init__(self, data_dir: str, go_branch: Literal["cc", "mf", "bp"]):
         self._terms_df: Optional[pd.DataFrame] = None
         self._classes: Optional[List[str]] = None
 
+    def migrate(self) -> None:
+        """
+        Runs the full migration: load, record splits, build labels, and save.
+
+        The frame written to disk is the one returned by `_generate_labels`.
+
+        Raises:
+            Exception: If any split/terms frame or derived result is None.
+        """
+        print("Starting the migration process...")
+        self._load_data()
+        # All four frames must be populated before any processing starts.
+        loaded = (self._train_df, self._validation_df, self._test_df, self._terms_df)
+        if any(df is None for df in loaded):
+            raise Exception(
+                "Data splits or terms data is not available in instance variables."
+            )
+        splits_df = self._record_splits()
+
+        data_df = self._extract_required_data_from_splits()
+        # _generate_labels returns a new, row-filtered frame; keep that result.
+        data_with_labels_df = self._generate_labels(data_df)
+
+        if any(v is None for v in (data_with_labels_df, splits_df, self._classes)):
+            raise Exception(
+                "Data splits or terms data is not available in instance variables."
+            )
+
+        # Persist the labelled frame rather than the pre-label `data_df`,
+        # which never receives the row filter applied in _generate_labels.
+        self.save_migrated_data(data_with_labels_df, splits_df)
+
     def _load_data(self) -> None:
         """
         Loads the test, train, validation, and terms data from the pickled files
         in the data directory.
         """
         try:
-            print(f"Loading data from {self._data_dir}......")
+            print(f"Loading data from directory: {self._data_dir}......")
             self._test_df = pd.DataFrame(
                 pd.read_pickle(os.path.join(self._data_dir, "test_data.pkl"))
             )
@@ -100,7 +105,7 @@ def _record_splits(self) -> pd.DataFrame:
         Returns:
             pd.DataFrame: A combined DataFrame containing split assignments.
         """
-        print("Recording splits...")
+        print("Recording data splits for train, validation, and test sets.")
         split_assignment_list: List[pd.DataFrame] = [
             pd.DataFrame({"id": self._train_df["proteins"], "split": "train"}),
             pd.DataFrame(
@@ -112,38 +117,6 @@ def _record_splits(self) -> pd.DataFrame:
         combined_split_assignment = pd.concat(split_assignment_list, ignore_index=True)
         return combined_split_assignment
 
-    def migrate(self) -> None:
-        """
-        Executes the data migration by loading, processing, and saving the data.
-        """
-        print("Migration started......")
-        self._load_data()
-        if not all(
-            df is not None
-            for df in [
-                self._train_df,
-                self._validation_df,
-                self._test_df,
-                self._terms_df,
-            ]
-        ):
-            raise Exception(
-                "Data splits or terms data is not available in instance variables."
-            )
-        splits_df = self._record_splits()
-
-        data_df = self._extract_required_data_from_splits()
-        data_with_labels_df = self._generate_labels(data_df)
-
-        if not all(
-            var is not None for var in [data_with_labels_df, splits_df, self._classes]
-        ):
-            raise Exception(
-                "Data splits or terms data is not available in instance variables."
-            )
-
-        self.save_migrated_data(data_df, splits_df)
-
     def _extract_required_data_from_splits(self) -> pd.DataFrame:
         """
         Extracts required columns from the combined data splits.
@@ -186,6 +159,19 @@ def _extract_required_data_from_splits(self) -> pd.DataFrame:
         )
         return data_df
 
+    @staticmethod
+    def extract_go_id(go_list: List[str]) -> List[int]:
+        """
+        Parses every GO annotation string in a list into its GO ID.
+
+        Args:
+            go_list (List[str]): GO annotation strings to parse.
+
+        Returns:
+            List[int]: One parsed ID per input string, in the same order.
+        """
+        return list(map(DeepGO2MigratedData._parse_go_id, go_list))
+
     def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame:
         """
         Generates label columns for each GO term in the dataset.
@@ -198,7 +184,7 @@ def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame:
         """
         print("Generating labels based on terms.pkl file.......")
         parsed_go_ids: pd.Series = self._terms_df["gos"].apply(
-            lambda gos: _GOUniProtDataExtractor._parse_go_id(gos)
+            lambda gos: DeepGO2MigratedData._parse_go_id(gos)
         )
         all_go_ids_list = parsed_go_ids.values.tolist()
         self._classes = all_go_ids_list
@@ -215,21 +201,6 @@ def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame:
         data_df = data_df[data_df.iloc[:, self._LABELS_START_IDX :].any(axis=1)]
         return data_df
 
-    @staticmethod
-    def extract_go_id(go_list: List[str]) -> List[int]:
-        """
-        Extracts and parses GO IDs from a list of GO annotations.
-
-        Args:
-            go_list (List[str]): List of GO annotation strings.
-
-        Returns:
-            List[str]: List of parsed GO IDs.
-        """
-        return [
-            _GOUniProtDataExtractor._parse_go_id(go_id_str) for go_id_str in go_list
-        ]
-
     def save_migrated_data(
         self, data_df: pd.DataFrame, splits_df: pd.DataFrame
     ) -> None:
@@ -241,29 +212,35 @@ def save_migrated_data(
             splits_df (pd.DataFrame): Split assignment DataFrame.
         """
         print("Saving transformed data......")
-        go_class_instance: _GOUniProtDataExtractor = self._CORRESPONDING_GO_CLASSES[
-            self._go_branch
-        ](go_branch=self._go_branch.upper(), max_sequence_length=self._MAXLEN)
+        deepgo_migr_inst: DeepGO2MigratedData = DeepGO2MigratedData(
+            go_branch=DeepGO2MigratedData.GO_BRANCH_MAPPING[self._go_branch],
+            max_sequence_length=self._MAXLEN,
+        )
 
-        go_class_instance.save_processed(
-            data_df, go_class_instance.processed_main_file_names_dict["data"]
+        # Save data file
+        deepgo_migr_inst.save_processed(
+            data_df, deepgo_migr_inst.processed_main_file_names_dict["data"]
         )
         print(
-            f"{go_class_instance.processed_main_file_names_dict['data']} saved to {go_class_instance.processed_dir_main}"
+            f"{deepgo_migr_inst.processed_main_file_names_dict['data']} saved to {deepgo_migr_inst.processed_dir_main}"
         )
 
+        # Save split file
         splits_df.to_csv(
-            os.path.join(go_class_instance.processed_dir_main, "splits.csv"),
+            os.path.join(deepgo_migr_inst.processed_dir_main, "splits_deep_go2.csv"),
             index=False,
         )
-        print(f"splits.csv saved to {go_class_instance.processed_dir_main}")
+        print(f"splits_deep_go2.csv saved to {deepgo_migr_inst.processed_dir_main}")
 
+        # Save classes.txt file
         classes = sorted(self._classes)
         with open(
-            os.path.join(go_class_instance.processed_dir_main, "classes.txt"), "wt"
+            os.path.join(deepgo_migr_inst.processed_dir_main, "classes_deep_go2.txt"),
+            "wt",
         ) as fout:
             fout.writelines(str(node) + "\n" for node in classes)
-        print(f"classes.txt saved to {go_class_instance.processed_dir_main}")
+        print(f"classes_deep_go2.txt saved to {deepgo_migr_inst.processed_dir_main}")
+
         print("Migration completed!")