add classes to use migrated deepgo data

aditya0by0 · aditya0by0 · commit 14db9d641a8b · 2024-11-13T23:00:18.000+01:00
diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
@@ -22,6 +22,8 @@
     "GOUniProtOver50",
     "EXPERIMENTAL_EVIDENCE_CODES",
     "AMBIGUOUS_AMINO_ACIDS",
+    "DeepGO1MigratedData",
+    "DeepGO2MigratedData",
 ]
 
 import gzip
@@ -731,3 +733,187 @@ class GOUniProtOver50(_GOUniProtOverX):
     """
 
     THRESHOLD: int = 50
+
+
+class _DeepGOMigratedData(_GOUniProtDataExtractor, ABC):
+    """
+    Shared base for datasets built from migrated DeepGO data: common class
+    attributes, dataset naming, and the check that the migrated file exists.
+
+    Attributes:
+        READER (dr.ProteinDataReader): Protein data reader class.
+        THRESHOLD (Optional[int]): Threshold value for GO class selection;
+            ``None`` here, derived classes may set it per GO branch.
+    """
+
+    READER: dr.ProteinDataReader = dr.ProteinDataReader
+    THRESHOLD: Optional[int] = None
+
+    # DeepGO spells GO branches in lower case; map them to our upper-case names.
+    GO_BRANCH_MAPPING: dict = {"cc": "CC", "mf": "MF", "bp": "BP"}
+
+    @property
+    def _name(self) -> str:
+        """
+        Build the identifier of this migrated dataset.
+
+        The name is ``[GO<THRESHOLD>_][<go_branch>_]<max_sequence_length>``: the
+        threshold part is present only when ``THRESHOLD`` is set, and the branch
+        part only when ``go_branch`` differs from ``_ALL_GO_BRANCHES``.
+
+        Returns:
+            str: A formatted name string for the data.
+        """
+        prefix = "" if self.THRESHOLD is None else f"GO{self.THRESHOLD}_"
+        if self.go_branch == self._ALL_GO_BRANCHES:
+            return f"{prefix}{self.max_sequence_length}"
+
+        return f"{prefix}{self.go_branch}_{self.max_sequence_length}"
+
+    # ------------------------------ Phase: Prepare data -----------------------------------
+    def prepare_data(self, *args: Any, **kwargs: Any) -> None:
+        """
+        Verify that the migrated DeepGO data file exists; nothing is generated here.
+
+        Args:
+            *args (Any): Additional positional arguments (unused).
+            **kwargs (Any): Additional keyword arguments (unused).
+
+        Raises:
+            FileNotFoundError: If the processed data file does not exist.
+        """
+        print("Checking for processed data in", self.processed_dir_main)
+
+        data_file = self.processed_main_file_names_dict["data"]
+        if os.path.isfile(os.path.join(self.processed_dir_main, data_file)):
+            return
+
+        raise FileNotFoundError(
+            f"File {data_file} not found.\n"
+            "You must run the appropriate DeepGO migration script "
+            "(chebai/preprocessing/migration/deep_go) before executing this configuration "
+            "to migrate data from DeepGO to this data structure."
+        )
+
+    def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List:
+        # No GO class selection is needed for migrated data; this is a deliberate
+        # no-op that returns None despite the List annotation.
+        return None
+
+    # ------------------------------ Phase: Raw Properties -----------------------------------
+    @property
+    @abstractmethod
+    def processed_main_file_names_dict(self) -> Dict[str, str]:
+        """
+        Names of the main processed files; must be provided by subclasses.
+        These files are stored in the same directory as the generated data files
+        but have distinct names to differentiate them during training.
+
+        Returns:
+            dict: A dictionary with key-value pairs for main processed file names.
+        """
+        ...
+
+    @property
+    @abstractmethod
+    def processed_file_names_dict(self) -> Dict[str, str]:
+        """
+        Names of the additional processed files; must be provided by subclasses.
+        These files are stored in the same directory as the generated data files
+        but have distinct names to differentiate them during training.
+
+        Returns:
+            dict: A dictionary with key-value pairs for processed file names.
+        """
+        ...
+
+
+class DeepGO1MigratedData(_DeepGOMigratedData):
+    """
+    Migrated data class specific to DeepGO1. Sets threshold values according
+    to the research paper based on the GO branch.
+
+    Note:
+        Refer reference number 1 at the top of this file for the corresponding research paper.
+
+    Args:
+        **kwargs: Keyword arguments passed to the superclass; ``go_branch`` and
+            ``max_sequence_length`` are validated here first.
+
+    Raises:
+        ValueError: If ``max_sequence_length`` is missing or not 1002, or the GO branch is unsupported.
+    """
+
+    def __init__(self, **kwargs):
+        # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
+        # Raise explicitly instead of asserting: asserts vanish under ``python -O``.
+        max_len = kwargs.get("max_sequence_length")
+        if max_len is None or int(max_len) != 1002:
+            raise ValueError(f"DeepGO1 expects max_sequence_length=1002, got {max_len}")
+
+        # Set threshold based on GO branch, as per DeepGO1 paper and its data.
+        # Read the branch from kwargs: self.go_branch is presumably only set by super().__init__().
+        go_branch = kwargs.get("go_branch")
+        if go_branch in ["CC", "MF"]:
+            self.THRESHOLD = 50
+        elif go_branch == "BP":
+            self.THRESHOLD = 250
+        else:
+            raise ValueError(
+                f"DeepGO1 paper has no defined threshold for branch {go_branch}"
+            )
+
+        super().__init__(**kwargs)
+
+    @property
+    def processed_main_file_names_dict(self) -> Dict[str, str]:
+        """
+        Main processed file names for DeepGO1, as a dict with the ``data`` file name.
+        """
+        return {"data": "data_deep_go1.pkl"}
+
+    @property
+    def processed_file_names_dict(self) -> Dict[str, str]:
+        """
+        Processed file names for DeepGO1, as a dict with the ``data`` file name.
+        """
+        return {"data": "data_deep_go1.pt"}
+
+
+class DeepGO2MigratedData(_DeepGOMigratedData):
+    """
+    Migrated data class specific to DeepGO2. Unlike DeepGO1MigratedData, it sets
+    no GO-branch threshold and uses its own processed file names.
+
+    Note:
+        Refer reference number 3 at the top of this file for the corresponding research paper.
+
+    Args:
+        **kwargs: Keyword arguments passed to the superclass.
+
+    Raises:
+        ValueError: If ``max_sequence_length`` is missing or not equal to 1000.
+    """
+
+    def __init__(self, **kwargs):
+        # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
+        # Raise explicitly instead of asserting: asserts vanish under ``python -O``.
+        max_len = kwargs.get("max_sequence_length")
+        if max_len is None or int(max_len) != 1000:
+            raise ValueError(f"DeepGO2 expects max_sequence_length=1000, got {max_len}")
+
+        super().__init__(**kwargs)
+
+    @property
+    def processed_main_file_names_dict(self) -> Dict[str, str]:
+        """
+        Main processed file names for DeepGO2, as a dict with the ``data`` file name.
+        """
+        return {"data": "data_deep_go2.pkl"}
+
+    @property
+    def processed_file_names_dict(self) -> Dict[str, str]:
+        """
+        Processed file names for DeepGO2, as a dict with the ``data`` file name.
+        """
+        return {"data": "data_deep_go2.pt"}