Skip to content

Commit 14db9d6

Browse files
committed
add classes to use migrated deepgo data
1 parent 093be28 commit 14db9d6

File tree

1 file changed

+186
-0
lines changed

1 file changed

+186
-0
lines changed

chebai/preprocessing/datasets/go_uniprot.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
"GOUniProtOver50",
2323
"EXPERIMENTAL_EVIDENCE_CODES",
2424
"AMBIGUOUS_AMINO_ACIDS",
25+
"DeepGO1MigratedData",
26+
"DeepGO2MigratedData",
2527
]
2628

2729
import gzip
@@ -731,3 +733,187 @@ class GOUniProtOver50(_GOUniProtOverX):
731733
"""
732734

733735
THRESHOLD: int = 50
736+
737+
738+
class _DeepGOMigratedData(_GOUniProtDataExtractor, ABC):
    """
    Shared base for datasets that consume data migrated from DeepGO.

    Provides the reader, the dataset name formatting and the existence check
    for the migrated file. Concrete subclasses supply the file names.

    Attributes:
        READER (dr.ProteinDataReader): Protein data reader class.
        THRESHOLD (Optional[int]): Threshold value for GO class selection,
            determined by the GO branch type in derived classes.
        GO_BRANCH_MAPPING (dict): Lower-case GO branch keys as used in DeepGO,
            mapped to the upper-case spelling used in this project.
    """

    READER: dr.ProteinDataReader = dr.ProteinDataReader
    THRESHOLD: Optional[int] = None

    # DeepGO branch spelling -> spelling used by this project
    GO_BRANCH_MAPPING: dict = {"cc": "CC", "mf": "MF", "bp": "BP"}

    @property
    def _name(self) -> str:
        """
        Build the identifier of this dataset.

        The identifier always ends with the max sequence length. It is
        prefixed with ``GO<THRESHOLD>_`` when a threshold is set, and contains
        the GO branch unless the branch equals ``_ALL_GO_BRANCHES``.

        Returns:
            str: A formatted name string for the data.
        """
        prefix = "" if self.THRESHOLD is None else f"GO{self.THRESHOLD}_"

        if self.go_branch == self._ALL_GO_BRANCHES:
            # All branches selected: the branch is left out of the name.
            return f"{prefix}{self.max_sequence_length}"

        return f"{prefix}{self.go_branch}_{self.max_sequence_length}"

    # ------------------------------ Phase: Prepare data -----------------------------------
    def prepare_data(self, *args: Any, **kwargs: Any) -> None:
        """
        Verify that the migrated DeepGO data file is present.

        Nothing is generated here: the file named by
        ``processed_main_file_names_dict["data"]`` must already exist in
        ``processed_dir_main``.

        Args:
            *args (Any): Additional positional arguments (unused).
            **kwargs (Any): Additional keyword arguments (unused).

        Raises:
            FileNotFoundError: If the processed data file does not exist.
        """
        print("Checking for processed data in", self.processed_dir_main)

        processed_name = self.processed_main_file_names_dict["data"]
        expected_path = os.path.join(self.processed_dir_main, processed_name)
        if os.path.isfile(expected_path):
            return

        message = (
            f"File {processed_name} not found.\n"
            f"You must run the appropriate DeepGO migration script "
            f"(chebai/preprocessing/migration/deep_go) before executing this configuration "
            f"to migrate data from DeepGO to this data structure."
        )
        raise FileNotFoundError(message)

    def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List:
        """
        Do nothing: GO class selection is not needed for migrated data.

        Note:
            Despite the ``List`` annotation, this implementation returns ``None``.
        """
        return None

    # ------------------------------ Phase: Processed file names -----------------------------------
    @property
    @abstractmethod
    def processed_main_file_names_dict(self) -> Dict[str, str]:
        """
        Abstract property for defining main processed file names.

        These files are stored in the same directory as the generated data
        files but have distinct names to differentiate them during training.

        Returns:
            dict: A dictionary with key-value pairs for main processed file names.
        """
        ...

    @property
    @abstractmethod
    def processed_file_names_dict(self) -> Dict[str, str]:
        """
        Abstract property for defining additional processed file names.

        These files are stored in the same directory as the generated data
        files but have distinct names to differentiate them during training.

        Returns:
            dict: A dictionary with key-value pairs for processed file names.
        """
        ...
829+
830+
831+
class DeepGO1MigratedData(_DeepGOMigratedData):
    """
    Migrated data class specific to DeepGO1.

    Sets the GO class threshold according to the research paper based on the
    GO branch: 50 for "CC" and "MF", 250 for "BP".

    Note:
        See reference number 1 at the top of this file for the corresponding research paper.

    Args:
        **kwargs: Arbitrary keyword arguments passed to the superclass. Must
            include ``max_sequence_length`` (equal to 1002) and ``go_branch``
            (one of "CC", "MF", "BP").

    Raises:
        ValueError: If ``max_sequence_length`` is missing or not 1002, or if
            an unsupported GO branch is provided.
    """

    def __init__(self, **kwargs):
        # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
        # Validate explicitly instead of with `assert`: asserts are stripped
        # under `python -O`, and a missing value would fail with an opaque
        # TypeError from int(None).
        max_sequence_length = kwargs.get("max_sequence_length")
        try:
            length_ok = int(max_sequence_length) == 1002
        except (TypeError, ValueError):
            length_ok = False
        if not length_ok:
            raise ValueError(
                f"DeepGO1 migrated data requires max_sequence_length=1002, "
                f"got {max_sequence_length!r}"
            )

        # Set threshold based on GO branch, as per DeepGO1 paper and its data.
        go_branch = kwargs.get("go_branch")
        if go_branch in ("CC", "MF"):
            self.THRESHOLD = 50
        elif go_branch == "BP":
            self.THRESHOLD = 250
        else:
            # Use the local value: self.go_branch is not set before the
            # superclass initializer has run, so reading it here would raise
            # AttributeError instead of the intended ValueError.
            raise ValueError(
                f"DeepGO1 paper has no defined threshold for branch {go_branch}"
            )

        super().__init__(**kwargs)

    @property
    def processed_main_file_names_dict(self) -> Dict[str, str]:
        """
        Returns main processed file names specific to DeepGO1.

        Returns:
            dict: Dictionary with the main data file name for DeepGO1.
        """
        return {"data": "data_deep_go1.pkl"}

    @property
    def processed_file_names_dict(self) -> Dict[str, str]:
        """
        Returns processed file names specific to DeepGO1.

        Returns:
            dict: Dictionary with data file name for DeepGO1.
        """
        return {"data": "data_deep_go1.pt"}
881+
882+
883+
class DeepGO2MigratedData(_DeepGOMigratedData):
    """
    Migrated data class specific to DeepGO2.

    Differs from the DeepGO1 variant in the required max sequence length
    (1000), in not setting a GO class threshold, and in the processed file
    names.

    Note:
        See reference number 3 at the top of this file for the corresponding research paper.

    Args:
        **kwargs: Arbitrary keyword arguments passed to the superclass. Must
            include ``max_sequence_length`` (equal to 1000).

    Raises:
        ValueError: If ``max_sequence_length`` is missing or not 1000.
    """

    def __init__(self, **kwargs):
        # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
        # Validate explicitly instead of with `assert`: asserts are stripped
        # under `python -O`, and a missing value would fail with an opaque
        # TypeError from int(None).
        max_sequence_length = kwargs.get("max_sequence_length")
        try:
            length_ok = int(max_sequence_length) == 1000
        except (TypeError, ValueError):
            length_ok = False
        if not length_ok:
            raise ValueError(
                f"DeepGO2 migrated data requires max_sequence_length=1000, "
                f"got {max_sequence_length!r}"
            )

        super().__init__(**kwargs)

    @property
    def processed_main_file_names_dict(self) -> Dict[str, str]:
        """
        Returns main processed file names specific to DeepGO2.

        Returns:
            dict: Dictionary with the main data file name for DeepGO2.
        """
        return {"data": "data_deep_go2.pkl"}

    @property
    def processed_file_names_dict(self) -> Dict[str, str]:
        """
        Returns processed file names specific to DeepGO2.

        Returns:
            dict: Dictionary with data file name for DeepGO2.
        """
        return {"data": "data_deep_go2.pt"}

0 commit comments

Comments
 (0)