|
22 | 22 | "GOUniProtOver50", |
23 | 23 | "EXPERIMENTAL_EVIDENCE_CODES", |
24 | 24 | "AMBIGUOUS_AMINO_ACIDS", |
| 25 | + "DeepGO1MigratedData", |
| 26 | + "DeepGO2MigratedData", |
25 | 27 | ] |
26 | 28 |
|
27 | 29 | import gzip |
@@ -731,3 +733,187 @@ class GOUniProtOver50(_GOUniProtOverX): |
731 | 733 | """ |
732 | 734 |
|
733 | 735 | THRESHOLD: int = 50 |
| 736 | + |
| 737 | + |
class _DeepGOMigratedData(_GOUniProtDataExtractor, ABC):
    """
    Common base for datasets built from data migrated out of DeepGO.

    Provides the shared dataset-name formatting, the check that the migrated
    data file is present, and the file-name properties every concrete DeepGO
    variant has to define.

    Attributes:
        READER (dr.ProteinDataReader): Protein data reader class.
        THRESHOLD (Optional[int]): Threshold value for GO class selection.
            ``None`` here; derived classes may assign a value.
        GO_BRANCH_MAPPING (dict): Lower-case GO branch labels as used in DeepGO,
            mapped to the upper-case labels used by our conventions.
    """

    READER: dr.ProteinDataReader = dr.ProteinDataReader
    THRESHOLD: Optional[int] = None

    # DeepGO branch label -> branch label in our conventions
    GO_BRANCH_MAPPING: dict = {"cc": "CC", "mf": "MF", "bp": "BP"}

    @property
    def _name(self) -> str:
        """
        Build the identifier of this migrated dataset.

        The name consists of an optional ``GO<THRESHOLD>_`` prefix (present only
        when ``THRESHOLD`` is not ``None``), an optional ``<go_branch>_`` part
        (present only when ``go_branch`` differs from ``_ALL_GO_BRANCHES``) and
        the max sequence length.

        Returns:
            str: A formatted name string for the data.
        """
        prefix = "" if self.THRESHOLD is None else f"GO{self.THRESHOLD}_"
        branch_part = (
            f"{self.go_branch}_" if self.go_branch != self._ALL_GO_BRANCHES else ""
        )
        return f"{prefix}{branch_part}{self.max_sequence_length}"

    # ------------------------------ Phase: Prepare data -----------------------------------
    def prepare_data(self, *args: Any, **kwargs: Any) -> None:
        """
        Verify that the migrated DeepGO data file is present.

        Nothing is generated here: the method only looks for the main data file
        inside ``processed_dir_main`` and fails with a hint to run the migration
        script when the file is absent.

        Args:
            *args (Any): Additional positional arguments (unused).
            **kwargs (Any): Additional keyword arguments (unused).

        Raises:
            FileNotFoundError: If the processed data file does not exist.
        """
        print("Checking for processed data in", self.processed_dir_main)

        data_file = self.processed_main_file_names_dict["data"]
        data_path = os.path.join(self.processed_dir_main, data_file)
        if os.path.isfile(data_path):
            return

        raise FileNotFoundError(
            f"File {data_file} not found.\n"
            "You must run the appropriate DeepGO migration script "
            "(chebai/preprocessing/migration/deep_go) before executing this configuration "
            "to migrate data from DeepGO to this data structure."
        )

    def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List:
        """
        Do nothing: selection of GO classes is not needed for migrated data.

        Args:
            g (nx.DiGraph): Graph argument; not used.
            *args: Additional positional arguments (unused).
            **kwargs: Additional keyword arguments (unused).

        Returns:
            Despite the ``List`` annotation, this implementation returns ``None``.
        """
        return None

    # ------------------------------ Phase: Raw Properties -----------------------------------
    @property
    @abstractmethod
    def processed_main_file_names_dict(self) -> Dict[str, str]:
        """
        Names of the main processed files; must be supplied by subclasses.

        These files are stored in the same directory as the generated data files
        but have distinct names to differentiate them during training.

        Returns:
            dict: A dictionary with key-value pairs for main processed file names.
        """
        ...

    @property
    @abstractmethod
    def processed_file_names_dict(self) -> Dict[str, str]:
        """
        Names of the additional processed files; must be supplied by subclasses.

        These files are stored in the same directory as the generated data files
        but have distinct names to differentiate them during training.

        Returns:
            dict: A dictionary with key-value pairs for processed file names.
        """
        ...
| 829 | + |
| 830 | + |
class DeepGO1MigratedData(_DeepGOMigratedData):
    """
    Migrated data class specific to DeepGO1.

    Sets the GO class threshold from the GO branch, as per the DeepGO1 paper
    and its data: 50 for "CC" and "MF", 250 for "BP".

    Note:
        Refer reference number 1 at the top of this file for the corresponding research paper.

    Args:
        **kwargs: Arbitrary keyword arguments passed to the superclass.
            ``max_sequence_length`` must be given and equal 1002;
            ``go_branch`` must be one of "CC", "MF" or "BP".

    Raises:
        ValueError: If ``max_sequence_length`` is missing or is not 1002, or
            if an unsupported GO branch is provided.
    """

    def __init__(self, **kwargs):
        # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
        # NOTE(review): the link above points at the DeepGO2 repository; confirm
        # the source of the 1002 value used for DeepGO1.
        max_sequence_length = kwargs.get("max_sequence_length")
        # Explicit check instead of `assert`: asserts are removed under `-O`,
        # and a missing value would otherwise surface as an opaque TypeError.
        if max_sequence_length is None or int(max_sequence_length) != 1002:
            raise ValueError(
                "DeepGO1 migrated data requires max_sequence_length=1002, "
                f"got {max_sequence_length!r}"
            )

        # Set threshold based on GO branch, as per DeepGO1 paper and its data.
        # The threshold is assigned before the base initializer is invoked,
        # preserving the original ordering.
        go_branch = kwargs.get("go_branch")
        if go_branch in ["CC", "MF"]:
            self.THRESHOLD = 50
        elif go_branch == "BP":
            self.THRESHOLD = 250
        else:
            # Report the value from kwargs: `self.go_branch` is not assigned in
            # this class before the base initializer has run.
            raise ValueError(
                f"DeepGO1 paper has no defined threshold for branch {go_branch}"
            )

        # Plain super() follows the regular MRO and does not skip the direct
        # parent class.
        super().__init__(**kwargs)

    @property
    def processed_main_file_names_dict(self) -> Dict[str, str]:
        """
        Returns main processed file names specific to DeepGO1.

        Returns:
            dict: Dictionary with the main data file name for DeepGO1.
        """
        return {"data": "data_deep_go1.pkl"}

    @property
    def processed_file_names_dict(self) -> Dict[str, str]:
        """
        Returns processed file names specific to DeepGO1.

        Returns:
            dict: Dictionary with data file name for DeepGO1.
        """
        return {"data": "data_deep_go1.pt"}
| 881 | + |
| 882 | + |
class DeepGO2MigratedData(_DeepGOMigratedData):
    """
    Migrated data class specific to DeepGO2.

    Derives from the shared DeepGO migration base class. No GO class threshold
    is set here (``THRESHOLD`` keeps the base-class value ``None``), and the
    processed file names are specific to DeepGO2.

    Note:
        Refer reference number 3 at the top of this file for the corresponding research paper.

    Args:
        **kwargs: Arbitrary keyword arguments passed to the superclass.
            ``max_sequence_length`` must be given and equal 1000.

    Raises:
        ValueError: If ``max_sequence_length`` is missing or is not 1000.
    """

    def __init__(self, **kwargs):
        # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
        max_sequence_length = kwargs.get("max_sequence_length")
        # Explicit check instead of `assert`: asserts are removed under `-O`,
        # and a missing value would otherwise surface as an opaque TypeError.
        if max_sequence_length is None or int(max_sequence_length) != 1000:
            raise ValueError(
                "DeepGO2 migrated data requires max_sequence_length=1000, "
                f"got {max_sequence_length!r}"
            )

        # Plain super() follows the regular MRO and does not skip the direct
        # parent class.
        super().__init__(**kwargs)

    @property
    def processed_main_file_names_dict(self) -> Dict[str, str]:
        """
        Returns main processed file names specific to DeepGO2.

        Returns:
            dict: Dictionary with the main data file name for DeepGO2.
        """
        return {"data": "data_deep_go2.pkl"}

    @property
    def processed_file_names_dict(self) -> Dict[str, str]:
        """
        Returns processed file names specific to DeepGO2.

        Returns:
            dict: Dictionary with data file name for DeepGO2.
        """
        return {"data": "data_deep_go2.pt"}
0 commit comments