55import pandas as pd
66from jsonargparse import CLI
77
8- from chebai .preprocessing .datasets .go_uniprot import (
9- GOUniProtOver50 ,
10- GOUniProtOver250 ,
11- _GOUniProtDataExtractor ,
12- )
8+ from chebai .preprocessing .datasets .go_uniprot import DeepGO2MigratedData
139
1410
1511class DeepGo2DataMigration :
1612 """
1713 A class to handle data migration and processing for the DeepGO project. It migrates the data from the DeepGO-SE
1814 data structure to the data structure that we use for GO-UniProt data.
1915
20- It migrates the data of DeepGO model of the below research paper:
16+ This class handles migration of data from the DeepGO paper below :
2117 Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf,
2218 DeepGO: predicting protein functions from sequence and interactions using a deep ontology-aware classifier,
2319 Bioinformatics, Volume 34, Issue 4, February 2018, Pages 660–668
24- (https://doi.org/10.1093/bioinformatics/btx624),
25-
26- Attributes:
27- _CORRESPONDING_GO_CLASSES (dict): Mapping of GO branches to specific data extractor classes.
28- _MAXLEN (int): Maximum sequence length for sequences.
29- _LABELS_START_IDX (int): Starting index for labels in the dataset.
30-
31- Methods:
32- __init__(data_dir, go_branch): Initializes the data directory and GO branch.
33- _load_data(): Loads train, validation, test, and terms data from the specified directory.
34- _record_splits(): Creates a DataFrame with IDs and their corresponding split.
35- migrate(): Executes the migration process including data loading, processing, and saving.
36- _extract_required_data_from_splits(): Extracts required columns from the splits data.
37- _generate_labels(data_df): Generates label columns for the data based on GO terms.
38- extract_go_id(go_list): Extracts GO IDs from a list.
39- save_migrated_data(data_df, splits_df): Saves the processed data and splits.
20+ (https://doi.org/10.1093/bioinformatics/btx624)
4021 """
4122
42- # Link for the namespaces convention used for GO branch
43- # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/utils.py#L18-L22
44- _CORRESPONDING_GO_CLASSES = {
45- "cc" : GOUniProtOver50 ,
46- "mf" : GOUniProtOver50 ,
47- "bp" : GOUniProtOver250 ,
48- }
49-
5023 # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
5124 _MAXLEN = 1000
52- _LABELS_START_IDX = _GOUniProtDataExtractor ._LABELS_START_IDX
25+ _LABELS_START_IDX = DeepGO2MigratedData ._LABELS_START_IDX
5326
5427 def __init__ (self , data_dir : str , go_branch : Literal ["cc" , "mf" , "bp" ]):
5528 """
5629 Initializes the data migration object with a data directory and GO branch.
5730
5831 Args:
5932 data_dir (str): Directory containing the data files.
60- go_branch (Literal["cc", "mf", "bp"]): GO branch to use (cellular_component, molecular_function, or biological_process) .
33+ go_branch (Literal["cc", "mf", "bp"]): GO branch to use.
6134 """
62- valid_go_branches = list (self . _CORRESPONDING_GO_CLASSES .keys ())
35+ valid_go_branches = list (DeepGO2MigratedData . GO_BRANCH_MAPPING .keys ())
6336 if go_branch not in valid_go_branches :
6437 raise ValueError (f"go_branch must be one of { valid_go_branches } " )
6538 self ._go_branch = go_branch
@@ -71,13 +44,45 @@ def __init__(self, data_dir: str, go_branch: Literal["cc", "mf", "bp"]):
7144 self ._terms_df : Optional [pd .DataFrame ] = None
7245 self ._classes : Optional [List [str ]] = None
7346
47+ def migrate (self ) -> None :
48+ """
49+ Executes the data migration by loading, processing, and saving the data.
50+ """
51+ print ("Starting the migration process..." )
52+ self ._load_data ()
53+ if not all (
54+ df is not None
55+ for df in [
56+ self ._train_df ,
57+ self ._validation_df ,
58+ self ._test_df ,
59+ self ._terms_df ,
60+ ]
61+ ):
62+ raise Exception (
63+ "Data splits or terms data is not available in instance variables."
64+ )
65+ splits_df = self ._record_splits ()
66+
67+ data_df = self ._extract_required_data_from_splits ()
68+ data_with_labels_df = self ._generate_labels (data_df )
69+
70+ if not all (
71+ var is not None for var in [data_with_labels_df , splits_df , self ._classes ]
72+ ):
73+ raise Exception (
74+ "Data splits or terms data is not available in instance variables."
75+ )
76+
77+ self .save_migrated_data (data_df , splits_df )
78+
7479 def _load_data (self ) -> None :
7580 """
7681 Loads the test, train, validation, and terms data from the pickled files
7782 in the data directory.
7883 """
7984 try :
80- print (f"Loading data from { self ._data_dir } ......" )
85+ print (f"Loading data from directory: { self ._data_dir } ......" )
8186 self ._test_df = pd .DataFrame (
8287 pd .read_pickle (os .path .join (self ._data_dir , "test_data.pkl" ))
8388 )
@@ -100,7 +105,7 @@ def _record_splits(self) -> pd.DataFrame:
100105 Returns:
101106 pd.DataFrame: A combined DataFrame containing split assignments.
102107 """
103- print ("Recording splits.. ." )
108+ print ("Recording data splits for train, validation, and test sets ." )
104109 split_assignment_list : List [pd .DataFrame ] = [
105110 pd .DataFrame ({"id" : self ._train_df ["proteins" ], "split" : "train" }),
106111 pd .DataFrame (
@@ -112,38 +117,6 @@ def _record_splits(self) -> pd.DataFrame:
112117 combined_split_assignment = pd .concat (split_assignment_list , ignore_index = True )
113118 return combined_split_assignment
114119
115- def migrate (self ) -> None :
116- """
117- Executes the data migration by loading, processing, and saving the data.
118- """
119- print ("Migration started......" )
120- self ._load_data ()
121- if not all (
122- df is not None
123- for df in [
124- self ._train_df ,
125- self ._validation_df ,
126- self ._test_df ,
127- self ._terms_df ,
128- ]
129- ):
130- raise Exception (
131- "Data splits or terms data is not available in instance variables."
132- )
133- splits_df = self ._record_splits ()
134-
135- data_df = self ._extract_required_data_from_splits ()
136- data_with_labels_df = self ._generate_labels (data_df )
137-
138- if not all (
139- var is not None for var in [data_with_labels_df , splits_df , self ._classes ]
140- ):
141- raise Exception (
142- "Data splits or terms data is not available in instance variables."
143- )
144-
145- self .save_migrated_data (data_df , splits_df )
146-
147120 def _extract_required_data_from_splits (self ) -> pd .DataFrame :
148121 """
149122 Extracts required columns from the combined data splits.
@@ -186,6 +159,19 @@ def _extract_required_data_from_splits(self) -> pd.DataFrame:
186159 )
187160 return data_df
188161
162+ @staticmethod
163+ def extract_go_id (go_list : List [str ]) -> List [int ]:
164+ """
165+ Extracts and parses GO IDs from a list of GO annotations.
166+
167+ Args:
168+ go_list (List[str]): List of GO annotation strings.
169+
170+ Returns:
171+ List[int]: List of parsed GO IDs.
172+ """
173+ return [DeepGO2MigratedData ._parse_go_id (go_id_str ) for go_id_str in go_list ]
174+
189175 def _generate_labels (self , data_df : pd .DataFrame ) -> pd .DataFrame :
190176 """
191177 Generates label columns for each GO term in the dataset.
@@ -198,7 +184,7 @@ def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame:
198184 """
199185 print ("Generating labels based on terms.pkl file......." )
200186 parsed_go_ids : pd .Series = self ._terms_df ["gos" ].apply (
201- lambda gos : _GOUniProtDataExtractor ._parse_go_id (gos )
187+ lambda gos : DeepGO2MigratedData ._parse_go_id (gos )
202188 )
203189 all_go_ids_list = parsed_go_ids .values .tolist ()
204190 self ._classes = all_go_ids_list
@@ -215,21 +201,6 @@ def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame:
215201 data_df = data_df [data_df .iloc [:, self ._LABELS_START_IDX :].any (axis = 1 )]
216202 return data_df
217203
218- @staticmethod
219- def extract_go_id (go_list : List [str ]) -> List [int ]:
220- """
221- Extracts and parses GO IDs from a list of GO annotations.
222-
223- Args:
224- go_list (List[str]): List of GO annotation strings.
225-
226- Returns:
227- List[str]: List of parsed GO IDs.
228- """
229- return [
230- _GOUniProtDataExtractor ._parse_go_id (go_id_str ) for go_id_str in go_list
231- ]
232-
233204 def save_migrated_data (
234205 self , data_df : pd .DataFrame , splits_df : pd .DataFrame
235206 ) -> None :
@@ -241,29 +212,35 @@ def save_migrated_data(
241212 splits_df (pd.DataFrame): Split assignment DataFrame.
242213 """
243214 print ("Saving transformed data......" )
244- go_class_instance : _GOUniProtDataExtractor = self ._CORRESPONDING_GO_CLASSES [
245- self ._go_branch
246- ](go_branch = self ._go_branch .upper (), max_sequence_length = self ._MAXLEN )
215+ deepgo_migr_inst : DeepGO2MigratedData = DeepGO2MigratedData (
216+ go_branch = DeepGO2MigratedData .GO_BRANCH_MAPPING [self ._go_branch ],
217+ max_sequence_length = self ._MAXLEN ,
218+ )
247219
248- go_class_instance .save_processed (
249- data_df , go_class_instance .processed_main_file_names_dict ["data" ]
220+ # Save data file
221+ deepgo_migr_inst .save_processed (
222+ data_df , deepgo_migr_inst .processed_main_file_names_dict ["data" ]
250223 )
251224 print (
252- f"{ go_class_instance .processed_main_file_names_dict ['data' ]} saved to { go_class_instance .processed_dir_main } "
225+ f"{ deepgo_migr_inst .processed_main_file_names_dict ['data' ]} saved to { deepgo_migr_inst .processed_dir_main } "
253226 )
254227
228+ # Save split file
255229 splits_df .to_csv (
256- os .path .join (go_class_instance .processed_dir_main , "splits .csv" ),
230+ os .path .join (deepgo_migr_inst .processed_dir_main , "splits_deep_go2 .csv" ),
257231 index = False ,
258232 )
259- print (f"splits .csv saved to { go_class_instance .processed_dir_main } " )
233+ print (f"splits_deep_go2 .csv saved to { deepgo_migr_inst .processed_dir_main } " )
260234
235+ # Save classes.txt file
261236 classes = sorted (self ._classes )
262237 with open (
263- os .path .join (go_class_instance .processed_dir_main , "classes.txt" ), "wt"
238+ os .path .join (deepgo_migr_inst .processed_dir_main , "classes_deep_go2.txt" ),
239+ "wt" ,
264240 ) as fout :
265241 fout .writelines (str (node ) + "\n " for node in classes )
266- print (f"classes.txt saved to { go_class_instance .processed_dir_main } " )
242+ print (f"classes_deep_go2.txt saved to { deepgo_migr_inst .processed_dir_main } " )
243+
267244 print ("Migration completed!" )
268245
269246
0 commit comments