Skip to content

Commit 093be28

Browse files
committed
migration script update
1 parent e0a8524 commit 093be28

File tree

2 files changed

+71
-94
lines changed

2 files changed

+71
-94
lines changed

chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class DeepGo1DataMigration:
1414
A class to handle data migration and processing for the DeepGO project.
1515
It migrates the DeepGO data to the data structure we use for GO-UniProt data.
1616
17-
This class handles data from the DeepGO model as described in:
17+
This class handles migration of data from the DeepGO paper below:
1818
Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf,
1919
DeepGO: predicting protein functions from sequence and interactions using a deep ontology-aware classifier,
2020
Bioinformatics, Volume 34, Issue 4, February 2018, Pages 660–668

chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py

Lines changed: 70 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -5,61 +5,34 @@
55
import pandas as pd
66
from jsonargparse import CLI
77

8-
from chebai.preprocessing.datasets.go_uniprot import (
9-
GOUniProtOver50,
10-
GOUniProtOver250,
11-
_GOUniProtDataExtractor,
12-
)
8+
from chebai.preprocessing.datasets.go_uniprot import DeepGO2MigratedData
139

1410

1511
class DeepGo2DataMigration:
1612
"""
1713
A class to handle data migration and processing for the DeepGO project. It migrates the data from the DeepGO-SE
1814
data structure to the data structure we use for GO-UniProt data.
1915
20-
It migrates the data of DeepGO model of the below research paper:
16+
This class handles migration of data from the DeepGO paper below:
2117
Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf,
2218
DeepGO: predicting protein functions from sequence and interactions using a deep ontology-aware classifier,
2319
Bioinformatics, Volume 34, Issue 4, February 2018, Pages 660–668
24-
(https://doi.org/10.1093/bioinformatics/btx624),
25-
26-
Attributes:
27-
_CORRESPONDING_GO_CLASSES (dict): Mapping of GO branches to specific data extractor classes.
28-
_MAXLEN (int): Maximum sequence length for sequences.
29-
_LABELS_START_IDX (int): Starting index for labels in the dataset.
30-
31-
Methods:
32-
__init__(data_dir, go_branch): Initializes the data directory and GO branch.
33-
_load_data(): Loads train, validation, test, and terms data from the specified directory.
34-
_record_splits(): Creates a DataFrame with IDs and their corresponding split.
35-
migrate(): Executes the migration process including data loading, processing, and saving.
36-
_extract_required_data_from_splits(): Extracts required columns from the splits data.
37-
_generate_labels(data_df): Generates label columns for the data based on GO terms.
38-
extract_go_id(go_list): Extracts GO IDs from a list.
39-
save_migrated_data(data_df, splits_df): Saves the processed data and splits.
20+
(https://doi.org/10.1093/bioinformatics/btx624)
4021
"""
4122

42-
# Link for the namespaces convention used for GO branch
43-
# https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/utils.py#L18-L22
44-
_CORRESPONDING_GO_CLASSES = {
45-
"cc": GOUniProtOver50,
46-
"mf": GOUniProtOver50,
47-
"bp": GOUniProtOver250,
48-
}
49-
5023
# https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
5124
_MAXLEN = 1000
52-
_LABELS_START_IDX = _GOUniProtDataExtractor._LABELS_START_IDX
25+
_LABELS_START_IDX = DeepGO2MigratedData._LABELS_START_IDX
5326

5427
def __init__(self, data_dir: str, go_branch: Literal["cc", "mf", "bp"]):
5528
"""
5629
Initializes the data migration object with a data directory and GO branch.
5730
5831
Args:
5932
data_dir (str): Directory containing the data files.
60-
go_branch (Literal["cc", "mf", "bp"]): GO branch to use (cellular_component, molecular_function, or biological_process).
33+
go_branch (Literal["cc", "mf", "bp"]): GO branch to use.
6134
"""
62-
valid_go_branches = list(self._CORRESPONDING_GO_CLASSES.keys())
35+
valid_go_branches = list(DeepGO2MigratedData.GO_BRANCH_MAPPING.keys())
6336
if go_branch not in valid_go_branches:
6437
raise ValueError(f"go_branch must be one of {valid_go_branches}")
6538
self._go_branch = go_branch
@@ -71,13 +44,45 @@ def __init__(self, data_dir: str, go_branch: Literal["cc", "mf", "bp"]):
7144
self._terms_df: Optional[pd.DataFrame] = None
7245
self._classes: Optional[List[str]] = None
7346

47+
def migrate(self) -> None:
    """
    Executes the data migration by loading, processing, and saving the data.

    The steps run in order: load the pickled splits and terms, record which
    split every protein id belongs to, extract the required columns, generate
    the label columns, and finally save the labelled data with the splits.

    Raises:
        Exception: If the splits/terms data is missing after loading, or if
            the labelled data, split assignments, or classes are missing
            after processing.
    """
    print("Starting the migration process...")
    self._load_data()

    loaded_frames = [
        self._train_df,
        self._validation_df,
        self._test_df,
        self._terms_df,
    ]
    if any(df is None for df in loaded_frames):
        raise Exception(
            "Data splits or terms data is not available in instance variables."
        )
    splits_df = self._record_splits()

    data_df = self._extract_required_data_from_splits()
    data_with_labels_df = self._generate_labels(data_df)

    processed_results = [data_with_labels_df, splits_df, self._classes]
    if any(result is None for result in processed_results):
        # This guard checks the processing outputs, not the loaded inputs,
        # so it reports a message that matches what is actually missing.
        raise Exception(
            "Labelled data, split assignments, or classes were not generated."
        )

    # _generate_labels returns a new, filtered frame, so the returned frame
    # (not the frame passed in) is the one that must be persisted.
    self.save_migrated_data(data_with_labels_df, splits_df)
78+
7479
def _load_data(self) -> None:
7580
"""
7681
Loads the test, train, validation, and terms data from the pickled files
7782
in the data directory.
7883
"""
7984
try:
80-
print(f"Loading data from {self._data_dir}......")
85+
print(f"Loading data from directory: {self._data_dir}......")
8186
self._test_df = pd.DataFrame(
8287
pd.read_pickle(os.path.join(self._data_dir, "test_data.pkl"))
8388
)
@@ -100,7 +105,7 @@ def _record_splits(self) -> pd.DataFrame:
100105
Returns:
101106
pd.DataFrame: A combined DataFrame containing split assignments.
102107
"""
103-
print("Recording splits...")
108+
print("Recording data splits for train, validation, and test sets.")
104109
split_assignment_list: List[pd.DataFrame] = [
105110
pd.DataFrame({"id": self._train_df["proteins"], "split": "train"}),
106111
pd.DataFrame(
@@ -112,38 +117,6 @@ def _record_splits(self) -> pd.DataFrame:
112117
combined_split_assignment = pd.concat(split_assignment_list, ignore_index=True)
113118
return combined_split_assignment
114119

115-
def migrate(self) -> None:
116-
"""
117-
Executes the data migration by loading, processing, and saving the data.
118-
"""
119-
print("Migration started......")
120-
self._load_data()
121-
if not all(
122-
df is not None
123-
for df in [
124-
self._train_df,
125-
self._validation_df,
126-
self._test_df,
127-
self._terms_df,
128-
]
129-
):
130-
raise Exception(
131-
"Data splits or terms data is not available in instance variables."
132-
)
133-
splits_df = self._record_splits()
134-
135-
data_df = self._extract_required_data_from_splits()
136-
data_with_labels_df = self._generate_labels(data_df)
137-
138-
if not all(
139-
var is not None for var in [data_with_labels_df, splits_df, self._classes]
140-
):
141-
raise Exception(
142-
"Data splits or terms data is not available in instance variables."
143-
)
144-
145-
self.save_migrated_data(data_df, splits_df)
146-
147120
def _extract_required_data_from_splits(self) -> pd.DataFrame:
148121
"""
149122
Extracts required columns from the combined data splits.
@@ -186,6 +159,19 @@ def _extract_required_data_from_splits(self) -> pd.DataFrame:
186159
)
187160
return data_df
188161

162+
@staticmethod
def extract_go_id(go_list: List[str]) -> List[int]:
    """
    Parses every GO annotation string in a list into its GO ID.

    Each element is passed to `DeepGO2MigratedData._parse_go_id`; the results
    are returned in the same order as the input.

    Args:
        go_list (List[str]): List of GO annotation strings.

    Returns:
        List[int]: List of parsed GO IDs.
    """
    # NOTE(review): the element type follows this method's return annotation;
    # `_parse_go_id` is defined outside this file, so confirm it returns int.
    return [DeepGO2MigratedData._parse_go_id(go_id_str) for go_id_str in go_list]
174+
189175
def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame:
190176
"""
191177
Generates label columns for each GO term in the dataset.
@@ -198,7 +184,7 @@ def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame:
198184
"""
199185
print("Generating labels based on terms.pkl file.......")
200186
parsed_go_ids: pd.Series = self._terms_df["gos"].apply(
201-
lambda gos: _GOUniProtDataExtractor._parse_go_id(gos)
187+
lambda gos: DeepGO2MigratedData._parse_go_id(gos)
202188
)
203189
all_go_ids_list = parsed_go_ids.values.tolist()
204190
self._classes = all_go_ids_list
@@ -215,21 +201,6 @@ def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame:
215201
data_df = data_df[data_df.iloc[:, self._LABELS_START_IDX :].any(axis=1)]
216202
return data_df
217203

218-
@staticmethod
219-
def extract_go_id(go_list: List[str]) -> List[int]:
220-
"""
221-
Extracts and parses GO IDs from a list of GO annotations.
222-
223-
Args:
224-
go_list (List[str]): List of GO annotation strings.
225-
226-
Returns:
227-
List[str]: List of parsed GO IDs.
228-
"""
229-
return [
230-
_GOUniProtDataExtractor._parse_go_id(go_id_str) for go_id_str in go_list
231-
]
232-
233204
def save_migrated_data(
234205
self, data_df: pd.DataFrame, splits_df: pd.DataFrame
235206
) -> None:
@@ -241,29 +212,35 @@ def save_migrated_data(
241212
splits_df (pd.DataFrame): Split assignment DataFrame.
242213
"""
243214
print("Saving transformed data......")
244-
go_class_instance: _GOUniProtDataExtractor = self._CORRESPONDING_GO_CLASSES[
245-
self._go_branch
246-
](go_branch=self._go_branch.upper(), max_sequence_length=self._MAXLEN)
215+
deepgo_migr_inst: DeepGO2MigratedData = DeepGO2MigratedData(
216+
go_branch=DeepGO2MigratedData.GO_BRANCH_MAPPING[self._go_branch],
217+
max_sequence_length=self._MAXLEN,
218+
)
247219

248-
go_class_instance.save_processed(
249-
data_df, go_class_instance.processed_main_file_names_dict["data"]
220+
# Save data file
221+
deepgo_migr_inst.save_processed(
222+
data_df, deepgo_migr_inst.processed_main_file_names_dict["data"]
250223
)
251224
print(
252-
f"{go_class_instance.processed_main_file_names_dict['data']} saved to {go_class_instance.processed_dir_main}"
225+
f"{deepgo_migr_inst.processed_main_file_names_dict['data']} saved to {deepgo_migr_inst.processed_dir_main}"
253226
)
254227

228+
# Save split file
255229
splits_df.to_csv(
256-
os.path.join(go_class_instance.processed_dir_main, "splits.csv"),
230+
os.path.join(deepgo_migr_inst.processed_dir_main, "splits_deep_go2.csv"),
257231
index=False,
258232
)
259-
print(f"splits.csv saved to {go_class_instance.processed_dir_main}")
233+
print(f"splits_deep_go2.csv saved to {deepgo_migr_inst.processed_dir_main}")
260234

235+
# Save classes.txt file
261236
classes = sorted(self._classes)
262237
with open(
263-
os.path.join(go_class_instance.processed_dir_main, "classes.txt"), "wt"
238+
os.path.join(deepgo_migr_inst.processed_dir_main, "classes_deep_go2.txt"),
239+
"wt",
264240
) as fout:
265241
fout.writelines(str(node) + "\n" for node in classes)
266-
print(f"classes.txt saved to {go_class_instance.processed_dir_main}")
242+
print(f"classes_deep_go2.txt saved to {deepgo_migr_inst.processed_dir_main}")
243+
267244
print("Migration completed!")
268245

269246

0 commit comments

Comments
 (0)