Skip to content

Commit 58ae92d

Browse files
committed
add base code for deep_go data migration
- migration from deep go format to chebai->go_uniprot format
1 parent 154e827 commit 58ae92d

File tree

1 file changed

+54
-0
lines changed

1 file changed

+54
-0
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from typing import List
2+
3+
import pandas as pd
4+
5+
# Short GO namespace codes mapped to their full namespace names.
# Mirrors the mapping used by DeepGO2:
# https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/utils.py#L18-L22
NAMESPACES = {
    "cc": "cellular_component",
    "mf": "molecular_function",
    "bp": "biological_process",
}

# Length limit taken from DeepGO2 (presumably the maximum protein sequence
# length it handles -- confirm against the linked source):
# https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
MAXLEN = 1000
14+
15+
16+
def load_data(data_dir):
    """Load the DeepGO train/validation/test pickles and merge them.

    Reads ``train_data.pkl``, ``valid_data.pkl`` and ``test_data.pkl`` from
    ``data_dir``, keeps only the columns needed for the migration, and
    builds a table recording which split every protein came from.

    Args:
        data_dir: Directory (``str`` or path-like) containing the three
            DeepGO pickle files.

    Returns:
        A ``(new_df, combined_split_assignment)`` tuple:

        * ``new_df`` -- train, validation and test rows concatenated in
          that order with a fresh integer index, restricted to the
          required columns.
        * ``combined_split_assignment`` -- one row per protein with the
          columns ``id`` (taken from ``proteins``) and ``split`` (one of
          ``"train"``, ``"validation"``, ``"test"``); intended as the
          content of a ``splits.csv`` file, which is not written here.

    Raises:
        FileNotFoundError: If one of the pickle files is missing.
        KeyError: If a loaded frame lacks one of the required columns.
    """
    import os  # local import: only needed here to build the file paths

    # Split label -> file name inside ``data_dir``. The insertion order
    # (train, validation, test) fixes the row order of both outputs.
    split_files = {
        "train": "train_data.pkl",
        "validation": "valid_data.pkl",
        "test": "test_data.pkl",
    }

    # SECURITY NOTE: unpickling executes arbitrary code embedded in the
    # file; only point ``data_dir`` at trusted DeepGO data.
    split_frames = {
        split: pd.DataFrame(pd.read_pickle(os.path.join(data_dir, file_name)))
        for split, file_name in split_files.items()
    }

    required_columns = [
        "proteins",
        "accessions",
        "sequences",
        # https://github.com/bio-ontology-research-group/deepgo2/blob/main/gendata/uni2pandas.py#L45-L58
        "exp_annotations",  # Directly associated GO ids
        # https://github.com/bio-ontology-research-group/deepgo2/blob/main/gendata/uni2pandas.py#L60-L69
        "prop_annotations",  # Transitively associated GO ids
    ]

    new_df = pd.concat(
        [frame[required_columns] for frame in split_frames.values()],
        ignore_index=True,
    )

    # Record the split each protein id belongs to (content for splits.csv).
    split_assignment_list: List[pd.DataFrame] = [
        pd.DataFrame({"id": frame["proteins"], "split": split})
        for split, frame in split_frames.items()
    ]
    combined_split_assignment = pd.concat(split_assignment_list, ignore_index=True)

    return new_df, combined_split_assignment
47+
48+
49+
def save_data(data_dir, data_df):
    """Placeholder for persisting migrated data; currently does nothing.

    Args:
        data_dir: Target directory for the output (unused for now).
        data_df: Data to be saved (unused for now).

    Returns:
        None.
    """
    # TODO: not implemented yet -- intentionally a no-op.
    pass
51+
52+
53+
# Script entry point: nothing is executed yet when the module is run directly.
if __name__ == "__main__":
    pass

0 commit comments

Comments
 (0)