8
8
9
9
10
10
class DatasetLoader :
11
+ """
12
+ General dataset loader for the integrated drug pair scoring datasets.
13
+ """
14
+
11
15
def __init__ (self , dataset_name : str ):
16
+ """
17
+ Args:
18
+ dataset_name (str): The name of the dataset.
19
+ """
12
20
self .base_url = "https://raw.githubusercontent.com/AstraZeneca/chemicalx/main/dataset"
13
21
self .dataset_name = dataset_name
14
22
assert dataset_name in ["drugcombdb" , "drugcomb" ]
15
23
16
24
def generate_path(self, file_name: str) -> str:
    """
    Build the full url of a file belonging to the selected dataset.

    The url is the base url, the dataset name and the file name joined by "/".

    Args:
        file_name (str): Name of the data file.
    Returns:
        data_path (str): The complete url to the dataset file.
    """
    url_segments = (self.base_url, self.dataset_name, file_name)
    return "/".join(url_segments)
19
35
20
36
def load_raw_json_data(self, path: str) -> Dict:
    """
    Fetch a JSON document from a url and parse it.

    Args:
        path (str): The url of the JSON file.
    Returns:
        raw_data (dict): The parsed content of the JSON file.
    """
    # The response is only needed while reading; the context manager closes it.
    with urllib.request.urlopen(path) as response:
        payload = response.read()
    text = payload.decode()
    return json.loads(text)
24
48
25
49
def load_raw_csv_data(self, path: str) -> pd.DataFrame:
    """
    Reading the labeled triples CSV in memory.

    Args:
        path (str): The url of the triples CSV file.
    Returns:
        raw_data (pd.DataFrame): A pandas DataFrame with the data. The "drug_1",
            "drug_2" and "context" columns are read as strings and "label" as float.
    """
    # Use a context manager so the response is closed once the bytes are read,
    # in the same way the JSON loader handles its response.
    with urllib.request.urlopen(path) as response:
        data_bytes = response.read()
    # Explicit dtypes keep identifier columns as text (e.g. leading zeros survive).
    types = {"drug_1": str, "drug_2": str, "context": str, "label": float}
    raw_data = pd.read_csv(io.BytesIO(data_bytes), encoding="utf8", sep=",", dtype=types)
    return raw_data
30
62
31
63
def get_context_features (self ):
64
+ """
65
+ Reading the context feature set.
66
+
67
+ Returns:
68
+ context_feature_set (ContextFeatureSet): The ContextFeatureSet of the dataset of interest.
69
+ """
32
70
path = self .generate_path ("context_set.json" )
33
71
raw_data = self .load_raw_json_data (path )
34
72
raw_data = {k : np .array (v ) for k , v in raw_data .items ()}
@@ -37,6 +75,12 @@ def get_context_features(self):
37
75
return context_feature_set
38
76
39
77
def get_drug_features (self ):
78
+ """
79
+ Reading the drug feature set.
80
+
81
+ Returns:
82
+ drug_feature_set (DrugFeatureSet): The DrugFeatureSet of the dataset of interest.
83
+ """
40
84
path = self .generate_path ("drug_set.json" )
41
85
raw_data = self .load_raw_json_data (path )
42
86
raw_data = {k : {"smiles" : v ["smiles" ], "features" : np .array (v ["features" ])} for k , v in raw_data .items ()}
@@ -45,8 +89,14 @@ def get_drug_features(self):
45
89
return drug_feature_set
46
90
47
91
def get_labeled_triples(self):
    """
    Load the labeled triples of the dataset.

    The "labeled_triples.csv" file of the dataset is read into a DataFrame and
    passed to a new LabeledTriples object through ``update_from_pandas``.

    Returns:
        labeled_triples (LabeledTriples): The labeled triples in the dataset.
    """
    triples_frame = self.load_raw_csv_data(self.generate_path("labeled_triples.csv"))
    labeled_triples = LabeledTriples()
    labeled_triples.update_from_pandas(triples_frame)
    return labeled_triples
0 commit comments