1111logger = logging .getLogger ("dataherb.core.base" )
1212
1313
14- class Herb :
14+ class Herb ( object ) :
1515 """
16- Herb is the base class for a dataset.
16+ Herb is a collection of the dataset.
1717 """
1818 def __init__ (self , herb_meta_json ):
1919 """
@@ -25,6 +25,7 @@ def __init__(self, herb_meta_json):
2525 self .description = self .herb_meta_json .get ("description" )
2626 self .repository = self .herb_meta_json .get ("repository" )
2727 self .id = self .herb_meta_json .get ("id" )
28+ self ._get_leaves ()
2829
2930 def search_score (self , keywords , keys = None ):
3031 """
@@ -75,71 +76,140 @@ def download(self):
7576
7677 data_files = []
7778
78- for file in self .herb_meta_json .get ("data" ):
79- file_path = "https://raw.githubusercontent.com/{}/master/{}" .format (
80- self .repository ,
81- file .get ("path" )
79+ for leaf_meta in self .herb_meta_json .get ("data" ):
80+ leaf = Leaf (leaf_meta , self )
81+ data_files .append (leaf .download ())
82+
83+ return data_files
84+
def _get_leaves(self):
    """
    _get_leaves builds the leaf/leaves of the Herb.

    Every entry of the "data" list in the herb metadata is wrapped in a
    Leaf. The leaves are stored in `self.leaves`, a dictionary keyed by
    the "path" value of each entry.
    """
    self.leaves = {}

    # This method runs from __init__, so a herb whose metadata has no
    # "data" entry (or an explicit null) must yield zero leaves instead
    # of failing while iterating over None.
    leaves_meta = self.herb_meta_json.get("data") or []

    for leaf_meta in leaves_meta:
        self.leaves[leaf_meta.get("path")] = Leaf(leaf_meta, self)
95+
def __str__(self):
    """
    __str__ renders the Herb using its metadata.

    The value is whatever `self.metadata()` returns when called without
    arguments.
    """
    herb_metadata = self.metadata()
    return herb_metadata
98+
99+
class Leaf(object):
    """
    Leaf is a data file of the Herb.

    :param leaf_meta_json: metadata dictionary of one data file. The keys
        read here are "path", "format", "decode" (defaults to "utf-8"),
        "name", "description", "size", "comment" and "separator"
        (the historical misspelling "seperator" is still accepted).
    :param herb: the Herb this file belongs to. Its `repository` is used
        to build the remote URL and its `id` is shown by `__str__`.
    """

    def __init__(self, leaf_meta_json, herb):
        self.leaf_meta_json = leaf_meta_json
        self.herb = herb

        self.url = "https://raw.githubusercontent.com/{}/master/{}".format(
            self.herb.repository,
            self.leaf_meta_json.get("path")
        )
        self.format = self.leaf_meta_json.get("format")
        # decode the file content using decode
        self.decode = self.leaf_meta_json.get("decode", "utf-8")
        self.name = self.leaf_meta_json.get("name")
        self.description = self.leaf_meta_json.get("description")
        self.path = self.leaf_meta_json.get("path")
        # Filled by download(); an empty dict means "not downloaded yet".
        self.downloaded = {}

    def download(self):
        """
        download downloads the data

        The remote file is fetched once and parsed according to the
        format of the leaf. The result is cached in `self.downloaded`
        and also returned, so callers collecting the results of several
        leaves get the downloaded records instead of None.

        :return: dictionary with the keys "data" (parsed data) and
            "content" (raw content). When the file can not be fetched,
            both hold a JSON string describing the error.
        """

        # Fetch data from remote
        response = _get_data_from_url(self.url)
        if response.status_code != 200:
            file_error_msg = "Could not fetch remote file: {}; {}".format(
                self.url,
                response.status_code
            )
            logger.error(
                file_error_msg
            )
            file_content = json.dumps([{
                "url": self.url,
                "error": file_error_msg
            }])
            # There is nothing to parse; expose the error report as data
            # instead of handing it to a parser as if it were a file.
            data = file_content
        else:
            file_content = response.content
            data = self._parse_content(file_content)

        self.downloaded = {
            "data": data,
            "content": file_content
        }

        return self.downloaded

    def _parse_content(self, file_content):
        """
        _parse_content converts fetched content to a pandas object

        :param file_content: content of the remote file, bytes or str.
        :return: the parsed pandas object; the text buffer of the content
            (rewound to the start) when parsing fails; None when the
            format of the leaf is missing or not supported.
        """
        # The format may be missing from the metadata.
        file_format = (self.format or "").lower()
        if file_format not in ("csv", "json"):
            logger.error(f"data file format {self.format} is not supported!")
            return None

        if isinstance(file_content, bytes):
            file_content = file_content.decode(self.decode)
        file_string_io = io.StringIO(file_content)

        try:
            if file_format == "csv":
                # csv files may have comment rows
                file_comment = self.leaf_meta_json.get("comment")
                # csv files may have different separators; the misspelled
                # key is kept as a fallback for existing metadata files
                file_separator = self.leaf_meta_json.get(
                    "separator",
                    self.leaf_meta_json.get("seperator", ",")
                )
                return pd.read_csv(
                    file_string_io, comment=file_comment, sep=file_separator
                )

            # Parse the content that is already in memory; reading from
            # the URL would download the file a second time.
            return pd.read_json(file_string_io)
        except Exception:
            # Best effort: pandas raises many error types, fall back to
            # the raw text so the caller can still inspect the file.
            logger.error(f"Error loading remote file: {self.url}")
            file_string_io.seek(0)
            return file_string_io

    @property
    def data(self):
        """
        data is the parsed data of the leaf, downloaded on first access
        """
        if not self.downloaded:
            self.download()

        return self.downloaded.get("data")

    @property
    def content(self):
        """
        content is the raw content of the leaf, downloaded on first access
        """
        if not self.downloaded:
            self.download()

        return self.downloaded.get("content")

    def metadata(self, format=None):
        """
        metadata formats the metadata of the leaf

        :param format: "yaml" (default) returns a YAML string, "json"
            returns the metadata dictionary itself.
        :return: the formatted metadata; None for any other format, in
            which case an error is logged.
        """
        if format is None:
            format = "yaml"

        if format == "yaml":
            return yaml.dump(self.leaf_meta_json)
        elif format == "json":
            return self.leaf_meta_json
        else:
            logger.error(f"format {format} is not support for metadata!")

    def __str__(self):
        # The sentence announces the remote location, so show the full
        # URL rather than the path relative to the repository.
        return (
            "{} from {} with size {}, "
            "the remote file is located at {};\n\n{}\n"
        ).format(
            self.leaf_meta_json.get("path"),
            self.herb.id,
            self.leaf_meta_json.get("size"),
            self.url,
            self.metadata()
        )
215+
0 commit comments