Skip to content

Commit 07da64b

Browse files
committed
new design for herbs and leaves
1 parent dd99189 commit 07da64b

File tree

2 files changed

+148
-72
lines changed

2 files changed

+148
-72
lines changed

dataherb/core/base.py

Lines changed: 133 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
logger = logging.getLogger("dataherb.core.base")
1212

1313

14-
class Herb:
14+
class Herb(object):
1515
"""
16-
Herb is the base class for a dataset.
16+
Herb is a dataset, i.e. a collection of data files (leaves).
1717
"""
1818
def __init__(self, herb_meta_json):
1919
"""
@@ -25,6 +25,7 @@ def __init__(self, herb_meta_json):
2525
self.description = self.herb_meta_json.get("description")
2626
self.repository = self.herb_meta_json.get("repository")
2727
self.id = self.herb_meta_json.get("id")
28+
self._get_leaves()
2829

2930
def search_score(self, keywords, keys=None):
3031
"""
@@ -75,71 +76,140 @@ def download(self):
7576

7677
data_files = []
7778

78-
for file in self.herb_meta_json.get("data"):
79-
file_path = "https://raw.githubusercontent.com/{}/master/{}".format(
80-
self.repository,
81-
file.get("path")
79+
for leaf_meta in self.herb_meta_json.get("data"):
80+
leaf = Leaf(leaf_meta, self)
81+
data_files.append(leaf.download())
82+
83+
return data_files
84+
85+
def _get_leaves(self):
86+
"""
87+
_get_leaves builds the leaves of the Herb, keyed by their path.
88+
"""
89+
self.leaves = {}
90+
91+
for leaf_meta in self.herb_meta_json.get("data"):
92+
leaf_meta_path = leaf_meta.get("path")
93+
leaf = Leaf(leaf_meta, self)
94+
self.leaves[leaf_meta_path] = leaf
95+
96+
def __str__(self):
97+
return self.metadata()
98+
99+
100+
class Leaf(object):
101+
"""
102+
Leaf is a data file of the Herb.
103+
"""
104+
105+
def __init__(self, leaf_meta_json, herb):
106+
self.leaf_meta_json = leaf_meta_json
107+
self.herb = herb
108+
109+
self.url = "https://raw.githubusercontent.com/{}/master/{}".format(
110+
self.herb.repository,
111+
self.leaf_meta_json.get("path")
112+
)
113+
self.format = self.leaf_meta_json.get("format")
114+
# encoding used to decode the file content (defaults to utf-8)
115+
self.decode = self.leaf_meta_json.get("decode", "utf-8")
116+
self.name = self.leaf_meta_json.get("name")
117+
self.description = self.leaf_meta_json.get("description")
118+
self.path = self.leaf_meta_json.get("path")
119+
self.downloaded = {}
120+
121+
def download(self):
122+
"""
123+
download downloads the data
124+
"""
125+
126+
# Fetch data from remote
127+
file_content = _get_data_from_url(self.url)
128+
if not file_content.status_code == 200:
129+
file_error_msg = "Could not fetch remote file: {}; {}".format(
130+
self.url,
131+
file_content.status_code
82132
)
83-
file_format = file.get("format")
84-
# decode the file content using file_decode
85-
file_decode = file.get("decode", "utf-8")
86-
87-
# Fetch data from remote
88-
file_content = _get_data_from_url(file_path)
89-
if not file_content.status_code == 200:
90-
file_error_msg = "Could not fetch remote file: {}; {}".format(
91-
file_path,
92-
file_content.status_code
93-
)
94-
logger.error(
95-
file_error_msg
96-
)
97-
file_content = json.dumps([{
98-
"path": file_path,
99-
"error": file_error_msg
100-
}])
133+
logger.error(
134+
file_error_msg
135+
)
136+
file_content = json.dumps([{
137+
"url": self.url,
138+
"error": file_error_msg
139+
}])
140+
else:
141+
file_content = file_content.content
142+
143+
if self.format.lower() == "csv":
144+
if isinstance(file_content, bytes):
145+
file_string_io = io.StringIO(file_content.decode(self.decode))
101146
else:
102-
file_content = file_content.content
103-
104-
if file_format.lower() == "csv":
105-
if isinstance(file_content, bytes):
106-
file_string_io = io.StringIO(file_content.decode(file_decode))
107-
else:
108-
file_string_io = file_content
109-
# csv files may have comment rows
110-
file_comment = file.get("comment")
111-
# csv files may have different separators
112-
file_separator = file.get("seperator", ",")
113-
try:
114-
data = pd.read_csv(
115-
file_string_io, comment=file_comment, sep=file_separator
116-
)
117-
except Exception as e:
118-
logger.error(f"Error loading remote file: {file_path}")
119-
data = file_string_io
120-
elif file_format.lower() == "json":
121-
if isinstance(file_content, bytes):
122-
file_string_io = io.StringIO(file_content.decode(file_decode))
123-
else:
124-
file_string_io = file_content
125-
126-
try:
127-
data = pd.read_json(file_path)
128-
except Exception as e:
129-
logger.error(f"Error loading remote file: {file_path}")
130-
data = file_string_io
147+
file_string_io = file_content
148+
# csv files may have comment rows
149+
file_comment = self.leaf_meta_json.get("comment")
150+
# csv files may have different separators
151+
file_separator = self.leaf_meta_json.get("seperator", ",")
152+
try:
153+
data = pd.read_csv(
154+
file_string_io, comment=file_comment, sep=file_separator
155+
)
156+
except Exception as e:
157+
logger.error(f"Error loading remote file: {self.url}")
158+
data = file_string_io
159+
elif self.format.lower() == "json":
160+
if isinstance(file_content, bytes):
161+
file_string_io = io.StringIO(file_content.decode(self.decode))
131162
else:
132-
logger.error(f"data file format {file_format} is not supported!")
163+
file_string_io = file_content
164+
165+
try:
166+
data = pd.read_json(self.url)
167+
except Exception as e:
168+
logger.error(f"Error loading remote file: {self.url}")
169+
data = file_string_io
170+
else:
171+
logger.error(f"data file format {self.format} is not supported!")
172+
173+
self.downloaded = {
174+
"data": data,
175+
"content": file_content
176+
}
133177

134-
data_files.append({
135-
"name": file.get("name"),
136-
"description": file.get("description"),
137-
"path": file_path,
138-
"file": file.get("path"),
139-
"data": data
140-
})
178+
@property
179+
def data(self):
180+
if not self.downloaded:
181+
self.download()
141182

142-
return data_files
183+
return self.downloaded.get("data")
184+
185+
@property
186+
def content(self):
187+
if not self.downloaded:
188+
self.download()
189+
190+
return self.downloaded.get("content")
191+
192+
def metadata(self, format=None):
193+
"""
194+
metadata formats the metadata of the leaf
195+
"""
196+
if format is None:
197+
format = "yaml"
198+
199+
if format == "yaml":
200+
return yaml.dump(self.leaf_meta_json)
201+
elif format == "json":
202+
return self.leaf_meta_json
203+
else:
204+
logger.error(f"format {format} is not support for metadata!")
143205

144206
def __str__(self):
145-
return self.metadata()
207+
return """{} from {} with size {}, the remote file is located at {};\n\n{}
208+
""".format(
209+
self.leaf_meta_json.get("path"),
210+
self.herb.id,
211+
self.leaf_meta_json.get("size"),
212+
self.path,
213+
self.metadata()
214+
)
215+

dataherb/dataherb.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
logging.basicConfig()
88
logger = logging.getLogger("dataherb.dataherb")
99

10-
_DATAHERB_API_URL = "https://dataherb.github.io/api/datasets.json"
10+
_DATAHERB_API_URL = "https://dataherb.github.io/api/flora.json"
1111

12-
class DataHerb:
12+
class Flora(object):
1313
"""
1414
Flora is the container of datasets (herbs).
1515
"""
@@ -24,6 +24,8 @@ def __init__(self, api_url=None, flora=None):
2424
api_url = _DATAHERB_API_URL
2525
self.api_url = api_url
2626

27+
self.website = "https://dataherb.github.io/flora"
28+
2729
if flora is None:
2830
flora = self._get_flora()
2931
self.flora = flora
@@ -75,9 +77,11 @@ def herb_meta(self, id):
7577
if herbs:
7678
herb = herbs[0]
7779

78-
herb = herb.get("herb")
80+
herb = herb.get("herb")
7981

80-
return herb.metadata()
82+
return herb.metadata()
83+
else:
84+
return
8185

8286
def herb(self, id):
8387
"""
@@ -94,21 +98,23 @@ def herb(self, id):
9498
if herbs:
9599
herb = herbs[0]
96100

97-
herb = herb.get("herb")
98-
99-
return herb.download()
101+
herb = herb.get("herb")
100102

103+
return herb
104+
else:
105+
logger.error(f"Could not find herb {id}")
106+
return
101107

102108

103109
if __name__ == "__main__":
104110

105-
dataherb = DataHerb()
111+
dataherb = Flora()
106112
geo_datasets = _search_by_keywords_in_flora(dataherb.flora, keywords=["geo"])
107113

108114
print(geo_datasets)
109115

110116
print(
111-
dataherb.herb("geonames_timezone")
117+
dataherb.herb("geonames_timezone").leaves.get("dataset/geonames_timezone.csv").data
112118
)
113119

114120
logger.debug("End of Game")

0 commit comments

Comments
 (0)