Skip to content

Commit bb52199

Browse files
committed
added parser for csv
1 parent aabb7ea commit bb52199

File tree

5 files changed

+250
-35
lines changed

5 files changed

+250
-35
lines changed

dataherb/command.py

Lines changed: 113 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,105 @@
22
import os
33

44
import click
5+
import inquirer
56

67
from dataherb.flora import Flora
7-
from dataherb.parse.model import MetaData
8+
from dataherb.parse.model import MetaData, IGNORED_FOLDERS_AND_FILES
89

910
__CWD__ = os.getcwd()
1011

1112
logging.basicConfig()
1213
logger = logging.getLogger("dataherb.command")
1314

15+
def describe_file(file):
    """
    describe_file asks the user for some basic info about a single data file

    :param file: path of the data file; only used to build the prompt messages.
    :return: dict with the keys ``name``, ``description`` and ``updated_at``.
        All values are empty strings if the prompt was cancelled.
    """
    questions = [
        inquirer.Text(
            'name',
            message=f"How would you like to name the file: {file}?"
        ),
        inquirer.Text(
            'description',
            message=f"What is {file} about?"
        ),
        inquirer.Text(
            'updated_at',
            message=f"When was {file} last updated? In ISO date format such as 2020-02-17."
        )
    ]

    # inquirer.prompt gives back None when the user cancels the prompt;
    # fall back to an empty dict so the lookups below do not crash.
    answers = inquirer.prompt(questions) or {}

    meta = {
        "name": answers.get("name", ""),
        "description": answers.get("description", ""),
        "updated_at": answers.get("updated_at", "")
    }

    return meta
39+
40+
def describe_dataset():
    """
    describe_dataset asks the user to specify some basic info about the dataset

    :return: dict with the keys ``name`` and ``description``. Both values
        are empty strings if the prompt was cancelled.
    """
    questions = [
        inquirer.Text(
            'name',
            message="How would you like to name the dataset?"
        ),
        inquirer.Text(
            'description',
            message="What is the dataset about? This will be the description of the dataset."
        )
    ]

    # inquirer.prompt gives back None when the user cancels the prompt;
    # fall back to an empty dict so the defaults below are used instead.
    answers = inquirer.prompt(questions) or {}

    meta = {
        "name": answers.get('name', ""),
        "description": answers.get('description', "")
    }

    return meta
62+
63+
64+
def where_is_dataset():
    """
    where_is_dataset asks the user where the dataset is located.

    The folders below the working directory recorded in ``__CWD__`` are
    offered as a list. If none is found, the user is asked to type a
    directory path instead.

    :return: the chosen folder, or None if the prompt was cancelled.
    """
    folders = []
    try:
        for root, dirs, _ in os.walk(__CWD__):
            # Prune ignored folders in place: os.walk then never descends
            # into them, so e.g. the content of .git is not offered either.
            dirs[:] = sorted(
                d for d in dirs if d not in IGNORED_FOLDERS_AND_FILES
            )
            folders.extend(
                os.path.relpath(os.path.join(root, d), ".") for d in dirs
            )
    except (OSError, ValueError) as e:
        # ValueError: relpath cannot relate paths on different drives (Windows)
        logger.error(
            "Can not get a list of folders in current directory. (%s)", e
        )
        folders = []

    if folders:
        questions = [
            inquirer.List(
                'dataset_folder',
                message="Which folder contains the data file?",
                choices=folders
            )
        ]
    else:
        questions = [
            inquirer.Path(
                'dataset_folder',
                message="Which folder will you place the data files?",
                path_type=inquirer.Path.DIRECTORY,
            )
        ]

    # inquirer.prompt gives back None when the user cancels the prompt
    answers = inquirer.prompt(questions) or {}
    dataset_folder = answers.get('dataset_folder')

    return dataset_folder
101+
102+
103+
# _FLORA.herb("geonames_timezone").leaves.get("dataset/geonames_timezone.csv").data
18104

19105
@click.group()
20106
def dataherb():
@@ -23,8 +109,9 @@ def dataherb():
23109

24110
@dataherb.command()
def search(keywords, ids):
    # NOTE(review): keywords and ids are not used in this body and no click
    # argument/option declarations are visible for them - confirm how they
    # are meant to be supplied.
    flora = Flora()
    click.echo('Search Herbs in DataHerb Flora ...')
    flora.search()
28115

29116
@dataherb.command()
30117
@click.confirmation_option(
@@ -35,13 +122,33 @@ def search(keywords, ids):
35122
def create():
    # Interactive workflow: ask for the dataset basics, locate the data
    # folder, describe every file found in it and finally write the
    # metadata file through MetaData.create().
    md = MetaData()

    dataset_basics = describe_dataset()
    print(dataset_basics)
    md.template.update(dataset_basics)

    dataset_folder = where_is_dataset()
    print(
        f"Looking into the folder {dataset_folder} for data files..."
    )

    dataset_files = md.parse_structure(dataset_folder)
    print(
        f"found {dataset_files}"
    )

    # parse_structure reports paths relative to dataset_folder, while
    # append_leaf stats the file relative to the working directory.
    # Re-attach the folder so files outside "." can be found and the stored
    # path reads like "dataset/file.csv".
    base_folder = dataset_folder or "."
    for file in dataset_files:
        file_path = os.path.normpath(os.path.join(base_folder, file))
        file_meta = describe_file(file_path)
        md.append_leaf(file_path, file_meta)

    md.create()

    click.echo(
        "The .dataherb folder and metadata.yml file has been created inside \n"
        f"{__CWD__}\n"
        "Please review the metadata.yml file and update other necessary fields of your desire."
    )
44151

45152
# Running this module as a script only instantiates Flora;
# no command of the click group is dispatched from here.
if __name__ == "__main__":
    fl = Flora()

dataherb/parse/model.py

Lines changed: 122 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,33 @@
1-
import os
1+
import csv
22
import logging
3-
import yaml
3+
import os
4+
from collections import OrderedDict
5+
from pathlib import Path
6+
7+
import ruamel.yaml
8+
from ruamel.yaml.representer import RoundTripRepresenter
49

510
logging.basicConfig()
611
logger = logging.getLogger("dataherb.parse.model")
712

13+
IGNORED_FOLDERS_AND_FILES = ['.git', '.dataherb', '.vscode']
14+
15+
# Add representer to ruamel.yaml for OrderedDict
16+
class MyRepresenter(RoundTripRepresenter):
    """Round-trip representer on which the OrderedDict handler is registered."""


# Dump OrderedDict with the representer's regular dict handler
ruamel.yaml.add_representer(
    OrderedDict,
    MyRepresenter.represent_dict,
    representer=MyRepresenter,
)

# Module-level YAML instance configured to use the representer above
yaml = ruamel.yaml.YAML()
yaml.Representer = MyRepresenter
24+
825

926
class MetaData(object):
1027
def __init__(self):
11-
self.template = {
28+
self.dataherb_folder = '.dataherb'
29+
self.metadata_file = 'metadata.yml'
30+
self.template = OrderedDict({
1231
"name": "",
1332
"description": "",
1433
"contributors": [
@@ -17,44 +36,123 @@ def __init__(self):
1736
"github": ""
1837
}
1938
],
20-
"data": [
39+
"data": [],
40+
"references": [
2141
{
2242
"name": "",
23-
"path": "",
24-
"format": "",
25-
"size": "",
26-
"updated_at": "",
27-
"fields": [
28-
{
29-
"name": "",
30-
"description": ""
31-
},
32-
{
33-
"name": "",
34-
"description": ""
35-
}
36-
]
43+
"link": ""
3744
}
38-
],
39-
"references": [
45+
]
46+
})
47+
48+
def parse_structure(self, folder=None):
    """
    parse_structure walks through a folder and collects the files inside

    Folders and files named in IGNORED_FOLDERS_AND_FILES are left out,
    including everything below an ignored folder.

    :param folder: folder to scan; the current directory if None.
    :return: list of file paths relative to ``folder``. The same list is
        stored in ``self.tree``.
    """

    if folder is None:
        folder = '.'

    tree_f = []
    for root, dirs, files in os.walk(folder):
        # Prune ignored folders in place so that os.walk never descends
        # into them; sorting keeps the result order stable.
        dirs[:] = sorted(
            d for d in dirs if d not in IGNORED_FOLDERS_AND_FILES
        )
        for f in sorted(files):
            if f in IGNORED_FOLDERS_AND_FILES:
                continue
            tree_f.append(
                os.path.relpath(os.path.join(root, f), folder)
            )

    self.tree = tree_f

    return self.tree
69+
70+
def parse_csv(self, csv_file):
    """
    parse_csv parses the csv files for metadata generation

    Only the first row is read and taken as the header.

    :param csv_file: path to the csv file.
    :return: list with one ``{"name": <column>, "description": ""}`` dict
        per header column; an empty list if the file has no rows.
    """

    # newline="" leaves line-ending handling to the csv module
    with open(csv_file, "r", newline="") as f:
        # default of [] keeps an empty file from raising StopIteration
        columns = next(csv.reader(f), [])

    return [{"name": col, "description": ""} for col in columns]
87+
88+
def _generate_leaf(self, path, meta_input):
    """
    _generate_leaf builds the metadata entry of a single data file

    :param path: path of the data file; it has to exist as its size is read.
    :param meta_input: dict that may hold ``name``, ``description`` and
        ``updated_at``; missing or None values become empty strings.
    :return: dict with the keys ``name``, ``description``, ``path``,
        ``format``, ``size``, ``updated_at`` and ``fields``.
    """

    name = meta_input.get("name") or ""
    description = meta_input.get("description") or ""
    updated_at = meta_input.get("updated_at") or ""

    # splitext only inspects the last path component, so a dot in a folder
    # name or a file without extension no longer yields a bogus format.
    file_format = os.path.splitext(path)[1].lstrip(".")
    if len(file_format) >= 10:
        logger.error(f"The format of file {path} could not be determined!")
        file_format = ""

    file_size = os.stat(path).st_size

    # compare case-insensitively so that e.g. "DATA.CSV" is parsed as well
    if file_format.lower() == "csv":
        fields = self.parse_csv(path)
    else:
        # the columns are unknown: provide two empty placeholders to fill in
        fields = [
            {
                "name": "",
                "description": ""
            },
            {
                "name": "",
                "description": ""
            }
        ]

    res = {
        "name": name,
        "description": description,
        "path": path,
        "format": file_format,
        "size": file_size,
        "updated_at": updated_at,
        "fields": fields
    }

    return res
126+
127+
def append_leaf(self, dataset_file, meta_input):
    """
    append_leaf generates the entry of a data file and adds it to the
    "data" list of the template
    """
    leaves = self.template["data"]
    new_leaf = self._generate_leaf(dataset_file, meta_input)
    leaves.append(new_leaf)
    # store the extended list back under the "data" key
    self.template["data"] = leaves
134+
47135
def create(self):
    """
    create makes the dataherb folder and writes the template into the
    metadata file inside it

    :raises SystemExit: with status 1 if the metadata file already exists;
        the existing file is left untouched.
    """

    # create .dataherb folder
    dataherb_folder = self.dataherb_folder
    try:
        os.mkdir(dataherb_folder)
        # logging fills the %s placeholder itself; extra arguments without
        # a placeholder would produce a formatting error instead of a message
        logger.info("Created %s", dataherb_folder)
    except FileExistsError:
        logger.info(
            "%s already exists! Creating %s file inside.",
            dataherb_folder,
            self.metadata_file
        )

    metadata_path = os.path.join(dataherb_folder, self.metadata_file)

    if os.path.isfile(metadata_path):
        logger.error(
            f'File {metadata_path} already exists!'
        )
        # exit with a non-zero status as this is an error
        raise SystemExit(1)

    with open(metadata_path, 'w') as fp:
        yaml.dump(self.template, fp)

docs/source/HISTORY.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,12 @@ This is the CHANGELOG of the package.
66
[0.0.1] - 2020-02-13
77
-------------------------
88

9-
First version
9+
First version
10+
11+
[0.0.3] - 2020-02-23
12+
-------------------------
13+
14+
Added
15+
16+
1. Data columns autogenerator for csv files.
17+

environment.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@ dependencies:
66
- python=3.7
77
- pandas>=0.23
88
- requests>=2.22.0
9-
- pyyaml>=5.3
109
- click==7.0
1110
- pip
1211
- pip:
1312
- fuzzywuzzy>=0.18.0
14-
- python-Levenshtein>=0.12
13+
- python-Levenshtein>=0.12
14+
- ruamel.yaml>=0.16.10
15+
- inquirer>=2.6.3

requirements.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
pandas>=0.23
22
requests>=2.22.0
33
fuzzywuzzy>=0.18.0
4-
pyyaml>=1.18
4+
ruamel.yaml>=0.16.10
55
python-Levenshtein>=0.12
6-
click==7.0
6+
click==7.0
7+
inquirer>=2.6.3

0 commit comments

Comments
 (0)