1- import os
1+ import csv
22import logging
3- import yaml
3+ import os
4+ from collections import OrderedDict
5+ from pathlib import Path
6+
7+ import ruamel .yaml
8+ from ruamel .yaml .representer import RoundTripRepresenter
49
# Configure the root logger with logging's defaults, then grab the logger
# used by everything in this module.
logging.basicConfig()
logger = logging.getLogger("dataherb.parse.model")

# Names that MetaData.parse_structure filters out while walking a folder.
IGNORED_FOLDERS_AND_FILES = [
    '.git',
    '.dataherb',
    '.vscode',
]
14+
class MyRepresenter(RoundTripRepresenter):
    """Representer subclass on which the OrderedDict handler is registered."""


# Teach ruamel.yaml to emit OrderedDict through represent_dict. The handler
# is registered on MyRepresenter, which is then installed on the YAML
# instance used for dumping in this module.
ruamel.yaml.add_representer(
    OrderedDict,
    MyRepresenter.represent_dict,
    representer=MyRepresenter,
)
yaml = ruamel.yaml.YAML()
yaml.Representer = MyRepresenter
24+
825
926class MetaData (object ):
1027 def __init__ (self ):
11- self .template = {
28+ self .dataherb_folder = '.dataherb'
29+ self .metadata_file = 'metadata.yml'
30+ self .template = OrderedDict ({
1231 "name" : "" ,
1332 "description" : "" ,
1433 "contributors" : [
@@ -17,44 +36,123 @@ def __init__(self):
1736 "github" : ""
1837 }
1938 ],
20- "data" : [
39+ "data" : [],
40+ "references" : [
2141 {
2242 "name" : "" ,
23- "path" : "" ,
24- "format" : "" ,
25- "size" : "" ,
26- "updated_at" : "" ,
27- "fields" : [
28- {
29- "name" : "" ,
30- "description" : ""
31- },
32- {
33- "name" : "" ,
34- "description" : ""
35- }
36- ]
43+ "link" : ""
3744 }
38- ],
39- "references" : [
45+ ]
46+ })
47+
def parse_structure(self, folder=None):
    """
    parse_structure walks a folder and collects the files found inside it.

    Directories and files whose name appears in
    ``IGNORED_FOLDERS_AND_FILES`` are skipped, and ignored directories are
    not descended into, so their contents never show up in the result.

    :param folder: root folder to scan; defaults to the current directory.
    :return: list of file paths relative to ``folder``. The same list is
        also stored on ``self.tree``.
    """
    if folder is None:
        folder = '.'

    tree_f = []
    for root, dirs, files in os.walk(folder):
        # os.walk (top-down) only skips a directory when it is removed from
        # the very list object it yielded, hence the slice assignment.
        dirs[:] = [d for d in dirs if d not in IGNORED_FOLDERS_AND_FILES]
        tree_f.extend(
            os.path.relpath(os.path.join(root, f), folder)
            for f in files
            if f not in IGNORED_FOLDERS_AND_FILES
        )

    self.tree = tree_f

    return self.tree
69+
def parse_csv(self, csv_file):
    """
    parse_csv parses the csv files for metadata generation

    Only the header row is read. Each column name becomes one field entry
    with an empty description.

    :param csv_file: path of the csv file to inspect.
    :return: list of ``{"name": <column>, "description": ""}`` dicts, one
        per header column; an empty list when the file has no rows.
    :raises OSError: if the file cannot be opened.
    """
    # newline="" lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(csv_file, "r", newline="") as f:
        reader = csv.reader(f)
        # An empty file has no header row; fall back to "no columns"
        # instead of letting StopIteration escape.
        columns = next(reader, [])

    return [{"name": col, "description": ""} for col in columns]
87+
88+ def _generate_leaf (self , path , meta_input ):
89+
90+ name = meta_input .get ("name" , "" )
91+ description = meta_input .get ("description" , "" )
92+ updated_at = meta_input .get ("updated_at" , "" )
93+
94+ file_format = path .split ("." )[- 1 ]
95+ if len (file_format ) >= 10 :
96+ logger .error (f"The format of file { path } could not be determined!" )
97+ file_format = ""
98+
99+ file_size = os .stat (path ).st_size
100+
101+ if file_format == "csv" :
102+ fields = self .parse_csv (path )
103+ else :
104+ fields = [
40105 {
41106 "name" : "" ,
42- "link" : ""
107+ "description" : ""
108+ },
109+ {
110+ "name" : "" ,
111+ "description" : ""
43112 }
44113 ]
114+
115+ res = {
116+ "name" : name ,
117+ "description" : description ,
118+ "path" : path ,
119+ "format" : file_format ,
120+ "size" : file_size ,
121+ "updated_at" : updated_at ,
122+ "fields" : fields
45123 }
46124
125+ return res
126+
def append_leaf(self, dataset_file, meta_input):
    """
    append_leaf adds the metadata entry of one dataset file to the template.

    The entry is produced by ``_generate_leaf`` and appended to the list
    stored under the ``"data"`` key of ``self.template``.

    :param dataset_file: path of the dataset file to describe.
    :param meta_input: mapping of user-supplied metadata, forwarded to
        ``_generate_leaf``.
    """
    leaves = self.template["data"]
    new_leaf = self._generate_leaf(dataset_file, meta_input)
    leaves.append(new_leaf)
    self.template["data"] = leaves
134+
def create(self):
    """
    create writes the metadata template into the dataherb folder.

    The folder ``self.dataherb_folder`` is created when missing, then
    ``self.template`` is dumped as YAML into ``self.metadata_file`` inside
    it. An existing metadata file is never overwritten.

    :raises SystemExit: if the metadata file already exists.
    """
    # create .dataherb folder
    dataherb_folder = self.dataherb_folder
    metadata_file = self.metadata_file

    try:
        os.mkdir(dataherb_folder)
    except FileExistsError:
        # Logging uses %-style placeholders; passing the folder as a bare
        # extra argument makes the record fail to format.
        logger.info(
            "%s already exists! Creating %s file inside.",
            dataherb_folder,
            metadata_file,
        )
    else:
        logger.info("Created %s", dataherb_folder)

    metadata_path = os.path.join(dataherb_folder, metadata_file)

    if os.path.isfile(metadata_path):
        logger.error("File %s already exists!", metadata_path)
        # Non-zero status so that shells and scripts see the failure.
        raise SystemExit(1)

    with open(metadata_path, 'w') as fp:
        yaml.dump(self.template, fp)
0 commit comments