
Commit 2b355e0
Author: muehlenpfordt
Commit message: implement slightly altered JSON style
1 parent: 04c6d41

1 file changed: timeseries_scripts/make_json.py (105 additions, 78 deletions)
@@ -33,11 +33,13 @@
     minutes) is provided in a separate file. All data processing is
     conducted in python and pandas and has been documented in the
     Jupyter notebooks linked below.
-documentation: https://github.com/Open-Power-System-Data/datapackage_timeseries/blob/{version}/main.ipynb
+
+documentation:
+    https://github.com/Open-Power-System-Data/datapackage_timeseries/blob/{version}/main.ipynb
 
 version: '{version}'
 
-last_changes: Included data from CEPS and PSE
+last_changes: '{changes}'
 
 keywords:
     - Open Power System Data
@@ -56,99 +58,102 @@
     - web: http://neon-energie.de/en/team/
       name: Jonathan Muehlenpfordt
 
-
-resources:
 '''
 
 source_template = '''
-- name: {source}
-  # web: {web}
+    - name: {source}
+      # web: {web}
 '''
 
 resource_template = '''
-- path: time_series_{res_key}_singleindex.csv
-  format: csv
-  mediatype: text/csv
-  encoding: UTF8
-  dialect:
-      csvddfVersion: 1.0
-      delimiter: ","
-      lineTerminator: "\\n"
-      header: true
-  alternative_formats:
-      - path: time_series_{res_key}_singleindex.csv
-        stacking: Singleindex
-        format: csv
-      - path: time_series.xlsx
-        stacking: Multiindex
-        format: xlsx
-      - path: time_series_{res_key}_multiindex.csv
-        stacking: Multiindex
-        format: csv
-      - path: time_series_{res_key}_stacked.csv
-        stacking: Stacked
-        format: csv
-  schema:
-      primaryKey: {utc}
-      missingValue: ""
-      fields:
+    - path: time_series_{res_key}_singleindex.csv
+      format: csv
+      mediatype: text/csv
+      encoding: UTF8
+      schema: {res_key}
+      dialect:
+          csvddfVersion: 1.0
+          delimiter: ","
+          lineTerminator: "\\n"
+          header: true
+      alternative_formats:
+          - path: time_series_{res_key}_singleindex.csv
+            stacking: Singleindex
+            format: csv
+          - path: time_series.xlsx
+            stacking: Multiindex
+            format: xlsx
+          - path: time_series_{res_key}_multiindex.csv
+            stacking: Multiindex
+            format: csv
+          - path: time_series_{res_key}_stacked.csv
+            stacking: Stacked
+            format: csv
 '''
 
-indexfield = '''
-- name: {utc}
-  description: Start of timeperiod in Coordinated Universal Time
-  type: datetime
-  format: fmt:%Y-%m-%dT%H%M%SZ
-  opsd-contentfilter: true
-- name: {cet}
-  description: Start of timeperiod in Central European (Summer-) Time
-  type: datetime
-  format: fmt:%Y-%m-%dT%H%M%S%z
-- name: {marker}
-  description: marker to indicate which columns are missing data in source data and has been interpolated (e.g. solar_DE-transnetbw_generation;)
-  type: string
+schemas_template = '''
+    {res_key}:
+        primaryKey: {utc}
+        missingValue: ""
+        fields:
+            - name: {utc}
+              description: Start of timeperiod in Coordinated Universal Time
+              type: datetime
+              format: fmt:%Y-%m-%dT%H%M%SZ
+              opsd-contentfilter: true
+            - name: {cet}
+              description: Start of timeperiod in Central European (Summer-) Time
+              type: datetime
+              format: fmt:%Y-%m-%dT%H%M%S%z
+            - name: {marker}
+              description: marker to indicate which columns are missing data in source data
+                  and has been interpolated (e.g. DE_transnetbw_solar_generation;)
+              type: string
 '''
 
 field_template = '''
-- name: {variable}_{region}_{attribute}
-  description: {description}
-  type: number (float)
-  source:
-      name: {source}
-      web: {web}
-  opsd-properties:
-      Region: {region}
-      Variable: {variable}
-      Attribute: {attribute}
+            - name: {region}_{variable}_{attribute}
+              description: {description}
+              type: number (float)
+              source:
+                  name: {source}
+                  web: {web}
+              opsd-properties:
+                  Region: "{region}"
+                  Variable: {variable}
+                  Attribute: {attribute}
 '''
 
 descriptions_template = '''
 load: Consumption in {geo} in MW
 generation: Actual {tech} generation in {geo} in MW
 actual: Actual {tech} generation in {geo} in MW
-forecast: Forecasted {tech} generation forecast in {geo} in MW
+forecast: Forecasted {tech} generation in {geo} in MW
 capacity: Electrical capacity of {tech} in {geo} in MW
 profile: Share of {tech} capacity producing in {geo}
 epex: Day-ahead spot price for {geo}
 elspot: Day-ahead spot price for {geo}
+day_ahead: Day-ahead spot price for {geo}
 '''
 
-# Columns-specific metadata
+# Dataset-specific metadata
 
 # For each dataset/outputfile, the metadata has an entry in the
-# "resources" list that describes the file/dataset. The main part of each
-# entry is the "schema" dictionary, consisting of a list of "fields",
-# meaning the columns in the dataset. The first field is the timestamp
-# index of the dataset. For the other fields, we iterate over the columns
+# "resources" list and another in the "schemas" dictionary.
+# A "schema" consists of a list of "fields", meaning the columns in the dataset.
+# The first 2 fields are the timestamps (UTC and CE(S)T).
+# For the other fields, we iterate over the columns
 # of the MultiIndex index of the datasets to construct the corresponding
 # metadata.
+# The file is constructed from different building blocks made up of YAML strings,
+# as this makes for more readable code.
 
 
-def make_json(data_sets, info_cols, version, headers):
+def make_json(data_sets, info_cols, version, changes, headers):
     '''
     Create a datapackage.json file that complies with the Frictionless
     data JSON Table Schema from the information in the column-MultiIndex.
-
+
     Parameters
     ----------
     data_sets: dict of pandas.DataFrames
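The pattern behind the templates in this hunk is simple: Python's str.format fills the placeholders, the formatted snippets are concatenated, and yaml.load turns the result into Python lists and dicts. A minimal stand-alone sketch of the idea (template and values invented for illustration, not taken from the commit):

import yaml

# A toy template in the style of source_template above
template = '''
- name: {source}
  web: {web}
'''

# Concatenate one formatted snippet per source, then parse the whole string once
snippets = template.format(source='ENTSO-E', web='https://www.entsoe.eu')
snippets += template.format(source='50Hertz', web='http://www.50hertz.com')
parsed = yaml.load(snippets)  # on PyYAML >= 5.1, prefer yaml.safe_load(snippets)
# parsed is now a list of two dicts, ready to be embedded in the metadata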
@@ -159,21 +164,36 @@ def make_json(data_sets, info_cols, version, headers):
         timestamps or the marker column
     version: str
         Version tag of the Data Package
+    changes : str
+        Description of the changes from the last version to this one.
     headers : list
         List of strings indicating the level names of the pandas.MultiIndex
         for the columns of the dataframe.
-
+
     Returns
     ----------
     None
-
+
     '''
 
-    resource_list = ''  # list of files included in the datapackage
-    source_list = ''  # list of sources where the data comes from
+    # list of files included in the datapackage, in YAML format
+    resource_list = '''
+    - mediatype: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+      format: xlsx
+      path: time_series.xlsx
+    '''
+    source_list = ''  # list of sources where the data comes from, in YAML format
+    schemas_dict = ''  # dictionary of schemas, in YAML format
+
     for res_key, df in data_sets.items():
-        # Create the list of columns in a file, starting with the index field
-        field_list = indexfield.format(**info_cols)
+        field_list = ''  # list of columns in a file, in YAML format
+
+        # Both datasets (15min and 60min) get an entry in the resource list
+        resource_list = resource_list + resource_template.format(
+            res_key=res_key)
+
+        # Create the list of columns in a file, starting with the index
+        # field
        for col in df.columns:
            if col[0] in info_cols.values():
                continue
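This loop relies on each dataset carrying a pandas column MultiIndex whose level names are given by the headers argument. A toy illustration of iterating such columns (level names and values invented; the hunk does not show how the header dict h is built, so the zip below is an assumption):

import pandas as pd

columns = pd.MultiIndex.from_tuples(
    [('DE', 'solar', 'generation'), ('DE', 'wind', 'generation')],
    names=['region', 'variable', 'attribute'])
df = pd.DataFrame([[1.0, 2.0]], columns=columns)

for col in df.columns:
    h = dict(zip(df.columns.names, col))  # assumed construction of h
    print(h)  # {'region': 'DE', 'variable': 'solar', 'attribute': 'generation'}, ...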
@@ -185,16 +205,19 @@ def make_json(data_sets, info_cols, version, headers):
            elif h['region'] == 'CS':
                geo = 'Serbia and Montenegro'
            else:
-                geo = pycountry.countries.get(alpha2=h['region']).name
+                geo = pycountry.countries.get(alpha_2=h['region']).name
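The alpha2 → alpha_2 fix tracks the keyword rename in newer releases of the pycountry package; the lookup itself is unchanged:

import pycountry

# Resolve an ISO 3166-1 alpha-2 code to the country name
print(pycountry.countries.get(alpha_2='DE').name)  # Germany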
 
            descriptions = yaml.load(
                descriptions_template.format(tech=h['variable'], geo=geo)
            )
-            h['description'] = descriptions[h['attribute']]
+            try:
+                h['description'] = descriptions[h['attribute']]
+            except KeyError:
+                h['description'] = descriptions[h['variable']]
            field_list = field_list + field_template.format(**h)
            source_list = source_list + source_template.format(**h)
-        resource_list = resource_list + \
-            resource_template.format(res_key=res_key, **info_cols) + field_list
+        schemas_dict = schemas_dict + schemas_template.format(
+            res_key=res_key, **info_cols) + field_list
 
    # Remove duplicates from sources_list. set() returns unique values from a
    # collection, but it cannot compare dicts. Since source_list is a list of
@@ -203,18 +226,22 @@ def make_json(data_sets, info_cols, version, headers):
                   for tupleized in set(tuple(entry.items())
                                        for entry in yaml.load(source_list))]

206-
metadata = yaml.load(metadata_head.format(version=version))
229+
# Parse the YAML-Strings and stitch the building blocks together
230+
metadata = yaml.load(metadata_head.format(
231+
version=version, changes=changes))
207232
metadata['sources'] = source_list
208233
metadata['resources'] = yaml.load(resource_list)
209-
for resource in metadata['resources']:
210-
for field in resource['schema']['fields']:
234+
metadata['schemas'] = yaml.load(schemas_dict)
235+
236+
# Remove URL for source if a column is based on own calculations
237+
for schema in metadata['schemas'].values():
238+
for field in schema['fields']:
211239
if 'source' in field.keys() and field['source']['name'] == 'own calculation':
212240
del field['source']['web']
213241

214242
# write the metadata to disk
215243
datapackage_json = json.dumps(metadata, indent=4, separators=(',', ': '))
216244
with open('datapackage.json', 'w') as f:
217245
f.write(datapackage_json)
218-
219-
return
220246

247+
return
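After this commit, callers must supply the new changes argument. A hypothetical invocation, with every argument value invented for illustration (the real call sites live in the project's notebooks):

# df_15, df_60: prepared DataFrames with the column MultiIndex described above
data_sets = {'15min': df_15, '60min': df_60}
info_cols = {'utc': 'utc_timestamp',            # assumed key and column names
             'cet': 'cet_cest_timestamp',
             'marker': 'interpolated_values'}

make_json(data_sets, info_cols,
          version='2016-07-14',                      # invented version tag
          changes='Included day-ahead spot prices',  # invented change note
          headers=['region', 'variable', 'attribute', 'source', 'web'])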
