
Commit 6684886

Jonathan Mühlenpfordt committed
Fixed column name issues
- replaced variable_name "wind-total" by "wind" for Energinet.DK and Svenska Kraftnät in order to line up with other data sources
- removed special characters from column names
- improved automated generation of datapackage.json file
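The renames described above can be sketched on a toy pandas frame with the column levels this data package uses (the sample regions and values are made up; this is not the commit's actual code):

import pandas as pd

# Hypothetical frame with the column levels used in the time series package
cols = pd.MultiIndex.from_tuples(
    [('DK', 'wind-total', 'generation'), ('SE', 'wind-total', 'generation')],
    names=['region', 'variable', 'attribute'])
df = pd.DataFrame([[1.0, 2.0]], columns=cols)

# Align the variable name with the other data sources
df = df.rename(columns={'wind-total': 'wind'}, level='variable')

# Drop special characters such as parentheses from the column labels
df.columns = pd.MultiIndex.from_tuples(
    [tuple(level.replace('(', '').replace(')', '') for level in col)
     for col in df.columns],
    names=df.columns.names)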
1 parent d6864c4 commit 6684886

3 files changed: 80 additions, 89 deletions

processing.ipynb

Lines changed: 52 additions & 66 deletions
@@ -195,7 +195,7 @@
 "# been cached on the OPSD server as input.\n",
 "# All data from that version will be downloaded - subset will be ignored.\n",
 "# Type None to download directly from the original sources.\n",
-"archive_version = None # '2016-07-14'"
+"archive_version = None # i.e. '2016-07-14'"
 ]
 },
 {
@@ -234,15 +234,15 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"collapsed": true
+"collapsed": false
 },
 "outputs": [],
 "source": [
 "subset = yaml.load('''\n",
 "insert_source_here:\n",
 "- insert_dataset1_from_that_source_here\n",
 "- insert_dataset2_here\n",
-"more_sources...\n",
+"more_sources:\n",
 "- more_data_sets\n",
 "''') # Or\n",
 "subset = None"
@@ -269,7 +269,7 @@
 " sources = {source_name: {k: v\n",
 " for k, v in sources[source_name].items()\n",
 " if k in variable_list}\n",
-" for source_name, variable_list in subset.items()}\n"
+" for source_name, variable_list in subset.items()}"
 ]
 },
 {
@@ -459,7 +459,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Loop through sources andn variables to do the reading"
+"Loop through sources and variables to do the reading"
 ]
 },
 {
@@ -509,7 +509,7 @@
 "execution_count": null,
 "metadata": {
 "collapsed": false,
-"scrolled": false
+"scrolled": true
 },
 "outputs": [],
 "source": [
@@ -569,7 +569,7 @@
 },
 "outputs": [],
 "source": [
-"#data_sets['15min'] = pd.read_pickle('raw_15.pickle')\n",
+"data_sets['15min'] = pd.read_pickle('raw_15.pickle')\n",
 "data_sets['60min'] = pd.read_pickle('raw_60.pickle')"
 ]
 },
@@ -644,7 +644,7 @@
 },
 "outputs": [],
 "source": [
-"#%time data_sets['15min'], nan_table15 = find_nan(data_sets['15min'], headers, patch=True)\n",
+"%time data_sets['15min'], nan_table15 = find_nan(data_sets['15min'], headers, patch=True)\n",
 "%time data_sets['60min'], nan_table60 = find_nan(data_sets['60min'], headers, patch=True)"
 ]
 },
@@ -732,7 +732,7 @@
 },
 "outputs": [],
 "source": [
-"writer = pd.ExcelWriter('NaN_table60.xlsx')\n",
+"writer = pd.ExcelWriter('NaN_table.xlsx')\n",
 "nan_table15.to_excel(writer, '15min')\n",
 "nan_table60.to_excel(writer, '60min')\n",
 "writer.save()"
@@ -932,7 +932,7 @@
 "\n",
 "The marker column is resampled separately in such a way that all information on where data has been interpolated is preserved.\n",
 "\n",
-"The `.resample('H').mean()` methods calculates the means from the values for 4 quarter hours [:00, :15, :30, :45] of an hour values, inserts that for :00 and drops the other 3 entries. Takes 1 minute to run."
+"The `.resample('H').mean()` methods calculates the means from the values for 4 quarter hours [:00, :15, :30, :45] of an hour values, inserts that for :00 and drops the other 3 entries. Takes 15 seconds to run."
 ]
 },
 {
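For reference, a minimal illustration of the `.resample('H').mean()` behaviour described in the markdown cell above (toy values, not data from the package):

import pandas as pd

# Four quarter-hourly values for one hour (made-up numbers)
idx = pd.date_range('2016-01-01 00:00', periods=4, freq='15min', tz='UTC')
quarter_hourly = pd.Series([10.0, 20.0, 30.0, 40.0], index=idx)

hourly = quarter_hourly.resample('H').mean()
# hourly now holds a single entry at 00:00 with the mean value 25.0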
@@ -943,6 +943,7 @@
 },
 "outputs": [],
 "source": [
+"%%time\n",
 "def resample_markers(group):\n",
 " '''Resample marker column from 15 to 60 min\n",
 " \n",
@@ -965,33 +966,12 @@
 " aggregated_marker = '; '.join(set(unpacked)) + '; '\n",
 " else:\n",
 " aggregated_marker = np.nan\n",
-" return aggregated_marker"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"collapsed": false,
-"scrolled": true
-},
-"outputs": [],
-"source": [
-"%%time\n",
+" return aggregated_marker\n",
+"\n",
+"\n",
 "marker_col_15 = data_sets['15min']['comment']\n",
 "marker_col_15 = marker_col_15.groupby(\n",
-" pd.Grouper(freq='60Min', closed='left', label='left')).agg(resample_markers)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"collapsed": false
-},
-"outputs": [],
-"source": [
-"%%time\n",
+" pd.Grouper(freq='60Min', closed='left', label='left')).agg(resample_markers)\n",
 "marker_col_15 = marker_col_15.reindex(data_sets['60min'].index)\n",
 "data_sets['60min']['comment'] = (\n",
 " data_sets['60min']['comment']\n",
@@ -1018,18 +998,6 @@
 " data_sets['60min'] = resampled"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"collapsed": false,
-"scrolled": true
-},
-"outputs": [],
-"source": [
-"data_sets['60min']['2016-09-27 21:45:00':].shape"
-]
-},
 {
 "cell_type": "markdown",
 "metadata": {
@@ -1048,6 +1016,20 @@
 "The index column of th data sets defines the start of the timeperiod represented by each row of that data set in **UTC** time. We include an additional column for the **CE(S)T** Central European (Summer-) Time, as this might help aligning the output data with other data sources."
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": true
+},
+"outputs": [],
+"source": [
+"info_cols = {'utc': 'utc_timestamp',\n",
+" 'cet':'cet_cest_timestamp',\n",
+" 'marker': 'comment'}\n",
+"version = '2016-10-28'"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -1062,8 +1044,8 @@
 "for res_key, df in data_sets.items():\n",
 " if df.empty:\n",
 " continue\n",
-" df.index.rename('utc-timestamp', inplace=True)\n",
-" df.insert(0, 'ce(s)t-timestamp',\n",
+" df.index.rename(info_cols['utc'], inplace=True)\n",
+" df.insert(0, info_cols['cet'],\n",
 " df.index.tz_localize('UTC').tz_convert('Europe/Brussels'))"
 ]
 },
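The two added lines above rename the index to the UTC column name and insert the CE(S)T column. The same pattern, recreated on a toy two-row frame (values made up):

import pandas as pd

info_cols = {'utc': 'utc_timestamp', 'cet': 'cet_cest_timestamp'}

df = pd.DataFrame({'load': [100.0, 110.0]},
                  index=pd.date_range('2016-01-01 00:00', periods=2, freq='H'))

df.index.rename(info_cols['utc'], inplace=True)
df.insert(0, info_cols['cet'],
          df.index.tz_localize('UTC').tz_convert('Europe/Brussels'))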
@@ -1097,7 +1079,7 @@
 },
 "outputs": [],
 "source": [
-"make_json(data_sets, headers)"
+"make_json(data_sets, info_cols, version, headers)"
 ]
 },
 {
@@ -1200,23 +1182,25 @@
 " continue\n",
 "\n",
 " for col_name, col in df.iteritems():\n",
-" if not (col_name[0] in ['ce(s)t-timestamp', 'comment', 'marker'] or\n",
+" if not (col_name[0] in info_cols.values() or\n",
 " col_name[2] == 'profile'):\n",
 " df[col_name] = col.round(0)\n",
-" \n",
-" df_singleindex = df.copy()\n",
 "\n",
+" # MultIndex\n",
+" data_sets_multiindex[res_key + '_multiindex'] = df\n",
+"\n",
+" # SingleIndex\n",
+" df_singleindex = df.copy()\n",
 " # use first 3 levels of multiindex to create singleindex\n",
 " df_singleindex.columns = [\n",
-" col[0] if col[0] in ['ce(s)t-timestamp', 'comment']\n",
+" col[0] if col[0] in info_cols.values()\n",
 " else '_'.join(col[0:3]) for col in df.columns.values]\n",
 "\n",
 " data_sets_singleindex[res_key + '_singleindex'] = df_singleindex\n",
 "\n",
-" data_sets_multiindex[res_key + '_multiindex'] = df\n",
-"\n",
+" # Stacked\n",
 " stacked = df.copy()\n",
-" stacked.drop('ce(s)t-timestamp', axis=1, inplace=True)\n",
+" stacked.drop(info_cols['cet'], axis=1, inplace=True)\n",
 " stacked.columns = stacked.columns.droplevel(['source', 'web'])\n",
 " stacked = stacked.transpose().stack(dropna=True).to_frame(name='data')\n",
 " data_sets_stacked[res_key + '_stacked'] = stacked"
@@ -1260,9 +1244,9 @@
 " f = 'time_series_' + res_key\n",
 " df = df.copy()\n",
 " df.index = df.index.strftime('%Y-%m-%dT%H:%M:%SZ')\n",
-" df['ce(s)t-timestamp'] = df['ce(s)t-timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
+" df[info_cols['cet']] = df[info_cols['cet']].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
 " df.to_sql(f, sqlite3.connect('time_series.sqlite'),\n",
-" if_exists='replace', index_label='utc-timestamp')"
+" if_exists='replace', index_label=info_cols['utc'])"
 ]
 },
 {
@@ -1284,7 +1268,7 @@
 }
 },
 "source": [
-"Writing the full tables to Excel takes extremely long. As a workaroun, only the first five rows are exported. The rest of the data is inserted manually from the CSV."
+"Writing the full tables to Excel takes extremely long. As a workaround, only the first 5 rows are exported. The rest of the data can than be inserted manually from the `_multindex.csv` files."
 ]
 },
 {
@@ -1341,15 +1325,17 @@
 "%%time\n",
 "# itertoools.chain() allows iterating over multiple dicts at once\n",
 "for res_stacking_key, df in itertools.chain(\n",
-" #data_sets_singleindex.items(),\n",
-" #data_sets_multiindex.items(),):\n",
-" data_sets_stacked.items()):\n",
-" # convert the format of the ce(s)t-timestamp to ISO-8601\n",
-" if not res_stacking_key in ['15min_stacked', '60min_stacked']:\n",
+" data_sets_singleindex.items(),\n",
+" data_sets_multiindex.items(),\n",
+" data_sets_stacked.items()\n",
+" ):\n",
+" # convert the format of the cet_cest-timestamp to ISO-8601\n",
+" if not (res_stacking_key in ['15min_stacked', '60min_stacked']\n",
+" or type(df.iloc[0,0]) == str):\n",
 " df.iloc[:,0] = df.iloc[:,0].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
 " f = 'time_series_' + res_stacking_key\n",
 " df.to_csv(f + '.csv', float_format='%.2f',\n",
-" date_format='%Y-%m-%dT%H:%M:%S%z')"
+" date_format='%Y-%m-%dT%H:%M:%SZ')"
 ]
 }
 ],
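A short sketch of the `itertools.chain()` idiom used in the export loop above, with placeholder strings standing in for the DataFrames:

import itertools

data_sets_singleindex = {'60min_singleindex': 'df1'}
data_sets_multiindex = {'60min_multiindex': 'df2'}
data_sets_stacked = {'60min_stacked': 'df3'}

# One loop over the items of all three dicts
for key, value in itertools.chain(data_sets_singleindex.items(),
                                  data_sets_multiindex.items(),
                                  data_sets_stacked.items()):
    print(key, value)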

timeseries_scripts/make_json.py

Lines changed: 24 additions & 19 deletions
@@ -33,9 +33,9 @@
 minutes) is provided in a separate file. All data processing is
 conducted in python and pandas and has been documented in the
 Jupyter notebooks linked below.
-documentation: https://github.com/Open-Power-System-Data/datapackage_timeseries/blob/2016-10-27/main.ipynb
+documentation: https://github.com/Open-Power-System-Data/datapackage_timeseries/blob/{version}/main.ipynb

-version: '2016-10-27'
+version: '{version}'

 last_changes: Included data from CEPS and PSE

@@ -79,7 +79,7 @@
 - path: time_series_{res_key}_singleindex.csv
 stacking: Singleindex
 format: csv
-- path: time_series_{res_key}.xlsx
+- path: time_series.xlsx
 stacking: Multiindex
 format: xlsx
 - path: time_series_{res_key}_multiindex.csv
@@ -89,22 +89,22 @@
 stacking: Stacked
 format: csv
 schema:
-primaryKey: timestamp
+primaryKey: {utc}
 missingValue: ""
 fields:
 '''

 indexfield = '''
-- name: utc-timestamp
-description: Start of timeperiod in UTC
+- name: {utc}
+description: Start of timeperiod in Coordinated Universal Time
 type: datetime
 format: fmt:%Y-%m-%dT%H%M%SZ
 opsd-contentfilter: true
-- name: ce(s)t-timestamp
-description: Start of timeperiod in CE(S)T
+- name: {cet}
+description: Start of timeperiod in Central European (Summer-) Time
 type: datetime
 format: fmt:%Y-%m-%dT%H%M%S%z
-- name: comment
+- name: {marker}
 description: marker to indicate which columns are missing data in source data and has been interpolated (e.g. solar_DE-transnetbw_generation;)
 type: string
 '''
@@ -129,9 +129,8 @@
 forecast: Forecasted {tech} generation forecast in {geo} in MW
 capacity: Electrical capacity of {tech} in {geo} in MW
 profile: Share of {tech} capacity producing in {geo}
-offshoreshare: {tech} actual offshore generation in {geo} in MW
-EPEX: lalala
-Elspot: dududu
+epex: Day-ahead spot price for {geo}
+elspot: Day-ahead spot price for {geo}
 '''

 # Columns-specific metadata
@@ -145,7 +144,7 @@
 # metadata.


-def make_json(data_sets, headers):
+def make_json(data_sets, info_cols, version, headers):
 '''
 Create a datapackage.json file that complies with the Frictionless
 data JSON Table Schema from the information in the column-MultiIndex.
@@ -155,6 +154,11 @@ def make_json(data_sets, headers):
 data_sets: dict of pandas.DataFrames
 A dict with keys '15min' and '60min' and values the respective
 DataFrames
+info_cols : dict of strings
+Names for non-data columns such as for the index, for additional
+timestamps or the marker column
+version: str
+Version tag of the Data Package
 headers : list
 List of strings indicating the level names of the pandas.MultiIndex
 for the columns of the dataframe.
@@ -168,13 +172,14 @@ def make_json(data_sets, headers):
 resource_list = '' # list of files included in the datapackage
 source_list = '' # list of sources were data comes from
 for res_key, df in data_sets.items():
-field_list = indexfield # list of of columns in a file, starting with the index field
+# Create the list of of columns in a file, starting with the index field
+field_list = indexfield.format(**info_cols)
 for col in df.columns:
-if col[0] in ['ce(s)t-timestamp', 'comment']:
+if col[0] in info_cols.values():
 continue
 h = {k: v for k, v in zip(headers, col)}
 if len(h['region']) > 2:
-geo = h['region'] + ' control area'
+geo = h['region'] + ' balancing area'
 elif h['region'] == 'NI':
 geo = 'Northern Ireland'
 elif h['region'] == 'CS':
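A cut-down sketch of how `indexfield.format(**info_cols)` fills the template, using an abbreviated version of the `indexfield` string defined earlier (parsed here with `yaml.safe_load` for brevity):

import yaml

indexfield = '''
- name: {utc}
  description: Start of timeperiod in Coordinated Universal Time
- name: {cet}
  description: Start of timeperiod in Central European (Summer-) Time
- name: {marker}
  description: marker for columns with interpolated data
'''

info_cols = {'utc': 'utc_timestamp',
             'cet': 'cet_cest_timestamp',
             'marker': 'comment'}

field_list = indexfield.format(**info_cols)
print(yaml.safe_load(field_list))
# [{'name': 'utc_timestamp', ...}, {'name': 'cet_cest_timestamp', ...}, ...]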
@@ -189,16 +194,16 @@
 field_list = field_list + field_template.format(**h)
 source_list = source_list + source_template.format(**h)
 resource_list = resource_list + \
-resource_template.format(res_key=res_key) + field_list
+resource_template.format(res_key=res_key, **info_cols) + field_list

 # Remove duplicates from sources_list. set() returns unique values from a
-# collection, butit cannot compare dicts. Since source_list is a list of of
+# collection, but it cannot compare dicts. Since source_list is a list of of
 # dicts, this requires some juggling with data types
 source_list = [dict(tupleized)
 for tupleized in set(tuple(entry.items())
 for entry in yaml.load(source_list))]

-metadata = yaml.load(metadata_head)
+metadata = yaml.load(metadata_head.format(version=version))
 metadata['sources'] = source_list
 metadata['resources'] = yaml.load(resource_list)
 for resource in metadata['resources']:
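The "juggling with data types" mentioned in the comment above can be reproduced in isolation; the sample source entries below are made up:

# dicts are not hashable, so each one is turned into a tuple of items for set()
# and converted back into a dict afterwards (order is not preserved)
source_list = [{'name': 'ENTSO-E', 'web': 'http://example.com'},
               {'name': 'ENTSO-E', 'web': 'http://example.com'},
               {'name': 'Energinet.DK', 'web': 'http://example.com'}]

deduplicated = [dict(tupleized)
                for tupleized in set(tuple(entry.items())
                                     for entry in source_list)]
# deduplicated contains the ENTSO-E and Energinet.DK entries once each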
