|
195 | 195 | "# been cached on the OPSD server as input.\n", |
196 | 196 | "# All data from that version will be downloaded - subset will be ignored.\n", |
197 | 197 | "# Type None to download directly from the original sources.\n", |
198 | | - "archive_version = None # '2016-07-14'" |
| 198 | + "archive_version = None # i.e. '2016-07-14'" |
199 | 199 | ] |
200 | 200 | }, |
201 | 201 | { |
|
234 | 234 | "cell_type": "code", |
235 | 235 | "execution_count": null, |
236 | 236 | "metadata": { |
237 | | - "collapsed": true |
| 237 | + "collapsed": false |
238 | 238 | }, |
239 | 239 | "outputs": [], |
240 | 240 | "source": [ |
241 | 241 | "subset = yaml.load('''\n", |
242 | 242 | "insert_source_here:\n", |
243 | 243 | "- insert_dataset1_from_that_source_here\n", |
244 | 244 | "- insert_dataset2_here\n", |
245 | | - "more_sources...\n", |
| 245 | + "more_sources:\n", |
246 | 246 | "- more_data_sets\n", |
247 | 247 | "''') # Or\n", |
248 | 248 | "subset = None" |
|
269 | 269 | " sources = {source_name: {k: v\n", |
270 | 270 | " for k, v in sources[source_name].items()\n", |
271 | 271 | " if k in variable_list}\n", |
272 | | - " for source_name, variable_list in subset.items()}\n" |
| 272 | + " for source_name, variable_list in subset.items()}" |
273 | 273 | ] |
274 | 274 | }, |
275 | 275 | { |
|
459 | 459 | "cell_type": "markdown", |
460 | 460 | "metadata": {}, |
461 | 461 | "source": [ |
462 | | - "Loop through sources andn variables to do the reading" |
| 462 | + "Loop through sources and variables to do the reading" |
463 | 463 | ] |
464 | 464 | }, |
465 | 465 | { |
|
509 | 509 | "execution_count": null, |
510 | 510 | "metadata": { |
511 | 511 | "collapsed": false, |
512 | | - "scrolled": false |
| 512 | + "scrolled": true |
513 | 513 | }, |
514 | 514 | "outputs": [], |
515 | 515 | "source": [ |
|
569 | 569 | }, |
570 | 570 | "outputs": [], |
571 | 571 | "source": [ |
572 | | - "#data_sets['15min'] = pd.read_pickle('raw_15.pickle')\n", |
| 572 | + "data_sets['15min'] = pd.read_pickle('raw_15.pickle')\n", |
573 | 573 | "data_sets['60min'] = pd.read_pickle('raw_60.pickle')" |
574 | 574 | ] |
575 | 575 | }, |
|
644 | 644 | }, |
645 | 645 | "outputs": [], |
646 | 646 | "source": [ |
647 | | - "#%time data_sets['15min'], nan_table15 = find_nan(data_sets['15min'], headers, patch=True)\n", |
| 647 | + "%time data_sets['15min'], nan_table15 = find_nan(data_sets['15min'], headers, patch=True)\n", |
648 | 648 | "%time data_sets['60min'], nan_table60 = find_nan(data_sets['60min'], headers, patch=True)" |
649 | 649 | ] |
650 | 650 | }, |
|
732 | 732 | }, |
733 | 733 | "outputs": [], |
734 | 734 | "source": [ |
735 | | - "writer = pd.ExcelWriter('NaN_table60.xlsx')\n", |
| 735 | + "writer = pd.ExcelWriter('NaN_table.xlsx')\n", |
736 | 736 | "nan_table15.to_excel(writer, '15min')\n", |
737 | 737 | "nan_table60.to_excel(writer, '60min')\n", |
738 | 738 | "writer.save()" |
|
932 | 932 | "\n", |
933 | 933 | "The marker column is resampled separately in such a way that all information on where data has been interpolated is preserved.\n", |
934 | 934 | "\n", |
935 | | - "The `.resample('H').mean()` methods calculates the means from the values for 4 quarter hours [:00, :15, :30, :45] of an hour values, inserts that for :00 and drops the other 3 entries. Takes 1 minute to run." |
| 935 | + "The `.resample('H').mean()` methods calculates the means from the values for 4 quarter hours [:00, :15, :30, :45] of an hour values, inserts that for :00 and drops the other 3 entries. Takes 15 seconds to run." |
936 | 936 | ] |
937 | 937 | }, |
938 | 938 | { |
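As a rough illustration of the hourly downsampling described above, a minimal sketch on a toy 15-minute series (the variable names and values are made up, not taken from the notebook):

```python
import numpy as np
import pandas as pd

# Toy 15-minute series: two days of made-up load values
index_15 = pd.date_range('2016-01-01', periods=192, freq='15Min', tz='UTC')
load_15 = pd.Series(np.random.rand(192) * 1000, index=index_15)

# Downsample to hourly means: the 4 quarter-hour values [:00, :15, :30, :45]
# of each hour are averaged, written to the :00 timestamp, and the other
# 3 entries are dropped
load_60 = load_15.resample('60Min', closed='left', label='left').mean()
print(load_60.head())
```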
|
943 | 943 | }, |
944 | 944 | "outputs": [], |
945 | 945 | "source": [ |
| 946 | + "%%time\n", |
946 | 947 | "def resample_markers(group):\n", |
947 | 948 | " '''Resample marker column from 15 to 60 min\n", |
948 | 949 | " \n", |
|
965 | 966 | " aggregated_marker = '; '.join(set(unpacked)) + '; '\n", |
966 | 967 | " else:\n", |
967 | 968 | " aggregated_marker = np.nan\n", |
968 | | - " return aggregated_marker" |
969 | | - ] |
970 | | - }, |
971 | | - { |
972 | | - "cell_type": "code", |
973 | | - "execution_count": null, |
974 | | - "metadata": { |
975 | | - "collapsed": false, |
976 | | - "scrolled": true |
977 | | - }, |
978 | | - "outputs": [], |
979 | | - "source": [ |
980 | | - "%%time\n", |
| 969 | + " return aggregated_marker\n", |
| 970 | + "\n", |
| 971 | + "\n", |
981 | 972 | "marker_col_15 = data_sets['15min']['comment']\n", |
982 | 973 | "marker_col_15 = marker_col_15.groupby(\n", |
983 | | - " pd.Grouper(freq='60Min', closed='left', label='left')).agg(resample_markers)" |
984 | | - ] |
985 | | - }, |
986 | | - { |
987 | | - "cell_type": "code", |
988 | | - "execution_count": null, |
989 | | - "metadata": { |
990 | | - "collapsed": false |
991 | | - }, |
992 | | - "outputs": [], |
993 | | - "source": [ |
994 | | - "%%time\n", |
| 974 | + " pd.Grouper(freq='60Min', closed='left', label='left')).agg(resample_markers)\n", |
995 | 975 | "marker_col_15 = marker_col_15.reindex(data_sets['60min'].index)\n", |
996 | 976 | "data_sets['60min']['comment'] = (\n", |
997 | 977 | " data_sets['60min']['comment']\n", |
|
1018 | 998 | " data_sets['60min'] = resampled" |
1019 | 999 | ] |
1020 | 1000 | }, |
1021 | | - { |
1022 | | - "cell_type": "code", |
1023 | | - "execution_count": null, |
1024 | | - "metadata": { |
1025 | | - "collapsed": false, |
1026 | | - "scrolled": true |
1027 | | - }, |
1028 | | - "outputs": [], |
1029 | | - "source": [ |
1030 | | - "data_sets['60min']['2016-09-27 21:45:00':].shape" |
1031 | | - ] |
1032 | | - }, |
1033 | 1001 | { |
1034 | 1002 | "cell_type": "markdown", |
1035 | 1003 | "metadata": { |
|
1048 | 1016 | "The index column of the data sets defines the start of the time period represented by each row of that data set in **UTC** time. We include an additional column for the **CE(S)T** Central European (Summer-) Time, as this might help to align the output data with other data sources."
1049 | 1017 | ] |
1050 | 1018 | }, |
| 1019 | + { |
| 1020 | + "cell_type": "code", |
| 1021 | + "execution_count": null, |
| 1022 | + "metadata": { |
| 1023 | + "collapsed": true |
| 1024 | + }, |
| 1025 | + "outputs": [], |
| 1026 | + "source": [ |
| 1027 | + "info_cols = {'utc': 'utc_timestamp',\n", |
| 1028 | + " 'cet':'cet_cest_timestamp',\n", |
| 1029 | + " 'marker': 'comment'}\n", |
| 1030 | + "version = '2016-10-28'" |
| 1031 | + ] |
| 1032 | + }, |
1051 | 1033 | { |
1052 | 1034 | "cell_type": "code", |
1053 | 1035 | "execution_count": null, |
|
1062 | 1044 | "for res_key, df in data_sets.items():\n", |
1063 | 1045 | " if df.empty:\n", |
1064 | 1046 | " continue\n", |
1065 | | - " df.index.rename('utc-timestamp', inplace=True)\n", |
1066 | | - " df.insert(0, 'ce(s)t-timestamp',\n", |
| 1047 | + "    df.index.rename(info_cols['utc'], inplace=True)\n",
| 1048 | + "    df.insert(0, info_cols['cet'],\n",
1067 | 1049 | " df.index.tz_localize('UTC').tz_convert('Europe/Brussels'))" |
1068 | 1050 | ] |
1069 | 1051 | }, |
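For illustration, the timestamp handling from the cell above can be sketched in isolation on a toy frame (values are made up; the column names follow `info_cols`):

```python
import pandas as pd

# Toy hourly frame with a timezone-naive index that is meant to be UTC
idx = pd.date_range('2016-03-27 00:00', periods=5, freq='60Min')
df = pd.DataFrame({'load': [100, 110, 120, 130, 140]}, index=idx)

# Name the index and add a CE(S)T column: the same instants in Brussels local
# time, which jumps from 02:00 CET straight to 03:00 CEST at the DST switch
# on 2016-03-27
df.index.rename('utc_timestamp', inplace=True)
df.insert(0, 'cet_cest_timestamp',
          df.index.tz_localize('UTC').tz_convert('Europe/Brussels'))
print(df)
```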
|
1097 | 1079 | }, |
1098 | 1080 | "outputs": [], |
1099 | 1081 | "source": [ |
1100 | | - "make_json(data_sets, headers)" |
| 1082 | + "make_json(data_sets, info_cols, version, headers)" |
1101 | 1083 | ] |
1102 | 1084 | }, |
1103 | 1085 | { |
|
1200 | 1182 | " continue\n", |
1201 | 1183 | "\n", |
1202 | 1184 | " for col_name, col in df.iteritems():\n", |
1203 | | - " if not (col_name[0] in ['ce(s)t-timestamp', 'comment', 'marker'] or\n", |
| 1185 | + " if not (col_name[0] in info_cols.values() or\n", |
1204 | 1186 | " col_name[2] == 'profile'):\n", |
1205 | 1187 | " df[col_name] = col.round(0)\n", |
1206 | | - " \n", |
1207 | | - " df_singleindex = df.copy()\n", |
1208 | 1188 | "\n", |
| 1189 | + "    # MultiIndex\n",
| 1190 | + " data_sets_multiindex[res_key + '_multiindex'] = df\n", |
| 1191 | + "\n", |
| 1192 | + " # SingleIndex\n", |
| 1193 | + " df_singleindex = df.copy()\n", |
1209 | 1194 | " # use first 3 levels of multiindex to create singleindex\n", |
1210 | 1195 | " df_singleindex.columns = [\n", |
1211 | | - " col[0] if col[0] in ['ce(s)t-timestamp', 'comment']\n", |
| 1196 | + " col[0] if col[0] in info_cols.values()\n", |
1212 | 1197 | " else '_'.join(col[0:3]) for col in df.columns.values]\n", |
1213 | 1198 | "\n", |
1214 | 1199 | " data_sets_singleindex[res_key + '_singleindex'] = df_singleindex\n", |
1215 | 1200 | "\n", |
1216 | | - " data_sets_multiindex[res_key + '_multiindex'] = df\n", |
1217 | | - "\n", |
| 1201 | + " # Stacked\n", |
1218 | 1202 | " stacked = df.copy()\n", |
1219 | | - " stacked.drop('ce(s)t-timestamp', axis=1, inplace=True)\n", |
| 1203 | + " stacked.drop(info_cols['cet'], axis=1, inplace=True)\n", |
1220 | 1204 | " stacked.columns = stacked.columns.droplevel(['source', 'web'])\n", |
1221 | 1205 | " stacked = stacked.transpose().stack(dropna=True).to_frame(name='data')\n", |
1222 | 1206 | " data_sets_stacked[res_key + '_stacked'] = stacked" |
|
1260 | 1244 | " f = 'time_series_' + res_key\n", |
1261 | 1245 | " df = df.copy()\n", |
1262 | 1246 | " df.index = df.index.strftime('%Y-%m-%dT%H:%M:%SZ')\n", |
1263 | | - " df['ce(s)t-timestamp'] = df['ce(s)t-timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n", |
| 1247 | + " df[info_cols['cet']] = df[info_cols['cet']].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n", |
1264 | 1248 | " df.to_sql(f, sqlite3.connect('time_series.sqlite'),\n", |
1265 | | - " if_exists='replace', index_label='utc-timestamp')" |
| 1249 | + " if_exists='replace', index_label=info_cols['utc'])" |
1266 | 1250 | ] |
1267 | 1251 | }, |
1268 | 1252 | { |
|
1284 | 1268 | } |
1285 | 1269 | }, |
1286 | 1270 | "source": [ |
1287 | | - "Writing the full tables to Excel takes extremely long. As a workaroun, only the first five rows are exported. The rest of the data is inserted manually from the CSV." |
| 1271 | + "Writing the full tables to Excel takes extremely long. As a workaround, only the first 5 rows are exported. The rest of the data can than be inserted manually from the `_multindex.csv` files." |
1288 | 1272 | ] |
1289 | 1273 | }, |
1290 | 1274 | { |
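A minimal sketch of that workaround on a hypothetical frame (the frame and file name are made up, not the notebook's own):

```python
import pandas as pd

# Hypothetical stand-in for one of the output data sets
df = pd.DataFrame({'a': range(100), 'b': range(100)})

# Write only the first 5 rows to Excel; the complete data stays in the CSV files
writer = pd.ExcelWriter('time_series_excerpt.xlsx')
df.head(5).to_excel(writer, sheet_name='60min')
writer.save()
```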
|
1341 | 1325 | "%%time\n", |
1342 | 1326 | "# itertools.chain() allows iterating over multiple dicts at once\n",
1343 | 1327 | "for res_stacking_key, df in itertools.chain(\n", |
1344 | | - " #data_sets_singleindex.items(),\n", |
1345 | | - " #data_sets_multiindex.items(),):\n", |
1346 | | - " data_sets_stacked.items()):\n", |
1347 | | - " # convert the format of the ce(s)t-timestamp to ISO-8601\n", |
1348 | | - " if not res_stacking_key in ['15min_stacked', '60min_stacked']:\n", |
| 1328 | + " data_sets_singleindex.items(),\n", |
| 1329 | + " data_sets_multiindex.items(),\n", |
| 1330 | + " data_sets_stacked.items()\n", |
| 1331 | + " ):\n", |
| 1332 | + "    # convert the format of the cet_cest_timestamp to ISO-8601\n",
| 1333 | + " if not (res_stacking_key in ['15min_stacked', '60min_stacked']\n", |
| 1334 | + " or type(df.iloc[0,0]) == str):\n", |
1349 | 1335 | " df.iloc[:,0] = df.iloc[:,0].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n", |
1350 | 1336 | " f = 'time_series_' + res_stacking_key\n", |
1351 | 1337 | " df.to_csv(f + '.csv', float_format='%.2f',\n", |
1352 | | - " date_format='%Y-%m-%dT%H:%M:%S%z')" |
| 1338 | + " date_format='%Y-%m-%dT%H:%M:%SZ')" |
1353 | 1339 | ] |
1354 | 1340 | } |
1355 | 1341 | ], |
|