
Commit 6684886

Jonathan Mühlenpfordt committed
Fixed column name issues
- replaced variable_name "wind-total" by "wind" for Energinet.DK and Svenska Kraftnät in order to line up with other data sources
- removed special characters from column names
- improved automated generation of datapackage.json file
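The renames described above can be sketched on a toy pandas frame with the column levels this data package uses (the sample regions and values are made up; this is not the commit's actual code):

import pandas as pd

# Hypothetical frame with the column levels used in the time series package
cols = pd.MultiIndex.from_tuples(
    [('DK', 'wind-total', 'generation'), ('SE', 'wind-total', 'generation')],
    names=['region', 'variable', 'attribute'])
df = pd.DataFrame([[1.0, 2.0]], columns=cols)

# Align the variable name with the other data sources
df = df.rename(columns={'wind-total': 'wind'}, level='variable')

# Drop special characters such as parentheses from the column labels
df.columns = pd.MultiIndex.from_tuples(
    [tuple(level.replace('(', '').replace(')', '') for level in col)
     for col in df.columns],
    names=df.columns.names)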
1 parent d6864c4 commit 6684886

3 files changed: 80 additions, 89 deletions

processing.ipynb

Lines changed: 52 additions & 66 deletions
@@ -195,7 +195,7 @@
 "# been cached on the OPSD server as input.\n",
 "# All data from that version will be downloaded - subset will be ignored.\n",
 "# Type None to download directly from the original sources.\n",
-"archive_version = None # '2016-07-14'"
+"archive_version = None # i.e. '2016-07-14'"
 ]
 },
 {
@@ -234,15 +234,15 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
-"collapsed": true
+"collapsed": false
 },
 "outputs": [],
 "source": [
 "subset = yaml.load('''\n",
 "insert_source_here:\n",
 "- insert_dataset1_from_that_source_here\n",
 "- insert_dataset2_here\n",
-"more_sources...\n",
+"more_sources:\n",
 "- more_data_sets\n",
 "''') # Or\n",
 "subset = None"
@@ -269,7 +269,7 @@
 " sources = {source_name: {k: v\n",
 " for k, v in sources[source_name].items()\n",
 " if k in variable_list}\n",
-" for source_name, variable_list in subset.items()}\n"
+" for source_name, variable_list in subset.items()}"
 ]
 },
 {
@@ -459,7 +459,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Loop through sources andn variables to do the reading"
+"Loop through sources and variables to do the reading"
 ]
 },
 {
@@ -509,7 +509,7 @@
 "execution_count": null,
 "metadata": {
 "collapsed": false,
-"scrolled": false
+"scrolled": true
 },
 "outputs": [],
 "source": [
@@ -569,7 +569,7 @@
 },
 "outputs": [],
 "source": [
-"#data_sets['15min'] = pd.read_pickle('raw_15.pickle')\n",
+"data_sets['15min'] = pd.read_pickle('raw_15.pickle')\n",
 "data_sets['60min'] = pd.read_pickle('raw_60.pickle')"
 ]
 },
@@ -644,7 +644,7 @@
 },
 "outputs": [],
 "source": [
-"#%time data_sets['15min'], nan_table15 = find_nan(data_sets['15min'], headers, patch=True)\n",
+"%time data_sets['15min'], nan_table15 = find_nan(data_sets['15min'], headers, patch=True)\n",
 "%time data_sets['60min'], nan_table60 = find_nan(data_sets['60min'], headers, patch=True)"
 ]
 },
@@ -732,7 +732,7 @@
 },
 "outputs": [],
 "source": [
-"writer = pd.ExcelWriter('NaN_table60.xlsx')\n",
+"writer = pd.ExcelWriter('NaN_table.xlsx')\n",
 "nan_table15.to_excel(writer, '15min')\n",
 "nan_table60.to_excel(writer, '60min')\n",
 "writer.save()"
@@ -932,7 +932,7 @@
 "\n",
 "The marker column is resampled separately in such a way that all information on where data has been interpolated is preserved.\n",
 "\n",
-"The `.resample('H').mean()` methods calculates the means from the values for 4 quarter hours [:00, :15, :30, :45] of an hour values, inserts that for :00 and drops the other 3 entries. Takes 1 minute to run."
+"The `.resample('H').mean()` methods calculates the means from the values for 4 quarter hours [:00, :15, :30, :45] of an hour values, inserts that for :00 and drops the other 3 entries. Takes 15 seconds to run."
 ]
 },
 {
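For reference, a minimal illustration of the `.resample('H').mean()` behaviour described in the markdown cell above (toy values, not data from the package):

import pandas as pd

# Four quarter-hourly values for one hour (made-up numbers)
idx = pd.date_range('2016-01-01 00:00', periods=4, freq='15min', tz='UTC')
quarter_hourly = pd.Series([10.0, 20.0, 30.0, 40.0], index=idx)

hourly = quarter_hourly.resample('H').mean()
# hourly now holds a single entry at 00:00 with the mean value 25.0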
@@ -943,6 +943,7 @@
 },
 "outputs": [],
 "source": [
+"%%time\n",
 "def resample_markers(group):\n",
 " '''Resample marker column from 15 to 60 min\n",
 " \n",
@@ -965,33 +966,12 @@
 " aggregated_marker = '; '.join(set(unpacked)) + '; '\n",
 " else:\n",
 " aggregated_marker = np.nan\n",
-" return aggregated_marker"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"collapsed": false,
-"scrolled": true
-},
-"outputs": [],
-"source": [
-"%%time\n",
+" return aggregated_marker\n",
+"\n",
+"\n",
 "marker_col_15 = data_sets['15min']['comment']\n",
 "marker_col_15 = marker_col_15.groupby(\n",
-" pd.Grouper(freq='60Min', closed='left', label='left')).agg(resample_markers)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"collapsed": false
-},
-"outputs": [],
-"source": [
-"%%time\n",
+" pd.Grouper(freq='60Min', closed='left', label='left')).agg(resample_markers)\n",
 "marker_col_15 = marker_col_15.reindex(data_sets['60min'].index)\n",
 "data_sets['60min']['comment'] = (\n",
 " data_sets['60min']['comment']\n",
@@ -1018,18 +998,6 @@
 " data_sets['60min'] = resampled"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"collapsed": false,
-"scrolled": true
-},
-"outputs": [],
-"source": [
-"data_sets['60min']['2016-09-27 21:45:00':].shape"
-]
-},
 {
 "cell_type": "markdown",
 "metadata": {
@@ -1048,6 +1016,20 @@
 "The index column of th data sets defines the start of the timeperiod represented by each row of that data set in **UTC** time. We include an additional column for the **CE(S)T** Central European (Summer-) Time, as this might help aligning the output data with other data sources."
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": true
+},
+"outputs": [],
+"source": [
+"info_cols = {'utc': 'utc_timestamp',\n",
+" 'cet':'cet_cest_timestamp',\n",
+" 'marker': 'comment'}\n",
+"version = '2016-10-28'"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -1062,8 +1044,8 @@
 "for res_key, df in data_sets.items():\n",
 " if df.empty:\n",
 " continue\n",
-" df.index.rename('utc-timestamp', inplace=True)\n",
-" df.insert(0, 'ce(s)t-timestamp',\n",
+" df.index.rename(info_cols['utc'], inplace=True)\n",
+" df.insert(0, info_cols['cet'],\n",
 " df.index.tz_localize('UTC').tz_convert('Europe/Brussels'))"
 ]
 },
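The two added lines above rename the index to the UTC column name and insert the CE(S)T column. The same pattern, recreated on a toy two-row frame (values made up):

import pandas as pd

info_cols = {'utc': 'utc_timestamp', 'cet': 'cet_cest_timestamp'}

df = pd.DataFrame({'load': [100.0, 110.0]},
                  index=pd.date_range('2016-01-01 00:00', periods=2, freq='H'))

df.index.rename(info_cols['utc'], inplace=True)
df.insert(0, info_cols['cet'],
          df.index.tz_localize('UTC').tz_convert('Europe/Brussels'))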
@@ -1097,7 +1079,7 @@
 },
 "outputs": [],
 "source": [
-"make_json(data_sets, headers)"
+"make_json(data_sets, info_cols, version, headers)"
 ]
 },
 {
@@ -1200,23 +1182,25 @@
 " continue\n",
 "\n",
 " for col_name, col in df.iteritems():\n",
-" if not (col_name[0] in ['ce(s)t-timestamp', 'comment', 'marker'] or\n",
+" if not (col_name[0] in info_cols.values() or\n",
 " col_name[2] == 'profile'):\n",
 " df[col_name] = col.round(0)\n",
-" \n",
-" df_singleindex = df.copy()\n",
 "\n",
+" # MultIndex\n",
+" data_sets_multiindex[res_key + '_multiindex'] = df\n",
+"\n",
+" # SingleIndex\n",
+" df_singleindex = df.copy()\n",
 " # use first 3 levels of multiindex to create singleindex\n",
 " df_singleindex.columns = [\n",
-" col[0] if col[0] in ['ce(s)t-timestamp', 'comment']\n",
+" col[0] if col[0] in info_cols.values()\n",
 " else '_'.join(col[0:3]) for col in df.columns.values]\n",
 "\n",
 " data_sets_singleindex[res_key + '_singleindex'] = df_singleindex\n",
 "\n",
-" data_sets_multiindex[res_key + '_multiindex'] = df\n",
-"\n",
+" # Stacked\n",
 " stacked = df.copy()\n",
-" stacked.drop('ce(s)t-timestamp', axis=1, inplace=True)\n",
+" stacked.drop(info_cols['cet'], axis=1, inplace=True)\n",
 " stacked.columns = stacked.columns.droplevel(['source', 'web'])\n",
 " stacked = stacked.transpose().stack(dropna=True).to_frame(name='data')\n",
 " data_sets_stacked[res_key + '_stacked'] = stacked"
@@ -1260,9 +1244,9 @@
 " f = 'time_series_' + res_key\n",
 " df = df.copy()\n",
 " df.index = df.index.strftime('%Y-%m-%dT%H:%M:%SZ')\n",
-" df['ce(s)t-timestamp'] = df['ce(s)t-timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
+" df[info_cols['cet']] = df[info_cols['cet']].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
 " df.to_sql(f, sqlite3.connect('time_series.sqlite'),\n",
-" if_exists='replace', index_label='utc-timestamp')"
+" if_exists='replace', index_label=info_cols['utc'])"
 ]
 },
 {
@@ -1284,7 +1268,7 @@
 }
 },
 "source": [
-"Writing the full tables to Excel takes extremely long. As a workaroun, only the first five rows are exported. The rest of the data is inserted manually from the CSV."
+"Writing the full tables to Excel takes extremely long. As a workaround, only the first 5 rows are exported. The rest of the data can than be inserted manually from the `_multindex.csv` files."
 ]
 },
 {
@@ -1341,15 +1325,17 @@
 "%%time\n",
 "# itertoools.chain() allows iterating over multiple dicts at once\n",
 "for res_stacking_key, df in itertools.chain(\n",
-" #data_sets_singleindex.items(),\n",
-" #data_sets_multiindex.items(),):\n",
-" data_sets_stacked.items()):\n",
-" # convert the format of the ce(s)t-timestamp to ISO-8601\n",
-" if not res_stacking_key in ['15min_stacked', '60min_stacked']:\n",
+" data_sets_singleindex.items(),\n",
+" data_sets_multiindex.items(),\n",
+" data_sets_stacked.items()\n",
+" ):\n",
+" # convert the format of the cet_cest-timestamp to ISO-8601\n",
+" if not (res_stacking_key in ['15min_stacked', '60min_stacked']\n",
+" or type(df.iloc[0,0]) == str):\n",
 " df.iloc[:,0] = df.iloc[:,0].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
 " f = 'time_series_' + res_stacking_key\n",
 " df.to_csv(f + '.csv', float_format='%.2f',\n",
-" date_format='%Y-%m-%dT%H:%M:%S%z')"
+" date_format='%Y-%m-%dT%H:%M:%SZ')"
 ]
 }
 ],
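A short sketch of the `itertools.chain()` idiom used in the export loop above, with placeholder strings standing in for the DataFrames:

import itertools

data_sets_singleindex = {'60min_singleindex': 'df1'}
data_sets_multiindex = {'60min_multiindex': 'df2'}
data_sets_stacked = {'60min_stacked': 'df3'}

# One loop over the items of all three dicts
for key, value in itertools.chain(data_sets_singleindex.items(),
                                  data_sets_multiindex.items(),
                                  data_sets_stacked.items()):
    print(key, value)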

timeseries_scripts/make_json.py

Lines changed: 24 additions & 19 deletions
@@ -33,9 +33,9 @@
 minutes) is provided in a separate file. All data processing is
 conducted in python and pandas and has been documented in the
 Jupyter notebooks linked below.
-documentation: https://github.com/Open-Power-System-Data/datapackage_timeseries/blob/2016-10-27/main.ipynb
+documentation: https://github.com/Open-Power-System-Data/datapackage_timeseries/blob/{version}/main.ipynb

-version: '2016-10-27'
+version: '{version}'

 last_changes: Included data from CEPS and PSE

@@ -79,7 +79,7 @@
 - path: time_series_{res_key}_singleindex.csv
 stacking: Singleindex
 format: csv
-- path: time_series_{res_key}.xlsx
+- path: time_series.xlsx
 stacking: Multiindex
 format: xlsx
 - path: time_series_{res_key}_multiindex.csv
@@ -89,22 +89,22 @@
 stacking: Stacked
 format: csv
 schema:
-primaryKey: timestamp
+primaryKey: {utc}
 missingValue: ""
 fields:
 '''

 indexfield = '''
-- name: utc-timestamp
-description: Start of timeperiod in UTC
+- name: {utc}
+description: Start of timeperiod in Coordinated Universal Time
 type: datetime
 format: fmt:%Y-%m-%dT%H%M%SZ
 opsd-contentfilter: true
-- name: ce(s)t-timestamp
-description: Start of timeperiod in CE(S)T
+- name: {cet}
+description: Start of timeperiod in Central European (Summer-) Time
 type: datetime
 format: fmt:%Y-%m-%dT%H%M%S%z
-- name: comment
+- name: {marker}
 description: marker to indicate which columns are missing data in source data and has been interpolated (e.g. solar_DE-transnetbw_generation;)
 type: string
 '''
@@ -129,9 +129,8 @@
 forecast: Forecasted {tech} generation forecast in {geo} in MW
 capacity: Electrical capacity of {tech} in {geo} in MW
 profile: Share of {tech} capacity producing in {geo}
-offshoreshare: {tech} actual offshore generation in {geo} in MW
-EPEX: lalala
-Elspot: dududu
+epex: Day-ahead spot price for {geo}
+elspot: Day-ahead spot price for {geo}
 '''

 # Columns-specific metadata
@@ -145,7 +144,7 @@
 # metadata.


-def make_json(data_sets, headers):
+def make_json(data_sets, info_cols, version, headers):
 '''
 Create a datapackage.json file that complies with the Frictionless
 data JSON Table Schema from the information in the column-MultiIndex.
@@ -155,6 +154,11 @@ def make_json(data_sets, headers):
 data_sets: dict of pandas.DataFrames
 A dict with keys '15min' and '60min' and values the respective
 DataFrames
+info_cols : dict of strings
+Names for non-data columns such as for the index, for additional
+timestamps or the marker column
+version: str
+Version tag of the Data Package
 headers : list
 List of strings indicating the level names of the pandas.MultiIndex
 for the columns of the dataframe.
@@ -168,13 +172,14 @@ def make_json(data_sets, headers):
 resource_list = '' # list of files included in the datapackage
 source_list = '' # list of sources were data comes from
 for res_key, df in data_sets.items():
-field_list = indexfield # list of of columns in a file, starting with the index field
+# Create the list of of columns in a file, starting with the index field
+field_list = indexfield.format(**info_cols)
 for col in df.columns:
-if col[0] in ['ce(s)t-timestamp', 'comment']:
+if col[0] in info_cols.values():
 continue
 h = {k: v for k, v in zip(headers, col)}
 if len(h['region']) > 2:
-geo = h['region'] + ' control area'
+geo = h['region'] + ' balancing area'
 elif h['region'] == 'NI':
 geo = 'Northern Ireland'
 elif h['region'] == 'CS':
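A cut-down sketch of how `indexfield.format(**info_cols)` fills the template, using an abbreviated version of the `indexfield` string defined earlier (parsed here with `yaml.safe_load` for brevity):

import yaml

indexfield = '''
- name: {utc}
  description: Start of timeperiod in Coordinated Universal Time
- name: {cet}
  description: Start of timeperiod in Central European (Summer-) Time
- name: {marker}
  description: marker for columns with interpolated data
'''

info_cols = {'utc': 'utc_timestamp',
             'cet': 'cet_cest_timestamp',
             'marker': 'comment'}

field_list = indexfield.format(**info_cols)
print(yaml.safe_load(field_list))
# [{'name': 'utc_timestamp', ...}, {'name': 'cet_cest_timestamp', ...}, ...]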
@@ -189,16 +194,16 @@
 field_list = field_list + field_template.format(**h)
 source_list = source_list + source_template.format(**h)
 resource_list = resource_list + \
-resource_template.format(res_key=res_key) + field_list
+resource_template.format(res_key=res_key, **info_cols) + field_list

 # Remove duplicates from sources_list. set() returns unique values from a
-# collection, butit cannot compare dicts. Since source_list is a list of of
+# collection, but it cannot compare dicts. Since source_list is a list of of
 # dicts, this requires some juggling with data types
 source_list = [dict(tupleized)
 for tupleized in set(tuple(entry.items())
 for entry in yaml.load(source_list))]

-metadata = yaml.load(metadata_head)
+metadata = yaml.load(metadata_head.format(version=version))
 metadata['sources'] = source_list
 metadata['resources'] = yaml.load(resource_list)
 for resource in metadata['resources']:
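The "juggling with data types" mentioned in the comment above can be reproduced in isolation; the sample source entries below are made up:

# dicts are not hashable, so each one is turned into a tuple of items for set()
# and converted back into a dict afterwards (order is not preserved)
source_list = [{'name': 'ENTSO-E', 'web': 'http://example.com'},
               {'name': 'ENTSO-E', 'web': 'http://example.com'},
               {'name': 'Energinet.DK', 'web': 'http://example.com'}]

deduplicated = [dict(tupleized)
                for tupleized in set(tuple(entry.items())
                                     for entry in source_list)]
# deduplicated contains the ENTSO-E and Energinet.DK entries once each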
