Merged
Changes from 16 commits
20 changes: 19 additions & 1 deletion docs/source/api.rst
@@ -279,8 +279,25 @@ DOE RTC
   io.fetch.rtc.request_doe_rtc_data
   io.fetch.rtc.fetch_doe_rtc

NREL PVDAQ
----------

.. autosummary::
   :toctree: generated/

   io.fetch.pvdaq.get_pvdaq_metadata
   io.fetch.pvdaq.get_pvdaq_data


Reference observations
----------------------
======================

The following modules contain code for initializing the reference
database, wrappers for fetching data, functions for processing (e.g.
renaming and resampling) data, and wrapper functions for posting data.
The pure fetch functions are found in ``pvlib.iotools`` and in
``solarforecastarbiter.io.fetch``. See the source code for additional
files with site and observation metadata.

.. autosummary::
   :toctree: generated/
@@ -295,6 +312,7 @@ Reference observations
   io.reference_observations.srml
   io.reference_observations.surfrad
   io.reference_observations.arm
   io.reference_observations.pvdaq

SFA API
=======
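The paragraph added above describes a fetch -> process -> post layering. A
minimal, illustrative sketch of that flow follows; none of these names are the
Arbiter's actual functions (the real ones live in ``pvlib.iotools``,
``solarforecastarbiter.io.fetch``, and ``io.reference_observations``):

```python
import pandas as pd


def fetch(site, start, end):
    # pure fetch: request raw network data for one site, return a DataFrame
    index = pd.date_range(start, end, freq='5min', tz='UTC')
    return pd.DataFrame({'ghi_raw': 0.0}, index=index)


def process(data, variable):
    # processing: rename network columns to Arbiter variable names and sort
    return data.rename(columns={'ghi_raw': variable}).sort_index()


def post(observation_name, data):
    # posting: stand-in for uploading values through the Arbiter API
    print(f'posting {len(data)} values for {observation_name}')


post('example ghi', process(fetch(None, '2020-01-01', '2020-01-02'), 'ghi'))
```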
3 changes: 2 additions & 1 deletion docs/source/whatsnew/1.0.0rc1.rst
@@ -23,7 +23,8 @@ Enhancements
  limit each request to one week of data (:issue:`424`) (:pull:`435`)
* PDF report figures are generated instead of SVG for easy integration into PDF
  reports (:issue:`360`) (:pull:`437`)
* Added support for NREL PVDAQ sites to the reference database functions
  (:issue:`397`) (:pull:`438`)

Bug fixes
~~~~~~~~~
2 changes: 1 addition & 1 deletion solarforecastarbiter/cli.py
@@ -161,7 +161,7 @@ def referencedata():

network_opt = click.option(
    '--network', multiple=True,
    help="The Networks to act on. Defaults to all.",
    help="The networks to act on. Defaults to all.",
    default=reference_data.NETWORK_OPTIONS,
    type=click.Choice(reference_data.NETWORK_OPTIONS))

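For context, ``multiple=True`` combined with a ``Choice`` type means repeated
``--network`` flags accumulate into a tuple and each value is validated against
the allowed list. A self-contained sketch of that behavior; the option list
below is made up, not the real ``reference_data.NETWORK_OPTIONS``:

```python
import click

NETWORKS = ['NOAA SURFRAD', 'NREL MIDC', 'NREL PVDAQ']  # illustrative list


@click.command()
@click.option('--network', multiple=True, default=NETWORKS,
              type=click.Choice(NETWORKS),
              help="The networks to act on. Defaults to all.")
def update(network):
    # click collects repeated --network flags into a tuple, e.g.
    # --network 'NREL PVDAQ' --network 'NREL MIDC' -> ('NREL PVDAQ', 'NREL MIDC')
    click.echo(', '.join(network))


if __name__ == '__main__':
    update()
```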
2 changes: 1 addition & 1 deletion solarforecastarbiter/datamodel.py
@@ -478,7 +478,7 @@ class Observation(BaseModel):
        Variable name, e.g. power, GHI. Each allowed variable has an
        associated pre-defined unit.
    interval_value_type : str
        The type of the data in the observation. Typically interval mean or
        The type of the data in the observation. Typically interval_mean or
        instantaneous, but additional types may be defined for events.
    interval_length : pandas.Timedelta
        The length of time between consecutive data points, e.g. 5 minutes,
106 changes: 106 additions & 0 deletions solarforecastarbiter/io/fetch/pvdaq.py
@@ -0,0 +1,106 @@
"""Functions to read NREL PVDAQ data.
"""

# Code originally written by Bennet Meyers (@bmeyers), Stanford, SLAC in
# https://github.com/pvlib/pvlib-python/pull/664
# Adapted by Will Holmgren (@wholmgren), University of Arizona

import json
from io import StringIO

import requests
import pandas as pd


# consider adding an auth=(username, password) kwarg (default None) to
# support private data queries

def get_pvdaq_metadata(system_id, api_key):
"""Query PV system metadata from NREL's PVDAQ data service.
Parameters
----------
system_id: int
The system ID corresponding to the site that data should be
queried from.
api_key: string
Your NREL API key (https://developer.nrel.gov/docs/api-key/)
Returns
-------
dict
"""

params = {'system_id': system_id, 'api_key': api_key}
sites_url = 'https://developer.nrel.gov/api/pvdaq/v3/sites.json'
r = requests.get(sites_url, params=params)
r.raise_for_status()
outputs = json.loads(r.content)['outputs']
return outputs


def get_pvdaq_data(system_id, year, api_key='DEMO_KEY'):
"""Query PV system data from NREL's PVDAQ data service:
https://maps.nrel.gov/pvdaq/
This function uses the annual raw data file API, which is the most
efficient way of accessing multi-year, sub-hourly time series data.
Parameters
----------
system_id: int
The system ID corresponding to the site that data should be
queried from.
year: int or list of ints
Either the year to request or the list of years to request.
Multiple years will be concatenated into a single DataFrame.
api_key: string
Your NREL API key (https://developer.nrel.gov/docs/api-key/)
Returns
-------
pandas.DataFrame
A DataFrame containing the time series data from the
PVDAQ service over the years requested. Times are typically
in local time.
Notes
-----
The PVDAQ metadata contains a key "available_years" that is a useful
value for the *year* argument.
"""

    try:
        year = int(year)
    except TypeError:
        year = [int(yr) for yr in year]
    else:
        year = [year]

    # Each year must be queried separately, so iterate over the years and
    # generate a list of dataframes.
    # Consider putting this loop in its own private function with
    # try / except / try again pattern for network issues and NREL API
    # throttling

[Review comment, Contributor] will we see any issues from this?

[Reply, Member Author] The pvlib CI struggled with a different NREL API but I
haven't run into any with the pvdaq API. They have a 1000 requests per hour
limit but we are nowhere close to that. Let's see if it's a problem in the rc
cycle.
    df_list = []
    for yr in year:
        params = {
            'api_key': api_key,
            'system_id': system_id,
            'year': yr
        }
        base_url = 'https://developer.nrel.gov/api/pvdaq/v3/data_file'
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        df = pd.read_csv(StringIO(response.text))
        df_list.append(df)

    # concatenate the list of yearly DataFrames
    df = pd.concat(df_list, axis=0, sort=True)
    df['Date-Time'] = pd.to_datetime(df['Date-Time'])
    df.set_index('Date-Time', inplace=True)
    return df
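A sketch of how these functions might be used together, including the retry
wrapper the inline comment above contemplates. The system id and years are
placeholders, and ``fetch_with_retry`` is hypothetical, not part of the module:

```python
import time

import requests

from solarforecastarbiter.io.fetch import pvdaq


def fetch_with_retry(system_id, years, api_key, retries=3, backoff=5.0):
    # hypothetical helper: retry transient network failures with a growing
    # delay, re-raising if the last attempt still fails
    for attempt in range(retries):
        try:
            return pvdaq.get_pvdaq_data(system_id, years, api_key=api_key)
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(backoff * (attempt + 1))


metadata = pvdaq.get_pvdaq_metadata(1276, 'DEMO_KEY')  # placeholder system id
data = fetch_with_retry(1276, [2018, 2019], 'DEMO_KEY')
```

The "available_years" entry mentioned in the Notes above is a natural source
for the years argument.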
16 changes: 9 additions & 7 deletions solarforecastarbiter/io/reference_observations/README.md
@@ -1,8 +1,10 @@
# Reference Observaitons
Module for importing reference data into the SolarForecastArbiter.
# Reference Observations

This module serves two purposes:
- Creating reference metadata objects.
  - Any sites found in an observation network and their associated Observations and avaialable metadata.
- Importing measurements
  - Interacting with a Network's API to import the appropriate data as it becomes available.
Package for importing reference data into the Solar Forecast Arbiter.

This package serves two purposes:

- Creating reference metadata objects.
  - Any sites found in an observation network and their associated Observations and available metadata.
- Importing reference measurements
  - Interacting with a network's API to import the appropriate data as it becomes available.
34 changes: 25 additions & 9 deletions solarforecastarbiter/io/reference_observations/common.py
@@ -193,16 +193,21 @@ def create_observation(api, site, variable, extra_params=None, **kwargs):
        'variable': variable,
        'extra_parameters': json.dumps(extra_parameters)
    })

    return check_and_post_observation(api, observation)


def check_and_post_observation(api, observation):
    existing = existing_observations(api)
    if observation.name in existing:
        logger.info('Observation, %s, already exists', observation_name)
        logger.info('Observation, %s, already exists', observation.name)
        return existing[observation.name]

    try:
        created = api.create_observation(observation)
    except HTTPError as e:
        logger.error(f'Failed to create {variable} observation at Site '
                     f'{site.name}.')
        logger.error(f'Failed to create {observation.variable} observation '
                     f'at Site {observation.site.name}.')
        logger.debug(f'HTTP Error: {e.response.text}')
    else:
        logger.info(f"Observation {created.name} created successfully.")
@@ -263,7 +268,7 @@ def update_site_observations(api, fetch_func, site, observations,
        An active Reference user session.
    fetch_func : function
        A function that requests data and returns a DataFrame for a given site.
        The function should accept the parameters (api, site, start end) as
        The function should accept the parameters (api, site, start, end) as
        they appear in this function.
    site : solarforecastarbiter.datamodel.Site
        The Site with observations to update.
@@ -291,21 +296,26 @@ def update_site_observations(api, fetch_func, site, observations,
        post_observation_data(api, obs, data_in_range, start, end)


def _prepare_data_to_post(data, variable, observation, start, end):
def _prepare_data_to_post(data, variable, observation, start, end,
                          resample_how):
    """Manipulate the data including reindexing to observation.interval_label
    to prepare for posting"""
    data = data[[variable]]
    data = data.rename(columns={variable: 'value'})
    # ensure data is sorted before slicing and for optimal order in the
    # database
    data = data.sort_index()

    if resample_how:
        resampler = data.resample(observation.interval_length)
        data = getattr(resampler, resample_how)()

    # remove all future values, some files have forward filled nightly data
    data = data[start:min(end, _utcnow())]
    # we assume any reference data is given at the proper intervals
    # and already averaged if appropriate
    # so just reindex the data to put nans where required

    if data.empty:
        return data
    # reindex the data to put nans where required
    # we don't extend the new index to start, end, since reference
    # data has some lag time from the end it was requested from
    # and it isn't necessary to keep the nans between uploads in db
@@ -349,14 +359,20 @@ def post_observation_data(api, observation, data, start, end):
    # check for a non-standard variable label in extra_parameters
    variable = extra_parameters.get('network_data_label',
                                    observation.variable)
    # check if the raw observation needs to be resampled before posting
    resample_how = extra_parameters.get('resample_how', None)
    try:
        var_df = _prepare_data_to_post(data, variable, observation,
                                       start, end)
                                       start, end, resample_how)
    except KeyError:
        logger.error(f'{variable} could not be found in the data file '
                     f'from {data.index[0]} to {data.index[-1]} '
                     f'for Observation {observation.name}')
        return
    except AttributeError:
        logger.error(f'{variable} could not be resampled using method '
                     f'{resample_how} for Observation {observation.name}')
        return

    # skip post if data is empty; if there are nans, should still post
    if var_df.empty:
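The ``resample_how`` handling above selects the pandas resampler method by
name with ``getattr``, which is why an unrecognized method surfaces as the
``AttributeError`` caught in ``post_observation_data``. A standalone sketch of
that pattern with made-up data:

```python
import pandas as pd

index = pd.date_range('2020-01-01', periods=10, freq='1min', tz='UTC')
data = pd.DataFrame({'value': range(10)}, index=index)

resample_how = 'mean'  # would come from the observation's extra_parameters
resampler = data.resample(pd.Timedelta('5min'))
resampled = getattr(resampler, resample_how)()  # equivalent to resampler.mean()
print(resampled)

# an unknown method name raises AttributeError:
# getattr(resampler, 'not_a_method')()
```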
4 changes: 4 additions & 0 deletions solarforecastarbiter/io/reference_observations/midc.py
@@ -103,10 +103,14 @@ def update_observation_data(api, sites, observations, start, end):
"""Post new observation data to all MIDC observations from
start to end.

Parameters
----------
api : solarforecastarbiter.io.api.APISession
An active Reference user session.
sites: list
List of all reference sites as Objects
observations: list of solarforecastarbiter.datamodel.Observation
List of all reference observations.
start : datetime
The beginning of the period to request data for.
end : datetime