Skip to content

Commit 642fad0

Browse files
authored
Split daily validation posting on missing periods and week (#435)
* make sure data is sorted before slicing * sort data in pre-post too * split daily validation post split by uneven frequency and week * update whatsnew * flake8 * fix test to make sure frame assertion happens
1 parent 78c4f59 commit 642fad0

File tree

5 files changed

+117
-18
lines changed

5 files changed

+117
-18
lines changed

docs/source/whatsnew/1.0.0rc1.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,16 @@ Enhancements
1919
~~~~~~~~~~~~
2020
* Reference net load forecasts provided using week-ahead persistence. (:issue:`55`) (:pull:`392`)
2121
* Datamodel now supports ``'net_load'`` as an allowed variable. (:issue:`55`) (:pull:`392`)
22+
* Posting of daily validation now splits requests to avoid missing periods and
23+
limit each request to one week of data (:issue:`424`) (:pull:`435`)
2224

2325

2426
Bug fixes
2527
~~~~~~~~~
2628
* Fix incorrect ordering of months and weekdays in metrics plots.
2729
(:issue:`428`) (:pull:`430`)
30+
* Ensure data is sorted from reference data sources before slicing and
31+
posting to the API (:pull:`435`)
2832

2933

3034
Contributors

solarforecastarbiter/io/reference_observations/common.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,8 @@ def update_site_observations(api, fetch_func, site, observations,
283283
start = get_last_site_timestamp(api, site_observations, end)
284284
logger.debug('Fetching data for %s from %s to %s', site.name, start, end)
285285
obs_df = fetch_func(api, site, start, end)
286-
data_in_range = obs_df[start:end]
286+
# must be sorted for proper inexact start:end slicing
287+
data_in_range = obs_df.sort_index()[start:end]
287288
if data_in_range.empty:
288289
return
289290
for obs in site_observations:
@@ -295,6 +296,9 @@ def _prepare_data_to_post(data, variable, observation, start, end):
295296
to prepare for posting"""
296297
data = data[[variable]]
297298
data = data.rename(columns={variable: 'value'})
299+
# ensure data is sorted before slicing and for optimal order in the
300+
# database
301+
data = data.sort_index()
298302
# remove all future values, some files have forward filled nightly data
299303
data = data[start:min(end, _utcnow())]
300304
# we assume any reference data is given at the proper intervals

solarforecastarbiter/io/reference_observations/tests/test_common.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,24 @@ def test_update_site_observations_no_data(
492492
mock_api.assert_not_called()
493493

494494

495+
def test_update_site_observations_out_of_order(
496+
mock_api, site_objects_param, mocker,
497+
observation_objects_param, fake_ghi_data):
498+
start = pd.Timestamp('20190101T1200Z')
499+
end = pd.Timestamp('20190101T1230Z')
500+
fetch = mocker.MagicMock()
501+
fetch.return_value = fake_ghi_data.sample(frac=1)
502+
common.update_site_observations(
503+
mock_api, fetch, site_objects[1], observation_objects_param,
504+
start, end)
505+
args, _ = mock_api.post_observation_values.call_args
506+
assert args[0] == ''
507+
pd.testing.assert_frame_equal(
508+
args[1], fake_ghi_data.rename(
509+
columns={'ghi': 'value'})[start:end].resample(
510+
args[1].index.freq).first())
511+
512+
495513
@pytest.fixture()
496514
def template_fx(mock_api, mocker):
497515
mock_api.create_forecast = mocker.MagicMock(side_effect=lambda x: x)

solarforecastarbiter/validation/tasks.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from solarforecastarbiter import pvmodel
88
from solarforecastarbiter.io.api import APISession
9-
from solarforecastarbiter.validation import validator
9+
from solarforecastarbiter.validation import validator, quality_mapping
1010

1111

1212
logger = logging.getLogger(__name__)
@@ -391,7 +391,7 @@ def _daily_validation(session, observation, start, end, base_url):
391391
logger.info('Validating data for %s from %s to %s',
392392
observation.name, start, end)
393393
observation_values = session.get_observation_values(
394-
observation.observation_id, start, end)
394+
observation.observation_id, start, end).sort_index()
395395
value_series = observation_values['value'].astype(float)
396396
if len(value_series.dropna()) < 10:
397397
raise IndexError(
@@ -410,9 +410,28 @@ def _daily_validation(session, observation, start, end, base_url):
410410

411411
quality_flags.name = 'quality_flag'
412412
observation_values.update(quality_flags)
413-
session.post_observation_values(observation.observation_id,
414-
observation_values,
415-
params='donotvalidate')
413+
return _group_continuous_week_post(
414+
session, observation, observation_values)
415+
416+
417+
def _group_continuous_week_post(session, observation, observation_values):
418+
# observation_values expected to be sorted
419+
# observation values already have uneven frequency checked
420+
gid = quality_mapping.check_if_series_flagged(
421+
observation_values['quality_flag'], 'UNEVEN FREQUENCY').cumsum()
422+
# make series of week + year integers to further
423+
# split data to post at most one week at a time
424+
# ~10,000 pts of 1min data
425+
week_int = (gid.index.week + gid.index.year).values
426+
# combine the continuous groups with groups of weeks
427+
# gid is unique for each group since week_int and cumsum
428+
# increase monotonically and are positive
429+
gid += week_int
430+
observation_values['gid'] = gid
431+
for _, group in observation_values.groupby('gid'):
432+
session.post_observation_values(observation.observation_id,
433+
group[['value', 'quality_flag']],
434+
params='donotvalidate')
416435

417436

418437
def daily_single_observation_validation(access_token, observation_id, start,

solarforecastarbiter/validation/tests/test_validation_tasks.py

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -632,8 +632,9 @@ def test_daily_observation_validation_ghi(mocker, make_observation,
632632
DESCRIPTION_MASK_MAPPING['UNEVEN FREQUENCY'] |
633633
LATEST_VERSION_FLAG
634634
]
635-
assert post_mock.called_once
636-
assert_frame_equal(post_mock.call_args[0][1], out)
635+
assert post_mock.called
636+
posted_df = pd.concat([cal[0][1] for cal in post_mock.call_args_list])
637+
assert_frame_equal(posted_df, out)
637638

638639

639640
def test_daily_observation_validation_ghi_zeros(mocker, make_observation,
@@ -677,8 +678,9 @@ def test_daily_observation_validation_ghi_zeros(mocker, make_observation,
677678
base | DESCRIPTION_MASK_MAPPING['NIGHTTIME'] |
678679
DESCRIPTION_MASK_MAPPING['UNEVEN FREQUENCY']
679680
]
680-
assert post_mock.called_once
681-
assert_frame_equal(post_mock.call_args[0][1], out)
681+
assert post_mock.called
682+
posted_df = pd.concat([cal[0][1] for cal in post_mock.call_args_list])
683+
assert_frame_equal(posted_df, out)
682684

683685

684686
def test_validate_daily_dc_power(mocker, make_observation, daily_index):
@@ -767,8 +769,9 @@ def test_daily_observation_validation_dc_power(mocker, make_observation,
767769
DESCRIPTION_MASK_MAPPING['NIGHTTIME'] |
768770
LATEST_VERSION_FLAG
769771
]
770-
assert post_mock.called_once
771-
assert_frame_equal(post_mock.call_args[0][1], out)
772+
assert post_mock.called
773+
posted_df = pd.concat([cal[0][1] for cal in post_mock.call_args_list])
774+
assert_frame_equal(posted_df, out)
772775

773776

774777
def test_validate_daily_ac_power(mocker, make_observation, daily_index):
@@ -863,8 +866,9 @@ def test_daily_observation_validation_ac_power(mocker, make_observation,
863866
DESCRIPTION_MASK_MAPPING['NIGHTTIME'] |
864867
LATEST_VERSION_FLAG
865868
]
866-
assert post_mock.called_once
867-
assert_frame_equal(post_mock.call_args[0][1], out)
869+
assert post_mock.called
870+
posted_df = pd.concat([cal[0][1] for cal in post_mock.call_args_list])
871+
assert_frame_equal(posted_df, out)
868872

869873

870874
@pytest.mark.parametrize('var', ['air_temperature', 'wind_speed', 'dni', 'dhi',
@@ -885,13 +889,14 @@ def test_daily_observation_validation_other(var, mocker, make_observation,
885889
return_value=data)
886890
post_mock = mocker.patch(
887891
'solarforecastarbiter.io.api.APISession.post_observation_values')
888-
validate_mock = mocker.MagicMock()
892+
validated = pd.Series(2, index=daily_index)
893+
validate_mock = mocker.MagicMock(return_value=validated)
889894
mocker.patch.dict(
890895
'solarforecastarbiter.validation.tasks.IMMEDIATE_VALIDATION_FUNCS',
891896
{var: validate_mock})
892897
tasks.daily_single_observation_validation(
893898
'', obs.observation_id, data.index[0], data.index[-1])
894-
assert post_mock.called_once
899+
assert post_mock.called
895900
assert validate_mock.called
896901

897902

@@ -914,13 +919,14 @@ def test_daily_observation_validation_many(mocker, make_observation,
914919
return_value=data)
915920
post_mock = mocker.patch(
916921
'solarforecastarbiter.io.api.APISession.post_observation_values')
917-
validate_mock = mocker.MagicMock()
922+
validated = pd.Series(2, index=daily_index)
923+
validate_mock = mocker.MagicMock(return_value=validated)
918924
mocker.patch.dict(
919925
'solarforecastarbiter.validation.tasks.IMMEDIATE_VALIDATION_FUNCS',
920926
{'dhi': validate_mock, 'dni': validate_mock})
921927
tasks.daily_observation_validation(
922928
'', data.index[0], data.index[-1])
923-
assert post_mock.called_once
929+
assert post_mock.called
924930
assert validate_mock.call_count == 2
925931

926932

@@ -967,3 +973,51 @@ def test_daily_observation_validation_not_enough(mocker, make_observation):
967973
'', data.index[0], data.index[-1])
968974
assert out is None
969975
assert log.called
976+
977+
978+
def test__group_continuous_week_post(mocker, make_observation):
979+
split_dfs = [
980+
pd.DataFrame([(0, LATEST_VERSION_FLAG)],
981+
columns=['value', 'quality_flag'],
982+
index=pd.date_range(
983+
start='2020-05-03T00:00',
984+
end='2020-05-03T23:59',
985+
tz='UTC',
986+
freq='1h')),
987+
# new week split
988+
pd.DataFrame([(0, LATEST_VERSION_FLAG)],
989+
columns=['value', 'quality_flag'],
990+
index=pd.date_range(
991+
start='2020-05-04T00:00',
992+
end='2020-05-04T11:59',
993+
tz='UTC',
994+
freq='1h')),
995+
# missing 12
996+
pd.DataFrame(
997+
[(0, LATEST_VERSION_FLAG | DESCRIPTION_MASK_MAPPING['UNEVEN FREQUENCY'])] + # NOQA
998+
[(1, LATEST_VERSION_FLAG)] * 7,
999+
columns=['value', 'quality_flag'],
1000+
index=pd.date_range(
1001+
start='2020-05-04T13:00',
1002+
end='2020-05-04T20:00',
1003+
tz='UTC',
1004+
freq='1h')),
1005+
# missing a week+
1006+
pd.DataFrame(
1007+
[(9, LATEST_VERSION_FLAG | DESCRIPTION_MASK_MAPPING['UNEVEN FREQUENCY'])] + # NOQA
1008+
[(3, LATEST_VERSION_FLAG)] * 7,
1009+
columns=['value', 'quality_flag'],
1010+
index=pd.date_range(
1011+
start='2020-05-13T09:00',
1012+
end='2020-05-13T16:59',
1013+
tz='UTC',
1014+
freq='1h')),
1015+
]
1016+
ov = pd.concat(split_dfs, axis=0)
1017+
obs = make_observation('ghi')
1018+
session = mocker.MagicMock()
1019+
tasks._group_continuous_week_post(session, obs, ov)
1020+
call_list = session.post_observation_values.call_args_list
1021+
assert len(call_list) == 4
1022+
for i, cal in enumerate(call_list):
1023+
assert_frame_equal(split_dfs[i], cal[0][1])

0 commit comments

Comments
 (0)