Commit a7e0898 (parent: 0e73c46)
Author: Scott Sanderson

MAINT: Clarify handling of edge cases in fx readers.

- When reading before the start of data, return NaN. We do this because it's hard to reliably apply a lower bound to the queried dates in core-loader style pipeline loaders.
- When reading an unknown base currency, return NaN. We might get data from third parties with unknown currencies, and that should not be an error.
- When reading after the end of data, emit an error rather than forward-filling forever. We may want to revisit this in the future.

File tree

5 files changed: +111 −70

- tests/data/test_fx.py
- zipline/data/fx/hdf5.py
- zipline/data/fx/in_memory.py
- zipline/data/fx/utils.py
- zipline/testing/fixtures.py

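To make the new contract concrete before diving into the diff, here is a minimal, self-contained sketch (plain numpy/pandas, not the zipline API) of a lookup that follows the three rules above. The frame layout (dates as the index, base currencies as the columns) mirrors the in-memory reader changed below.

import numpy as np
import pandas as pd

# Toy rate data: rows are dates, columns are base currencies.
dates = pd.date_range('2014-01-01', '2014-01-05')
stored = pd.DataFrame({'EUR': 1.3, 'CAD': 0.9}, index=dates)

def lookup(df, bases, dts):
    """Most-recent rate for each base currency as of each dt."""
    # Rule 3: reads past the end of data are an error.
    if dts[-1] > df.index[-1]:
        raise ValueError("Requested fx rates after the end of data.")

    # searchsorted yields -1 for dts before the start of data;
    # get_indexer yields -1 for unknown base currencies.
    row_ixs = df.index.searchsorted(dts, side='right') - 1
    col_ixs = df.columns.get_indexer(bases)

    out = df.values[:, col_ixs][row_ixs]
    out[row_ixs == -1] = np.nan       # Rule 1: before start -> NaN.
    out[:, col_ixs == -1] = np.nan    # Rule 2: unknown base -> NaN.
    return out

dts = pd.DatetimeIndex(['2013-12-25', '2014-01-03'])
print(lookup(stored, np.array(['CAD', 'XXX'], dtype=object), dts))
# [[nan nan]
#  [0.9 nan]]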

tests/data/test_fx.py

Lines changed: 37 additions & 6 deletions
@@ -66,7 +66,10 @@ def test_scalar_lookup(self):
 
         rates = self.FX_RATES_RATE_NAMES
         currencies = self.FX_RATES_CURRENCIES
-        dates = pd.date_range(self.FX_RATES_START_DATE, self.FX_RATES_END_DATE)
+        dates = pd.date_range(
+            self.FX_RATES_START_DATE - pd.Timedelta('1 day'),
+            self.FX_RATES_END_DATE,
+        )
 
         cases = itertools.product(rates, currencies, currencies, dates)
 
@@ -78,7 +81,7 @@ def test_scalar_lookup(self):
             assert_equal(result.shape, (1, 1))
 
             result_scalar = result[0, 0]
-            if quote == base:
+            if dt >= self.FX_RATES_START_DATE and quote == base:
                 assert_equal(result_scalar, 1.0)
 
             expected = self.get_expected_fx_rate_scalar(rate, quote, base, dt)
@@ -93,7 +96,10 @@ def test_scalar_lookup(self):
     def test_2d_lookup(self):
         rand = np.random.RandomState(42)
 
-        dates = pd.date_range(self.FX_RATES_START_DATE, self.FX_RATES_END_DATE)
+        dates = pd.date_range(
+            self.FX_RATES_START_DATE - pd.Timedelta('2 days'),
+            self.FX_RATES_END_DATE
+        )
         rates = self.FX_RATES_RATE_NAMES + [DEFAULT_FX_RATE]
         currencies = self.FX_RATES_CURRENCIES
 
@@ -119,7 +125,10 @@ def test_2d_lookup(self):
     def test_columnar_lookup(self):
         rand = np.random.RandomState(42)
 
-        dates = pd.date_range(self.FX_RATES_START_DATE, self.FX_RATES_END_DATE)
+        dates = pd.date_range(
+            self.FX_RATES_START_DATE - pd.Timedelta('2 days'),
+            self.FX_RATES_END_DATE,
+        )
         rates = self.FX_RATES_RATE_NAMES + [DEFAULT_FX_RATE]
         currencies = self.FX_RATES_CURRENCIES
         reader = self.reader
@@ -175,27 +184,49 @@ def test_load_everything(self):
         assert_equal(london_result, london_rates.values)
 
     def test_read_before_start_date(self):
+        # Reads from before the start of our data should emit NaN. We do this
+        # because, for some Pipeline loaders, it's hard to put a lower bound on
+        # input asof dates, so we end up making queries for asof_dates that
+        # might be before the start of FX data. When that happens, we want to
+        # emit NaN, but we don't want to fail.
         for bad_date in (self.FX_RATES_START_DATE - pd.Timedelta('1 day'),
                          self.FX_RATES_START_DATE - pd.Timedelta('1000 days')):
 
             for rate in self.FX_RATES_RATE_NAMES:
                 quote = 'USD'
                 bases = np.array(['CAD'], dtype=object)
                 dts = pd.DatetimeIndex([bad_date])
-                with self.assertRaises(ValueError):
-                    self.reader.get_rates(rate, quote, bases, dts)
+                result = self.reader.get_rates(rate, quote, bases, dts)
+                assert_equal(result.shape, (1, 1))
+                assert_equal(np.nan, result[0, 0])
 
     def test_read_after_end_date(self):
+        # Reads from **after** the end of our data, on the other hand, should
+        # fail. We can always upper bound the relevant asofs that we're
+        # interested in, and having fx rates forward-fill past the end of data
+        # is confusing and takes a while to debug.
         for bad_date in (self.FX_RATES_END_DATE + pd.Timedelta('1 day'),
                          self.FX_RATES_END_DATE + pd.Timedelta('1000 days')):
 
             for rate in self.FX_RATES_RATE_NAMES:
                 quote = 'USD'
                 bases = np.array(['CAD'], dtype=object)
                 dts = pd.DatetimeIndex([bad_date])
+
                 with self.assertRaises(ValueError):
                     self.reader.get_rates(rate, quote, bases, dts)
 
+                with self.assertRaises(ValueError):
+                    self.reader.get_rates_columnar(rate, quote, bases, dts)
+
+    def test_read_unknown_base(self):
+        for rate in self.FX_RATES_RATE_NAMES:
+            quote = 'USD'
+            bases = np.array(['XXX'], dtype=object)
+            dts = pd.DatetimeIndex([self.FX_RATES_START_DATE])
+            result = self.reader.get_rates(rate, quote, bases, dts)[0, 0]
+            assert_equal(result, np.nan)
+
 
 class InMemoryFXReaderTestCase(_FXReaderTestCase):
 
zipline/data/fx/hdf5.py

Lines changed: 38 additions & 47 deletions
@@ -104,6 +104,7 @@
 from zipline.utils.numpy_utils import bytes_array_to_native_str_object_array
 
 from .base import FXRateReader, DEFAULT_FX_RATE
+from .utils import check_dts, is_sorted_ascending
 
 HDF5_FX_VERSION = 0
 
@@ -189,10 +190,7 @@ def get_rates(self, rate, quote, bases, dts):
         if rate == DEFAULT_FX_RATE:
             rate = self._default_rate
 
-        # TODO: Commenting this _check_dts out for now to bypass the
-        # estimates loader date bounds issue. Will need to address
-        # this before finalizing anything.
-        # self._check_dts(self.dts, dts)
+        check_dts(self.dts, dts)
 
         row_ixs = self.dts.searchsorted(dts, side='right') - 1
         col_ixs = self.currencies.get_indexer(bases)
@@ -207,51 +205,48 @@ def get_rates(self, rate, quote, bases, dts):
 
         # OPTIMIZATION: Row indices correspond to dates, which must be in
         # sorted order. Rather than reading the entire dataset from h5, we can
-        # read just the interval from min_row to max_row inclusive.
+        # read just the interval from min_row to max_row inclusive
         #
-        # We don't bother with a similar optimization for columns because in
-        # expectation we're going to load most of the
-
-        # array, so it's easier to pull all columns and reindex in memory. For
-        # rows, however, a quick and easy optimization is to pull just the
-        # slice from min(row_ixs) to max(row_ixs).
-        min_row = max(row_ixs[0], 0)
-        max_row = row_ixs[-1]
-        rows = dataset[min_row:max_row + 1]  # +1 to be inclusive of end
-
-        out = rows[row_ixs - min_row][:, col_ixs]
+        # However, we also need to handle two important edge cases:
+        #
+        #   1. row_ixs contains -1 for dts before the start of self.dts.
+        #   2. col_ixs contains -1 for any currencies we don't know about.
+        #
+        # If either of the above cases obtains, we want to return NaN for the
+        # corresponding output locations.
 
-        # get_indexer returns -1 for failed lookups. Fill these in with NaN.
+        # We handle (1) by reading raw data into a buffer with one extra
+        # row. When we then apply the row index to permute the raw data into
+        # the correct order, any rows with values of -1 will pull from the
+        # extra row, which will always contain NaN.
+        #
+        # We handle (2) by overwriting columns with indices of -1 with NaN as a
+        # postprocessing step.
+        slice_begin = max(row_ixs[0], 0)
+        slice_end = max(row_ixs[-1], 0) + 1  # +1 to be inclusive of end date.
+
+        # Allocate a buffer full of NaNs with one extra row/column. See
+        # OPTIMIZATION notes above.
+        buf = np.full(
+            (slice_end - slice_begin + 1, len(self.currencies)),
+            np.nan,
+        )
+
+        # Read data into all but the last row/column of the buffer.
+        dataset.read_direct(
+            buf[:-1],
+            np.s_[slice_begin:slice_end],
+        )
+
+        # Permute the rows into place, pulling from the empty NaN locations for
+        # row/column indices of -1.
+        out = buf[:, col_ixs][row_ixs - slice_begin]
+
+        # Fill missing columns with NaN. See OPTIMIZATION notes above.
         out[:, col_ixs == -1] = np.nan
 
-        # TODO: searchsorted also gives -1 for failed lookups. However, these
-        # failed lookups arise due to the estimates date bounds bug that we
-        # have not yet addressed, so this is a temporary fix.
-        out[row_ixs == -1, :] = np.nan
-
         return out
 
-    def _check_dts(self, stored, requested):
-        """Validate that requested dates are in bounds for what we have stored.
-        """
-        request_start, request_end = requested[[0, -1]]
-        data_start, data_end = stored[[0, -1]]
-
-        if request_start < data_start:
-            raise ValueError(
-                "Requested fx rates starting at {}, but data starts at {}"
-                .format(request_start, data_start)
-            )
-
-        if request_end > data_end:
-            raise ValueError(
-                "Requested fx rates ending at {}, but data ends at {}"
-                .format(request_end, data_end)
-            )
-
-        if not is_sorted_ascending(requested):
-            raise ValueError("Requested fx rates with non-ascending dts.")
-
 
 class HDF5FXRateWriter(object):
     """Writer class for HDF5 files consumed by HDF5FXRateReader.
@@ -320,7 +315,3 @@ def _write_data_group(self, dts, currencies, data):
 
     def _log_writing(self, *path):
         log.debug("Writing {}", '/'.join(path))
-
-
-def is_sorted_ascending(array):
-    return (np.maximum.accumulate(array) <= array).all()
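The sentinel-row trick in the rewritten get_rates leans on a numpy fancy-indexing detail: in an integer index array, -1 selects the last element. Appending one NaN row to the buffer therefore makes every out-of-range row index resolve to NaN. A standalone sketch of just that mechanism (toy data, not the HDF5 reader itself):

import numpy as np

# Toy "stored" data: 3 dates x 2 currencies.
data = np.array([[1.0, 2.0],
                 [1.1, 2.1],
                 [1.2, 2.2]])

# Row indices from a searchsorted-style lookup; -1 marks a dt
# before the start of the stored data.
row_ixs = np.array([-1, 0, 2])

# Append one sentinel row of NaN. Index -1 now selects this row
# instead of wrapping around to real data.
buf = np.vstack([data, np.full((1, data.shape[1]), np.nan)])

print(buf[row_ixs])
# [[nan nan]
#  [1.  2. ]
#  [1.2 2.2]]

Note that the postprocessing step is still needed for columns: buf has no extra column, so a col_ixs value of -1 would silently alias the last real currency if it weren't overwritten with NaN afterwards.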

zipline/data/fx/in_memory.py

Lines changed: 8 additions & 17 deletions
@@ -1,8 +1,10 @@
 """Interface and definitions for foreign exchange rate readers.
 """
 from interface import implements
+import numpy as np
 
 from .base import FXRateReader, DEFAULT_FX_RATE
+from .utils import check_dts
 
 
 class InMemoryFXRateReader(implements(FXRateReader)):
@@ -34,7 +36,7 @@ def get_rates(self, rate, quote, bases, dts):
 
         df = self._data[rate][quote]
 
-        self._check_dts(df.index, dts)
+        check_dts(df.index, dts)
 
         # Get raw values out of the frame.
         #
@@ -51,22 +53,11 @@ def get_rates(self, rate, quote, bases, dts):
         values = df.values
         row_ixs = df.index.searchsorted(dts, side='right') - 1
         col_ixs = df.columns.get_indexer(bases)
-        return values[row_ixs][:, col_ixs]
 
-    def _check_dts(self, stored, requested):
-        """Validate that requested dates are in bounds for what we have stored.
-        """
-        request_start, request_end = requested[[0, -1]]
-        data_start, data_end = stored[[0, -1]]
+        out = values[:, col_ixs][row_ixs]
 
-        if request_start < data_start:
-            raise ValueError(
-                "Requested fx rates starting at {}, but data starts at {}"
-                .format(request_start, data_start)
-            )
+        # Handle dates before start and unknown bases.
+        out[row_ixs == -1] = np.nan
+        out[:, col_ixs == -1] = np.nan
 
-        if request_end > data_end:
-            raise ValueError(
-                "Requested fx rates ending at {}, but data ends at {}"
-                .format(request_end, data_end)
-            )
+        return out
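A subtlety worth calling out in the rewritten lookup: values[:, col_ixs][row_ixs] uses fancy indexing, which returns a copy, so the two NaN writes mutate only the output array and never the frame backing the reader. A quick demonstration with a toy array:

import numpy as np

values = np.array([[1.0, 2.0],
                   [3.0, 4.0]])

row_ixs = np.array([-1, 1])   # -1: dt before the start of data.
col_ixs = np.array([1, -1])   # -1: unknown base currency.

out = values[:, col_ixs][row_ixs]  # fancy indexing -> a fresh copy

out[row_ixs == -1] = np.nan
out[:, col_ixs == -1] = np.nan
print(out)
# [[nan nan]
#  [4.  nan]]

print(values)  # the stored data is untouched
# [[1. 2.]
#  [3. 4.]]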

zipline/data/fx/utils.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+import numpy as np
+
+
+def check_dts(stored_dts, requested_dts):
+    """
+    Validate that ``requested_dts`` are valid for querying from an FX reader
+    that has data for ``stored_dts``.
+    """
+    request_end = requested_dts[-1]
+    data_end = stored_dts[-1]
+
+    if not is_sorted_ascending(requested_dts):
+        raise ValueError("Requested fx rates with non-ascending dts.")
+
+    if request_end > data_end:
+        raise ValueError(
+            "Requested fx rates ending at {}, but data ends at {}"
+            .format(request_end, data_end)
+        )
+
+
+def is_sorted_ascending(array):
+    return (np.maximum.accumulate(array) <= array).all()
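check_dts deliberately bounds only the end of a request: dts before the start of data fall through to the readers' NaN handling, while dts past the end raise, as do non-ascending requests. A short illustration, assuming the module layout added by this commit:

import pandas as pd

from zipline.data.fx.utils import check_dts

stored = pd.date_range('2014-01-01', '2014-06-01')

# Dates before the start pass validation; the readers emit NaN for them.
check_dts(stored, pd.DatetimeIndex(['2013-12-01', '2014-01-02']))

# Dates past the end are rejected.
try:
    check_dts(stored, pd.DatetimeIndex(['2014-07-01']))
except ValueError as e:
    print(e)  # Requested fx rates ending at ..., but data ends at ...

# So are non-ascending requests.
try:
    check_dts(stored, pd.DatetimeIndex(['2014-02-01', '2014-01-01']))
except ValueError as e:
    print(e)  # Requested fx rates with non-ascending dts.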

zipline/testing/fixtures.py

Lines changed: 5 additions & 0 deletions
@@ -2201,6 +2201,11 @@ def get_expected_fx_rate_scalar(cls, rate, quote, base, dt):
             rate = cls.FX_RATES_DEFAULT_RATE
 
         col = cls.fx_rates[rate][quote][base]
+        if dt < col.index[0]:
+            return np.nan
+        elif dt > col.index[-1]:
+            raise ValueError("dt={} > max dt={}".format(dt, col.index[-1]))
+
         # PERF: We call this function a lot in some suites, and get_loc is
         # surprisingly expensive, so optimizing it has a meaningful impact on
         # overall suite performance. See test_fast_get_loc_ffilled_for
