Commit a7e0898 (parent: 0e73c46)
Author: Scott Sanderson

MAINT: Clarify handling of edge cases in fx readers.

- When reading before the start of data, return NaN. We do this because it's hard to reliably apply a lower bound to the queried dates in core-loader style pipeline loaders.
- When reading an unknown base currency, return NaN. We might get data from third parties with unknown currencies, and that should not be an error.
- When reading after the end of data, emit an error rather than forward-filling forever. We may want to revisit this in the future.

File tree

5 files changed: +111 −70

- tests/data/test_fx.py
- zipline/data/fx/hdf5.py
- zipline/data/fx/in_memory.py
- zipline/data/fx/utils.py
- zipline/testing/fixtures.py

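To make the new contract concrete before diving into the diff, here is a minimal, self-contained sketch (plain numpy/pandas, not the zipline API) of a lookup that follows the three rules above. The frame layout (dates as the index, base currencies as the columns) mirrors the in-memory reader changed below.

import numpy as np
import pandas as pd

# Toy rate data: rows are dates, columns are base currencies.
dates = pd.date_range('2014-01-01', '2014-01-05')
stored = pd.DataFrame({'EUR': 1.3, 'CAD': 0.9}, index=dates)

def lookup(df, bases, dts):
    """Most-recent rate for each base currency as of each dt."""
    # Rule 3: reads past the end of data are an error.
    if dts[-1] > df.index[-1]:
        raise ValueError("Requested fx rates after the end of data.")

    # searchsorted yields -1 for dts before the start of data;
    # get_indexer yields -1 for unknown base currencies.
    row_ixs = df.index.searchsorted(dts, side='right') - 1
    col_ixs = df.columns.get_indexer(bases)

    out = df.values[:, col_ixs][row_ixs]
    out[row_ixs == -1] = np.nan       # Rule 1: before start -> NaN.
    out[:, col_ixs == -1] = np.nan    # Rule 2: unknown base -> NaN.
    return out

dts = pd.DatetimeIndex(['2013-12-25', '2014-01-03'])
print(lookup(stored, np.array(['CAD', 'XXX'], dtype=object), dts))
# [[nan nan]
#  [0.9 nan]]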

tests/data/test_fx.py

Lines changed: 37 additions & 6 deletions
@@ -66,7 +66,10 @@ def test_scalar_lookup(self):
 
         rates = self.FX_RATES_RATE_NAMES
         currencies = self.FX_RATES_CURRENCIES
-        dates = pd.date_range(self.FX_RATES_START_DATE, self.FX_RATES_END_DATE)
+        dates = pd.date_range(
+            self.FX_RATES_START_DATE - pd.Timedelta('1 day'),
+            self.FX_RATES_END_DATE,
+        )
 
         cases = itertools.product(rates, currencies, currencies, dates)
 
@@ -78,7 +81,7 @@ def test_scalar_lookup(self):
             assert_equal(result.shape, (1, 1))
 
             result_scalar = result[0, 0]
-            if quote == base:
+            if dt >= self.FX_RATES_START_DATE and quote == base:
                 assert_equal(result_scalar, 1.0)
 
             expected = self.get_expected_fx_rate_scalar(rate, quote, base, dt)
@@ -93,7 +96,10 @@ def test_scalar_lookup(self):
     def test_2d_lookup(self):
         rand = np.random.RandomState(42)
 
-        dates = pd.date_range(self.FX_RATES_START_DATE, self.FX_RATES_END_DATE)
+        dates = pd.date_range(
+            self.FX_RATES_START_DATE - pd.Timedelta('2 days'),
+            self.FX_RATES_END_DATE
+        )
         rates = self.FX_RATES_RATE_NAMES + [DEFAULT_FX_RATE]
         currencies = self.FX_RATES_CURRENCIES
 
@@ -119,7 +125,10 @@ def test_2d_lookup(self):
     def test_columnar_lookup(self):
         rand = np.random.RandomState(42)
 
-        dates = pd.date_range(self.FX_RATES_START_DATE, self.FX_RATES_END_DATE)
+        dates = pd.date_range(
+            self.FX_RATES_START_DATE - pd.Timedelta('2 days'),
+            self.FX_RATES_END_DATE,
+        )
         rates = self.FX_RATES_RATE_NAMES + [DEFAULT_FX_RATE]
         currencies = self.FX_RATES_CURRENCIES
         reader = self.reader
@@ -175,27 +184,49 @@ def test_load_everything(self):
         assert_equal(london_result, london_rates.values)
 
     def test_read_before_start_date(self):
+        # Reads from before the start of our data should emit NaN. We do this
+        # because, for some Pipeline loaders, it's hard to put a lower bound on
+        # input asof dates, so we end up making queries for asof_dates that
+        # might be before the start of FX data. When that happens, we want to
+        # emit NaN, but we don't want to fail.
         for bad_date in (self.FX_RATES_START_DATE - pd.Timedelta('1 day'),
                          self.FX_RATES_START_DATE - pd.Timedelta('1000 days')):
 
             for rate in self.FX_RATES_RATE_NAMES:
                 quote = 'USD'
                 bases = np.array(['CAD'], dtype=object)
                 dts = pd.DatetimeIndex([bad_date])
-                with self.assertRaises(ValueError):
-                    self.reader.get_rates(rate, quote, bases, dts)
+                result = self.reader.get_rates(rate, quote, bases, dts)
+                assert_equal(result.shape, (1, 1))
+                assert_equal(np.nan, result[0, 0])
 
     def test_read_after_end_date(self):
+        # Reads from **after** the end of our data, on the other hand, should
+        # fail. We can always upper bound the relevant asofs that we're
+        # interested in, and having fx rates forward-fill past the end of data
+        # is confusing and takes a while to debug.
         for bad_date in (self.FX_RATES_END_DATE + pd.Timedelta('1 day'),
                          self.FX_RATES_END_DATE + pd.Timedelta('1000 days')):
 
             for rate in self.FX_RATES_RATE_NAMES:
                 quote = 'USD'
                 bases = np.array(['CAD'], dtype=object)
                 dts = pd.DatetimeIndex([bad_date])
+
                 with self.assertRaises(ValueError):
                     self.reader.get_rates(rate, quote, bases, dts)
 
+                with self.assertRaises(ValueError):
+                    self.reader.get_rates_columnar(rate, quote, bases, dts)
+
+    def test_read_unknown_base(self):
+        for rate in self.FX_RATES_RATE_NAMES:
+            quote = 'USD'
+            bases = np.array(['XXX'], dtype=object)
+            dts = pd.DatetimeIndex([self.FX_RATES_START_DATE])
+            result = self.reader.get_rates(rate, quote, bases, dts)[0, 0]
+            assert_equal(result, np.nan)
+
 
 class InMemoryFXReaderTestCase(_FXReaderTestCase):
 
zipline/data/fx/hdf5.py

Lines changed: 38 additions & 47 deletions
@@ -104,6 +104,7 @@
 from zipline.utils.numpy_utils import bytes_array_to_native_str_object_array
 
 from .base import FXRateReader, DEFAULT_FX_RATE
+from .utils import check_dts, is_sorted_ascending
 
 HDF5_FX_VERSION = 0
 
@@ -189,10 +190,7 @@ def get_rates(self, rate, quote, bases, dts):
         if rate == DEFAULT_FX_RATE:
             rate = self._default_rate
 
-        # TODO: Commenting this _check_dts out for now to bypass the
-        # estimates loader date bounds issue. Will need to address
-        # this before finalizing anything.
-        # self._check_dts(self.dts, dts)
+        check_dts(self.dts, dts)
 
         row_ixs = self.dts.searchsorted(dts, side='right') - 1
         col_ixs = self.currencies.get_indexer(bases)
@@ -207,51 +205,48 @@ def get_rates(self, rate, quote, bases, dts):
 
         # OPTIMIZATION: Row indices correspond to dates, which must be in
         # sorted order. Rather than reading the entire dataset from h5, we can
-        # read just the interval from min_row to max_row inclusive.
+        # read just the interval from min_row to max_row inclusive
         #
-        # We don't bother with a similar optimization for columns because in
-        # expectation we're going to load most of the
-
-        # array, so it's easier to pull all columns and reindex in memory. For
-        # rows, however, a quick and easy optimization is to pull just the
-        # slice from min(row_ixs) to max(row_ixs).
-        min_row = max(row_ixs[0], 0)
-        max_row = row_ixs[-1]
-        rows = dataset[min_row:max_row + 1]  # +1 to be inclusive of end
-
-        out = rows[row_ixs - min_row][:, col_ixs]
+        # However, we also need to handle two important edge cases:
+        #
+        #   1. row_ixs contains -1 for dts before the start of self.dts.
+        #   2. col_ixs contains -1 for any currencies we don't know about.
+        #
+        # If either of the above cases obtains, we want to return NaN for the
+        # corresponding output locations.
 
-        # get_indexer returns -1 for failed lookups. Fill these in with NaN.
+        # We handle (1) by reading raw data into a buffer with one extra
+        # row. When we then apply the row index to permute the raw data into
+        # the correct order, any rows with values of -1 will pull from the
+        # extra row, which will always contain NaN.
+        #
+        # We handle (2) by overwriting columns with indices of -1 with NaN as a
+        # postprocessing step.
+        slice_begin = max(row_ixs[0], 0)
+        slice_end = max(row_ixs[-1], 0) + 1  # +1 to be inclusive of end date.
+
+        # Allocate a buffer full of NaNs with one extra row/column. See
+        # OPTIMIZATION notes above.
+        buf = np.full(
+            (slice_end - slice_begin + 1, len(self.currencies)),
+            np.nan,
+        )
+
+        # Read data into all but the last row/column of the buffer.
+        dataset.read_direct(
+            buf[:-1],
+            np.s_[slice_begin:slice_end],
+        )
+
+        # Permute the rows into place, pulling from the empty NaN locations for
+        # row/column indices of -1.
+        out = buf[:, col_ixs][row_ixs - slice_begin]
+
+        # Fill missing columns with NaN. See OPTIMIZATION notes above.
         out[:, col_ixs == -1] = np.nan
 
-        # TODO: searchsorted also gives -1 for failed lookups. However, these
-        # failed lookups arise due to the estimates date bounds bug that we
-        # have not yet addressed, so this is a temporary fix.
-        out[row_ixs == -1, :] = np.nan
-
         return out
 
-    def _check_dts(self, stored, requested):
-        """Validate that requested dates are in bounds for what we have stored.
-        """
-        request_start, request_end = requested[[0, -1]]
-        data_start, data_end = stored[[0, -1]]
-
-        if request_start < data_start:
-            raise ValueError(
-                "Requested fx rates starting at {}, but data starts at {}"
-                .format(request_start, data_start)
-            )
-
-        if request_end > data_end:
-            raise ValueError(
-                "Requested fx rates ending at {}, but data ends at {}"
-                .format(request_end, data_end)
-            )
-
-        if not is_sorted_ascending(requested):
-            raise ValueError("Requested fx rates with non-ascending dts.")
-
 
 class HDF5FXRateWriter(object):
     """Writer class for HDF5 files consumed by HDF5FXRateReader.
@@ -320,7 +315,3 @@ def _write_data_group(self, dts, currencies, data):
 
     def _log_writing(self, *path):
         log.debug("Writing {}", '/'.join(path))
-
-
-def is_sorted_ascending(array):
-    return (np.maximum.accumulate(array) <= array).all()
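The sentinel-row trick in the rewritten get_rates leans on a numpy fancy-indexing detail: in an integer index array, -1 selects the last element. Appending one NaN row to the buffer therefore makes every out-of-range row index resolve to NaN. A standalone sketch of just that mechanism (toy data, not the HDF5 reader itself):

import numpy as np

# Toy "stored" data: 3 dates x 2 currencies.
data = np.array([[1.0, 2.0],
                 [1.1, 2.1],
                 [1.2, 2.2]])

# Row indices from a searchsorted-style lookup; -1 marks a dt
# before the start of the stored data.
row_ixs = np.array([-1, 0, 2])

# Append one sentinel row of NaN. Index -1 now selects this row
# instead of wrapping around to real data.
buf = np.vstack([data, np.full((1, data.shape[1]), np.nan)])

print(buf[row_ixs])
# [[nan nan]
#  [1.  2. ]
#  [1.2 2.2]]

Note that the postprocessing step is still needed for columns: buf has no extra column, so a col_ixs value of -1 would silently alias the last real currency if it weren't overwritten with NaN afterwards.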

zipline/data/fx/in_memory.py

Lines changed: 8 additions & 17 deletions
@@ -1,8 +1,10 @@
 """Interface and definitions for foreign exchange rate readers.
 """
 from interface import implements
+import numpy as np
 
 from .base import FXRateReader, DEFAULT_FX_RATE
+from .utils import check_dts
 
 
 class InMemoryFXRateReader(implements(FXRateReader)):
@@ -34,7 +36,7 @@ def get_rates(self, rate, quote, bases, dts):
 
         df = self._data[rate][quote]
 
-        self._check_dts(df.index, dts)
+        check_dts(df.index, dts)
 
         # Get raw values out of the frame.
         #
@@ -51,22 +53,11 @@ def get_rates(self, rate, quote, bases, dts):
         values = df.values
         row_ixs = df.index.searchsorted(dts, side='right') - 1
         col_ixs = df.columns.get_indexer(bases)
-        return values[row_ixs][:, col_ixs]
 
-    def _check_dts(self, stored, requested):
-        """Validate that requested dates are in bounds for what we have stored.
-        """
-        request_start, request_end = requested[[0, -1]]
-        data_start, data_end = stored[[0, -1]]
+        out = values[:, col_ixs][row_ixs]
 
-        if request_start < data_start:
-            raise ValueError(
-                "Requested fx rates starting at {}, but data starts at {}"
-                .format(request_start, data_start)
-            )
+        # Handle dates before start and unknown bases.
+        out[row_ixs == -1] = np.nan
+        out[:, col_ixs == -1] = np.nan
 
-        if request_end > data_end:
-            raise ValueError(
-                "Requested fx rates ending at {}, but data ends at {}"
-                .format(request_end, data_end)
-            )
+        return out
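A subtlety worth calling out in the rewritten lookup: values[:, col_ixs][row_ixs] uses fancy indexing, which returns a copy, so the two NaN writes mutate only the output array and never the frame backing the reader. A quick demonstration with a toy array:

import numpy as np

values = np.array([[1.0, 2.0],
                   [3.0, 4.0]])

row_ixs = np.array([-1, 1])   # -1: dt before the start of data.
col_ixs = np.array([1, -1])   # -1: unknown base currency.

out = values[:, col_ixs][row_ixs]  # fancy indexing -> a fresh copy

out[row_ixs == -1] = np.nan
out[:, col_ixs == -1] = np.nan
print(out)
# [[nan nan]
#  [4.  nan]]

print(values)  # the stored data is untouched
# [[1. 2.]
#  [3. 4.]]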

zipline/data/fx/utils.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+import numpy as np
+
+
+def check_dts(stored_dts, requested_dts):
+    """
+    Validate that ``requested_dts`` are valid for querying from an FX reader
+    that has data for ``stored_dts``.
+    """
+    request_end = requested_dts[-1]
+    data_end = stored_dts[-1]
+
+    if not is_sorted_ascending(requested_dts):
+        raise ValueError("Requested fx rates with non-ascending dts.")
+
+    if request_end > data_end:
+        raise ValueError(
+            "Requested fx rates ending at {}, but data ends at {}"
+            .format(request_end, data_end)
+        )
+
+
+def is_sorted_ascending(array):
+    return (np.maximum.accumulate(array) <= array).all()
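check_dts deliberately bounds only the end of a request: dts before the start of data fall through to the readers' NaN handling, while dts past the end raise, as do non-ascending requests. A short illustration, assuming the module layout added by this commit:

import pandas as pd

from zipline.data.fx.utils import check_dts

stored = pd.date_range('2014-01-01', '2014-06-01')

# Dates before the start pass validation; the readers emit NaN for them.
check_dts(stored, pd.DatetimeIndex(['2013-12-01', '2014-01-02']))

# Dates past the end are rejected.
try:
    check_dts(stored, pd.DatetimeIndex(['2014-07-01']))
except ValueError as e:
    print(e)  # Requested fx rates ending at ..., but data ends at ...

# So are non-ascending requests.
try:
    check_dts(stored, pd.DatetimeIndex(['2014-02-01', '2014-01-01']))
except ValueError as e:
    print(e)  # Requested fx rates with non-ascending dts.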

zipline/testing/fixtures.py

Lines changed: 5 additions & 0 deletions
@@ -2201,6 +2201,11 @@ def get_expected_fx_rate_scalar(cls, rate, quote, base, dt):
             rate = cls.FX_RATES_DEFAULT_RATE
 
         col = cls.fx_rates[rate][quote][base]
+        if dt < col.index[0]:
+            return np.nan
+        elif dt > col.index[-1]:
+            raise ValueError("dt={} > max dt={}".format(dt, col.index[-1]))
+
         # PERF: We call this function a lot in some suites, and get_loc is
         # surprisingly expensive, so optimizing it has a meaningful impact on
         # overall suite performance. See test_fast_get_loc_ffilled_for
