FIX: Filter live data by RType in Python client

nmacholl · nmacholl · commit 8a9f14eb9bea · 2023-11-23T07:22:00.000Z
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,10 +7,13 @@ This release adds support for DBN v2 as well as Python v3.12.
 #### Enhancements
 - Added support for Python 3.12
 - Improved the performance for stream writes in the `Live` client
-- Improved the performance of `DBNStore.to_ndarray` and `DBNStore.to_df` for heterogeneous DBN data
 - Upgraded `databento-dbn` to 0.14.2
 - Added `databento.common.types` module to hold common type annotations
 
+#### Bug fixes
+- Fixed an issue where specifying an OHLCV schema in `DBNStore.to_ndarray` or `DBNStore.to_df` would not properly filter records by their interval
+- Fixed an issue where `DBNStore.to_ndarray` and `DBNStore.to_df` with a non-zero count could get stuck in a loop if the DBN data did not contain any records
+
 #### Breaking Changes
 - `DBNStore` iteration and `DBNStore.replay` will upgrade DBN version 1 messages to version 2
 - `Live` client iteration and callbacks upgrade DBN version 1 messages to version 2
diff --git a/databento/common/dbnstore.py b/databento/common/dbnstore.py
@@ -37,7 +37,6 @@
 from databento_dbn import SType
 from databento_dbn import Transcoder
 from databento_dbn import VersionUpgradePolicy
-from pandas.io.common import os
 
 from databento.common.constants import DEFINITION_TYPE_MAX_MAP
 from databento.common.constants import INT64_NULL
@@ -1083,15 +1082,16 @@ def to_ndarray(
             if schema is None:
                 raise ValueError("a schema must be specified for mixed DBN data")
 
-            schema_struct = self._schema_struct_map[schema]
-            schema_rtype = RType.from_schema(schema)
+            # Always use the latest struct since DBNStore iteration upgrades records
+            schema_struct = SCHEMA_STRUCT_MAP[schema]
             schema_dtype = schema_struct._dtypes
+            schema_rtype = RType.from_schema(schema)
+            schema_filter = filter(lambda r: r.rtype == schema_rtype, self)
 
             reader = self.reader
             reader.seek(self._metadata_length)
             ndarray_iter = NDArrayBytesIterator(
-                stream=reader,
-                rtype=schema_rtype,
+                records=map(bytes, schema_filter),
                 dtype=schema_dtype,
                 count=count,
             )
@@ -1234,35 +1234,22 @@ class NDArrayBytesIterator(NDArrayIterator):
 
     def __init__(
         self,
-        stream: IO[bytes],
-        rtype: RType,
+        records: Iterator[bytes],
         dtype: list[tuple[str, str]],
         count: int | None,
     ):
-        self._stream = stream
-        self._rtype = rtype
+        self._records = records
         self._dtype = dtype
         self._count = count
         self._first_next = True
 
     def __iter__(self) -> NDArrayIterator:
         return self
 
-    def __iter_rtype__(self) -> Generator[bytes, None, None]:
-        while header := self._stream.read(2):
-            length, rtype = header[:2]
-            read_size = length * 4 - 2
-            if rtype == self._rtype:
-                yield header + self._stream.read(read_size)
-            else:
-                self._stream.seek(read_size, os.SEEK_CUR)
-        return
-
     def __next__(self) -> np.ndarray[Any, Any]:
         record_bytes = BytesIO()
         num_records = 0
-
-        for record in itertools.islice(self.__iter_rtype__(), self._count):
+        for record in itertools.islice(self._records, self._count):
             num_records += 1
             record_bytes.write(record)
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -142,7 +142,7 @@ def fixture_live_test_data(
     live_test_data_path: pathlib.Path,
 ) -> bytes:
     """
-    Fixture to retrieve stub test data.
+    Fixture to retrieve live stub test data.
 
     Returns
     -------