Skip to content

Commit d2c0981

Browse files
committed
FIX: Filter live data by RType in Python client
1 parent a23cc2f commit d2c0981

File tree

5 files changed

+185
-23
lines changed

5 files changed

+185
-23
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ This release adds support for DBN v2 as well as Python v3.12.
77
#### Enhancements
88
- Added support for Python 3.12
99
- Improved the performance for stream writes in the `Live` client
10+
- Improved the performance of `DBNStore.to_ndarray` and `DBNStore.to_df` for heterogeneous DBN data
1011
- Upgraded `databento-dbn` to 0.14.2
1112
- Added `databento.common.types` module to hold common type annotations
1213

databento/common/dbnstore.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,12 @@
3232
from databento_dbn import InstrumentDefMsg
3333
from databento_dbn import InstrumentDefMsgV1
3434
from databento_dbn import Metadata
35+
from databento_dbn import RType
3536
from databento_dbn import Schema
3637
from databento_dbn import SType
3738
from databento_dbn import Transcoder
3839
from databento_dbn import VersionUpgradePolicy
40+
from pandas.io.common import os
3941

4042
from databento.common.constants import DEFINITION_TYPE_MAX_MAP
4143
from databento.common.constants import INT64_NULL
@@ -1082,11 +1084,14 @@ def to_ndarray(
10821084
raise ValueError("a schema must be specified for mixed DBN data")
10831085

10841086
schema_struct = self._schema_struct_map[schema]
1087+
schema_rtype = RType.from_schema(schema)
10851088
schema_dtype = schema_struct._dtypes
1086-
schema_filter = filter(lambda r: isinstance(r, schema_struct), self)
10871089

1090+
reader = self.reader
1091+
reader.seek(self._metadata_length)
10881092
ndarray_iter = NDArrayBytesIterator(
1089-
records=map(bytes, schema_filter),
1093+
stream=reader,
1094+
rtype=schema_rtype,
10901095
dtype=schema_dtype,
10911096
count=count,
10921097
)
@@ -1229,31 +1234,45 @@ class NDArrayBytesIterator(NDArrayIterator):
12291234

12301235
def __init__(
    self,
    stream: IO[bytes],
    rtype: RType,
    dtype: list[tuple[str, str]],
    count: int | None,
) -> None:
    """
    Iterator of DBN records decoded into numpy arrays from a byte stream.

    Parameters
    ----------
    stream : IO[bytes]
        Readable, seekable stream of DBN records; the caller positions it
        at the first record (past the metadata header).
    rtype : RType
        The record type to select; records with any other rtype are
        skipped during iteration.
    dtype : list[tuple[str, str]]
        The numpy structured dtype used to decode the selected records.
    count : int | None
        Maximum number of records per yielded array; ``None`` reads all
        remaining records into a single array.
    """
    self._stream = stream
    self._rtype = rtype
    self._dtype = dtype
    self._count = count
    # True until the first __next__ call; lets an empty stream yield one
    # empty array before StopIteration is raised.
    self._first_next = True
12401247

12411248
def __iter__(self) -> NDArrayIterator:
    """Return this object itself, per the iterator protocol."""
    return self
12431250

1251+
def __iter_rtype__(self) -> Generator[bytes, None, None]:
    """
    Yield the raw bytes of every record whose rtype matches the target.

    Reads two-byte DBN record headers from the stream: the first byte is
    the record length in 4-byte units and the second is the rtype.
    Records with a non-matching rtype are skipped in place with a
    relative seek instead of being read into memory.

    Yields
    ------
    bytes
        The complete record (header plus body) for each matching record.

    Raises
    ------
    ValueError
        If the stream ends in the middle of a record header.
    """
    while header := self._stream.read(2):
        if len(header) < 2:
            # A lone trailing byte means the stream is truncated; fail
            # loudly instead of raising an opaque unpack error.
            raise ValueError("DBN stream is truncated mid record header")
        length, rtype = header
        # Total record size is length * 4 bytes; the 2 header bytes have
        # already been consumed.
        read_size = length * 4 - 2
        if rtype == self._rtype:
            yield header + self._stream.read(read_size)
        else:
            # NOTE(review): `os` reaches this module via the accidental
            # `from pandas.io.common import os` import; prefer a plain
            # `import os` at module level.
            self._stream.seek(read_size, os.SEEK_CUR)
1260+
12441261
def __next__(self) -> np.ndarray[Any, Any]:
12451262
record_bytes = BytesIO()
12461263
num_records = 0
1247-
for record in itertools.islice(self._records, self._count):
1264+
1265+
for record in itertools.islice(self.__iter_rtype__(), self._count):
12481266
num_records += 1
12491267
record_bytes.write(record)
12501268

1251-
if num_records == 0:
1252-
if self._first_next:
1269+
if self._first_next:
1270+
self._first_next = False
1271+
if num_records == 0:
12531272
return np.empty([0, 1], dtype=self._dtype)
1254-
raise StopIteration
12551273

1256-
self._first_next = False
1274+
if num_records == 0:
1275+
raise StopIteration
12571276

12581277
try:
12591278
return np.frombuffer(

tests/conftest.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,23 @@ def pytest_collection_modifyitems(
8989
item.add_marker(skip_release)
9090

9191

92+
@pytest.fixture(name="live_test_data_path")
def fixture_live_test_data_path() -> pathlib.Path:
    """
    Fixture to retrieve the path to the live stub data file.

    Returns
    -------
    pathlib.Path
        Path to the zstd-compressed live DBN stub data.

    See Also
    --------
    live_test_data

    """
    return TESTS_ROOT / "data" / "LIVE" / "test_data.live.dbn.zst"
107+
108+
92109
@pytest.fixture(name="test_data_path")
93110
def fixture_test_data_path() -> Callable[[Dataset, Schema], pathlib.Path]:
94111
"""
@@ -120,6 +137,25 @@ def func(dataset: Dataset, schema: Schema) -> pathlib.Path:
120137
return func
121138

122139

140+
@pytest.fixture(name="live_test_data")
def fixture_live_test_data(
    live_test_data_path: pathlib.Path,
) -> bytes:
    """
    Fixture to retrieve the live stub test data.

    Returns
    -------
    bytes
        The raw (still zstd-compressed) contents of the live stub data
        file.

    See Also
    --------
    live_test_data_path

    """
    return live_test_data_path.read_bytes()
157+
158+
123159
@pytest.fixture(name="test_data")
124160
def fixture_test_data(
125161
test_data_path: Callable[[Dataset, Schema], pathlib.Path],
1.05 KB
Binary file not shown.

tests/test_historical_bento.py

Lines changed: 120 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -937,6 +937,57 @@ def test_dbnstore_to_ndarray_with_count(
937937
assert np.array_equal(expected, np.concatenate(aggregator))
938938

939939

940+
@pytest.mark.parametrize(
    "schema",
    [
        Schema.MBO,
        Schema.MBP_1,
        Schema.MBP_10,
        Schema.TRADES,
        Schema.OHLCV_1S,
        Schema.OHLCV_1M,
        Schema.OHLCV_1H,
        Schema.OHLCV_1D,
        Schema.DEFINITION,
        Schema.STATISTICS,
    ],
)
@pytest.mark.parametrize(
    "count",
    [
        1,
        2,
        3,
    ],
)
def test_dbnstore_to_ndarray_with_count_live(
    schema: Schema,
    live_test_data: bytes,
    count: int,
) -> None:
    """
    Test that iterating to_ndarray with a count yields batches that
    concatenate to the same array as calling it without a count.
    """
    # Arrange
    dbn_stub_data = zstandard.ZstdDecompressor().stream_reader(live_test_data).read()

    # Act
    dbnstore = DBNStore.from_bytes(data=dbn_stub_data)

    expected = dbnstore.to_ndarray(schema=schema)
    nd_iter = dbnstore.to_ndarray(schema=schema, count=count)

    # Assert
    aggregator: list[np.ndarray[Any, Any]] = []

    for batch in nd_iter:
        # Each yielded batch is capped at `count` records.
        assert len(batch) <= count
        aggregator.append(batch)

    assert np.array_equal(expected, np.concatenate(aggregator))
989+
990+
940991
@pytest.mark.parametrize(
941992
"schema",
942993
[pytest.param(schema, id=str(schema)) for schema in Schema.variants()],
@@ -993,6 +1044,38 @@ def test_dbnstore_to_ndarray_with_count_empty(
9931044
assert len(next(nd_iter)) == 0
9941045

9951046

1047+
@pytest.mark.parametrize(
    "schema, expected_count",
    [
        (Schema.MBO, 5),
        (Schema.MBP_1, 2),
        (Schema.MBP_10, 2),
        (Schema.TRADES, 2),
        (Schema.OHLCV_1S, 2),
        (Schema.OHLCV_1M, 2),
        (Schema.OHLCV_1H, 0),
        (Schema.OHLCV_1D, 0),
        (Schema.DEFINITION, 2),
        (Schema.STATISTICS, 9),
    ],
)
def test_dbnstore_to_ndarray_with_schema_live(
    live_test_data: bytes,
    schema: Schema,
    expected_count: int,
) -> None:
    """
    Test that to_ndarray on mixed-schema live data returns only the
    records matching the requested schema.
    """
    # Arrange
    dbn_stub_data = zstandard.ZstdDecompressor().stream_reader(live_test_data).read()

    # Act
    dbnstore = DBNStore.from_bytes(data=dbn_stub_data)

    array = dbnstore.to_ndarray(schema=schema)

    # Assert
    assert len(array) == expected_count
1077+
1078+
9961079
def test_dbnstore_to_ndarray_with_schema_empty(
9971080
test_data: Callable[[Dataset, Schema], bytes],
9981081
) -> None:
@@ -1016,6 +1099,23 @@ def test_dbnstore_to_ndarray_with_schema_empty(
10161099
assert len(array) == 0
10171100

10181101

1102+
def test_dbnstore_to_ndarray_with_schema_empty_live(
    live_test_data: bytes,
) -> None:
    """
    Test that a schema must be specified when converting live (mixed
    schema) data to an ndarray.
    """
    # Arrange
    dbn_stub_data = zstandard.ZstdDecompressor().stream_reader(live_test_data).read()

    # Act
    dbnstore = DBNStore.from_bytes(data=dbn_stub_data)

    # Assert
    # Pin the specific failure: mixed DBN data without a schema.
    with pytest.raises(ValueError, match="a schema must be specified"):
        dbnstore.to_ndarray()
1117+
1118+
10191119
@pytest.mark.parametrize(
10201120
"schema",
10211121
[pytest.param(schema, id=str(schema)) for schema in Schema.variants()],
@@ -1063,32 +1163,38 @@ def test_dbnstore_to_df_with_count(
10631163

10641164

10651165
@pytest.mark.parametrize(
    "schema, expected_count",
    [
        (Schema.MBO, 5),
        (Schema.MBP_1, 2),
        (Schema.MBP_10, 2),
        (Schema.TRADES, 2),
        (Schema.OHLCV_1S, 2),
        (Schema.OHLCV_1M, 2),
        (Schema.OHLCV_1H, 0),
        (Schema.OHLCV_1D, 0),
        (Schema.DEFINITION, 2),
        (Schema.STATISTICS, 9),
    ],
)
def test_dbnstore_to_df_with_schema_live(
    schema: Schema,
    live_test_data: bytes,
    expected_count: int,
) -> None:
    """
    Test that calling to_df with a schema on live (mixed) data produces a
    DataFrame containing only the records of that schema.
    """
    # Arrange
    dbn_stub_data = zstandard.ZstdDecompressor().stream_reader(live_test_data).read()

    # Act
    dbnstore = DBNStore.from_bytes(data=dbn_stub_data)

    df = dbnstore.to_df(schema=schema)

    # Assert
    assert len(df) == expected_count
10921198

10931199

10941200
def test_dbnstore_to_df_with_schema_empty(

0 commit comments

Comments
 (0)