FIX: Change Bento iter to not use record_count

nmacholl · nmacholl · commit 7a384653b267 · 2023-03-10T18:50:33.000Z
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,9 @@
 # Changelog
 
-## 0.9.0 - TBD
+## 0.8.2 - 2023-03-10
+- Removed `record_count` property and `len()` support (`__len__`) from the `Bento` class
+- Fixed bug in `Bento` where invalid metadata would prevent iteration
+- Improved use of the logging module
 - Changed `metadata.get_dataset_condition` response to a list of condition per date
 
 ## 0.8.1 - 2023-03-05
diff --git a/databento/common/bento.py b/databento/common/bento.py
@@ -269,8 +269,6 @@ class Bento:
         The raw compressed data in bytes.
     reader : IO[bytes]
         A zstd decompression stream.
-    record_count : int
-        The record count.
     schema : Schema
         The data record schema.
     start : pd.Timestamp
@@ -347,17 +345,17 @@ def __init__(self, data_source: DataSource) -> None:
 
     def __iter__(self) -> Generator[np.void, None, None]:
         reader = self.reader
-        for _ in range(self.record_count):
+        while True:
             raw = reader.read(self.record_size)
-            rec = np.frombuffer(raw, dtype=STRUCT_MAP[self.schema])
-            yield rec[0]
-
-    def __len__(self) -> int:
-        return self.record_count
+            if raw:
+                rec = np.frombuffer(raw, dtype=STRUCT_MAP[self.schema])
+                yield rec[0]
+            else:
+                break
 
     def __repr__(self) -> str:
         name = self.__class__.__name__
-        return f"<{name}(schema={self.schema}, record_count={self.record_count})>"
+        return f"<{name}(schema={self.schema})>"
 
     def _apply_pretty_ts(self, df: pd.DataFrame) -> pd.DataFrame:
         df.index = pd.to_datetime(df.index, utc=True)
@@ -608,18 +606,6 @@ def reader(self) -> IO[bytes]:
         reader.seek(self._metadata_length)
         return reader
 
-    @property
-    def record_count(self) -> int:
-        """
-        Return the record count.
-
-        Returns
-        -------
-        int
-
-        """
-        return self._metadata["record_count"]
-
     @property
     def schema(self) -> Schema:
         """
diff --git a/notebooks/quickstart.ipynb b/notebooks/quickstart.ipynb
@@ -688,32 +688,6 @@
     "data.compression"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "97bc0031-be8b-43cd-b9a4-1c5d2c388a28",
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    },
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1000"
-      ]
-     },
-     "execution_count": null,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "data.record_count"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/tests/test_historical_bento.py b/tests/test_historical_bento.py
@@ -116,7 +116,7 @@ def test_bento_given_initial_nbytes_returns_expected_metadata() -> None:
     assert bento.start == pd.Timestamp("2020-12-28 13:00:00+0000", tz="UTC")
     assert bento.end == pd.Timestamp("2020-12-29 13:00:00+0000", tz="UTC")
     assert bento.limit == 2
-    assert bento.record_count == 2
+    assert len(bento.to_ndarray()) == 2
     assert bento.mappings == {
         "ESH1": [
             {
@@ -443,7 +443,7 @@ def test_from_dbn_alias() -> None:
 
     # Assert
     assert data.schema == Schema.MBO
-    assert data.record_count == 2
+    assert len(data.to_ndarray()) == 2
 
 
 def test_mbo_to_csv_writes_expected_file_to_disk(tmp_path: Path) -> None:
@@ -671,38 +671,6 @@ def test_mbp_1_to_json_with_all_options_writes_expected_file_to_disk(
     )
 
 
-@pytest.mark.parametrize(
-    "schema",
-    [
-        s
-        for s in Schema
-        if s
-        not in (
-            Schema.OHLCV_1H,
-            Schema.OHLCV_1D,
-            Schema.STATUS,
-            Schema.STATISTICS,
-            Schema.DEFINITION,
-            Schema.GATEWAY_ERROR,
-            Schema.SYMBOL_MAPPING,
-        )
-    ],
-)
-def test_bento_len(schema: Schema) -> None:
-    """
-    Check that calling `len()` on a Bento returns
-    the record count.
-    """
-    # Arrange
-    stub_data = get_test_data(schema=schema)
-
-    # Act
-    bento = Bento.from_bytes(data=stub_data)
-
-    # Assert
-    assert len(bento) == bento.record_count
-
-
 @pytest.mark.parametrize(
     "schema",
     [
@@ -732,7 +700,7 @@ def test_bento_repr(schema: Schema) -> None:
     bento = Bento.from_bytes(data=stub_data)
 
     # Assert
-    assert repr(bento) == f"<Bento(schema={schema}, record_count={bento.record_count})>"
+    assert repr(bento) == f"<Bento(schema={schema})>"
 
 
 def test_bento_iterable() -> None:
@@ -820,6 +788,6 @@ def test_bento_compression_equality(schema: Schema) -> None:
     zstd_bento = Bento.from_bytes(zstd_stub_data)
     dbn_bento = Bento.from_bytes(dbn_stub_data)
 
-    assert zstd_bento.record_count == dbn_bento.record_count
+    assert len(zstd_bento.to_ndarray()) == len(dbn_bento.to_ndarray())
     assert zstd_bento.metadata == dbn_bento.metadata
     assert zstd_bento.reader.read() == dbn_bento.reader.read()