Skip to content

Commit 4a84e46

Browse files
committed
MOD: Standardize batch model with DBZ metadata
1 parent 6726ae4 commit 4a84e46

File tree

4 files changed

+23
-30
lines changed

4 files changed

+23
-30
lines changed

databento/common/bento.py

Lines changed: 18 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,12 @@
55
import numpy as np
66
import pandas as pd
77
import zstandard
8-
from databento.common.data import DBZ_COLUMNS, DBZ_STRUCT_MAP, DERIV_SCHEMAS
8+
from databento.common.data import (
9+
CSV_HEADERS,
10+
DBZ_COLUMNS,
11+
DBZ_STRUCT_MAP,
12+
DERIV_SCHEMAS,
13+
)
914
from databento.common.enums import Compression, Encoding, Schema, SType
1015
from databento.common.logging import log_debug
1116
from databento.common.metadata import MetadataDecoder
@@ -28,9 +33,7 @@ def __init__(self):
2833
self._limit: Optional[int] = None
2934
self._encoding: Optional[Encoding] = None
3035
self._compression: Optional[Compression] = None
31-
self._shape: Optional[Tuple[int, int]] = None
32-
self._rows: Optional[int] = None
33-
self._cols: Optional[int] = None
36+
self._shape: Optional[Tuple] = None
3437

3538
def _check_metadata(self) -> None:
3639
if not self._metadata:
@@ -344,19 +347,26 @@ def compression(self) -> Compression:
344347
return self._compression
345348

346349
@property
347-
def shape(self) -> Tuple[int, int]:
350+
def shape(self) -> Tuple:
348351
"""
349352
Return the shape of the data.
350353
351354
Returns
352355
-------
353-
Tuple[int, int]
354-
The rows and columns.
356+
Tuple
357+
The data shape.
355358
356359
"""
357360
if self._shape is None:
358361
self._check_metadata()
359-
self._shape = (self._metadata["nrows"], self._metadata["ncols"])
362+
if self.encoding == Encoding.DBZ:
363+
ncols = len(DBZ_STRUCT_MAP[self.schema])
364+
else:
365+
ncols = len(CSV_HEADERS[self.schema])
366+
self._shape = (
367+
self._metadata["record_count"],
368+
ncols,
369+
)
360370

361371
return self._shape
362372

databento/common/metadata.py

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -16,22 +16,6 @@ class MetadataDecoder:
1616
"""
1717
Provides a decoder for Databento metadata headers.
1818
19-
Fixed query and shape metadata
20-
------------------------------
21-
version UInt8 1 1
22-
dataset Char[16] 16 17
23-
schema UInt8 1 18
24-
stype_in UInt8 1 19
25-
stype_out UInt8 1 20
26-
start UInt64 8 28
27-
end UInt64 8 36
28-
limit UInt64 8 44
29-
encoding UInt8 1 45
30-
compression UInt8 1 46
31-
nrows UInt64 8 54
32-
ncols UInt16 2 56
33-
padding x 40 96
34-
3519
References
3620
----------
3721
https://github.com/facebook/zstd/wiki

notebooks/quickstart.ipynb

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1393,13 +1393,12 @@
13931393
" 'limit': 100,\n",
13941394
" 'encoding': 'dbz',\n",
13951395
" 'compression': 'zstd',\n",
1396-
" 'nrows': 100,\n",
1397-
" 'ncols': 14,\n",
13981396
" 'split_duration': 'day',\n",
13991397
" 'split_size': None,\n",
14001398
" 'packaging': 'none',\n",
14011399
" 'delivery': 'download',\n",
14021400
" 'is_example': False,\n",
1401+
" 'record_count': 100,\n",
14031402
" 'billed_size': 4800,\n",
14041403
" 'actual_size': None,\n",
14051404
" 'package_size': None,\n",
@@ -1462,13 +1461,12 @@
14621461
" 'limit': 100,\n",
14631462
" 'encoding': 'dbz',\n",
14641463
" 'compression': 'zstd',\n",
1465-
" 'nrows': 100,\n",
1466-
" 'ncols': 14,\n",
14671464
" 'split_duration': 'day',\n",
14681465
" 'split_size': None,\n",
14691466
" 'packaging': 'none',\n",
14701467
" 'delivery': 'download',\n",
14711468
" 'is_example': False,\n",
1469+
" 'record_count': 100,\n",
14721470
" 'billed_size': 4800,\n",
14731471
" 'actual_size': None,\n",
14741472
" 'package_size': None,\n",

tests/test_historical_bento.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ def test_dataset_when_metadata_with_empty_bento_raises_runtime_error(self) -> No
3636
with pytest.raises(RuntimeError):
3737
data.dataset
3838

39+
@pytest.mark.skip(reason="skip until dbz-lib is integrated")
3940
def test_sources_metadata_returns_expected_json_as_dict(self) -> None:
4041
# Arrange
4142
stub_data = get_test_data(schema=Schema.MBO)
@@ -57,8 +58,7 @@ def test_sources_metadata_returns_expected_json_as_dict(self) -> None:
5758
"limit": 2,
5859
"encoding": "dbz",
5960
"compression": "zstd",
60-
"nrows": 2,
61-
"ncols": 14,
61+
"record_count": 2,
6262
"symbols": ["ESH1"],
6363
"status": 0,
6464
"partial": [],
@@ -70,6 +70,7 @@ def test_sources_metadata_returns_expected_json_as_dict(self) -> None:
7070
}
7171
assert data.metadata == metadata
7272

73+
@pytest.mark.skip(reason="skip until dbz-lib is integrated")
7374
def test_bento_given_initial_nbytes_returns_expected_metadata(self) -> None:
7475
# Arrange
7576
stub_data = get_test_data(schema=Schema.MBO)

0 commit comments

Comments
 (0)