
Commit 1952440

MOD: Improve DBNStore handling of truncated DBN
1 parent 14c452f

File tree

3 files changed (+215, -35 lines)

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
 - Added `mode` parameter to `DBNStore.to_parquet` to control the file writing mode
 - Added `compression` parameter to `DBNStore.to_file` which controls the output compression format
 - Added new consolidated publisher values for `XNAS.BASIC` and `DBEQ.MAX`
+- Changed `DBNStore` to be more tolerant of truncated DBN streams

 #### Breaking changes
 - Changed default write mode for `DBNStore.to_csv` to overwrite ("w")
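
For downstream code, the practical effect of the new entry is that a DBN file cut off mid-record no longer raises `BentoError` when it is read back: the complete leading records are decoded and a `BentoWarning` is emitted instead. Below is a minimal sketch of opting back into the previous strict behavior; it assumes the public `DBNStore.from_file` and `to_df` methods, and the file name is illustrative.

```python
import warnings

import databento as db
from databento.common.error import BentoWarning

# Escalate truncation warnings into exceptions, restoring the old
# raise-on-truncation behavior for this process.
warnings.simplefilter("error", BentoWarning)

store = db.DBNStore.from_file("truncated.dbn.zst")  # illustrative path
try:
    df = store.to_df()
except BentoWarning:
    print("input looks truncated; consider re-downloading before processing")
```

When a warning category is escalated with `simplefilter("error", ...)`, the `BentoWarning` instance itself is raised, so it can be caught like any other exception.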

databento/common/dbnstore.py

Lines changed: 33 additions & 23 deletions
@@ -4,9 +4,11 @@
 import decimal
 import itertools
 import logging
+import warnings
 from collections.abc import Generator
 from collections.abc import Iterator
 from collections.abc import Mapping
+from io import BufferedReader
 from io import BytesIO
 from os import PathLike
 from pathlib import Path
@@ -46,6 +48,7 @@
 from databento.common.constants import SCHEMA_STRUCT_MAP
 from databento.common.constants import SCHEMA_STRUCT_MAP_V1
 from databento.common.error import BentoError
+from databento.common.error import BentoWarning
 from databento.common.symbology import InstrumentMap
 from databento.common.types import DBNRecord
 from databento.common.types import Default
@@ -150,7 +153,7 @@ def __init__(self, source: PathLike[str] | str):
         )

         self._name = self._path.name
-        self.__buffer: IO[bytes] | None = None
+        self.__buffer: BufferedReader | None = None

     @property
     def name(self) -> str:
@@ -189,13 +192,13 @@ def path(self) -> Path:
         return self._path

     @property
-    def reader(self) -> IO[bytes]:
+    def reader(self) -> BufferedReader:
         """
         Return a reader for this file.

         Returns
         -------
-        IO
+        BufferedReader

         """
         if self.__buffer is None:
@@ -259,14 +262,14 @@ def nbytes(self) -> int:
         return self.__buffer.getbuffer().nbytes

     @property
-    def reader(self) -> IO[bytes]:
+    def reader(self) -> BytesIO:
         """
         Return a reader for this buffer. The reader begins at the start of the
         buffer.

         Returns
         -------
-        IO
+        BytesIO

         """
         self.__buffer.seek(0)
@@ -391,8 +394,8 @@ def __iter__(self) -> Generator[DBNRecord, None, None]:
                     yield record
             else:
                 if len(decoder.buffer()) > 0:
-                    raise BentoError(
-                        "DBN file is truncated or contains an incomplete record",
+                    warnings.warn(
+                        BentoWarning("DBN file is truncated or contains an incomplete record"),
                     )
                 break

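Because `__iter__` now warns instead of raising, every record that decodes cleanly is still yielded before the leftover bytes are reported. Here is a sketch of consuming a possibly-truncated store while still detecting the condition; it only assumes that `store` is an iterable `DBNStore`.

```python
import warnings

from databento.common.error import BentoWarning


def read_all_records(store):
    """Collect every decodable record and report whether the stream was cut short."""
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")  # make sure the warning is not filtered out
        records = list(store)  # complete records are yielded; truncation only warns
    truncated = any(issubclass(w.category, BentoWarning) for w in caught)
    return records, truncated
```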

@@ -516,21 +519,18 @@ def reader(self) -> IO[bytes]:

         Returns
         -------
-        BinaryIO
+        IO[bytes]

         See Also
         --------
         DBNStore.raw

         """
         if self.compression == Compression.ZSTD:
-            reader: IO[bytes] = zstandard.ZstdDecompressor().stream_reader(
+            return zstandard.ZstdDecompressor().stream_reader(
                 self._data_source.reader,
             )
-        else:
-            reader = self._data_source.reader
-
-        return reader
+        return self._data_source.reader

     @property
     def schema(self) -> Schema | None:
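
The rewritten `reader` property is the familiar decompress-or-passthrough pattern: wrap the underlying source in a `zstandard` streaming decompressor when the payload is ZSTD-compressed, otherwise hand the source reader back unchanged. A standalone sketch of the same pattern follows; the `is_zstd` helper is hypothetical and simply peeks at the ZSTD frame magic number.

```python
from typing import IO

import zstandard

ZSTD_MAGIC = b"\x28\xb5\x2f\xfd"  # ZSTD frame magic number (little-endian)


def is_zstd(raw: IO[bytes]) -> bool:
    """Hypothetical check: peek at the first four bytes, then rewind."""
    header = raw.read(4)
    raw.seek(0)
    return header == ZSTD_MAGIC


def open_reader(raw: IO[bytes]) -> IO[bytes]:
    """Return a decompressing reader for ZSTD input, or the raw reader as-is."""
    if is_zstd(raw):
        return zstandard.ZstdDecompressor().stream_reader(raw)
    return raw
```

Returning early from the ZSTD branch, as the diff now does, drops the intermediate `reader` variable without changing behavior.
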
@@ -1281,8 +1281,10 @@ def _transcode(
             transcoder.write(byte_chunk)

         if transcoder.buffer():
-            raise BentoError(
-                "DBN file is truncated or contains an incomplete record",
+            warnings.warn(
+                BentoWarning(
+                    "DBN file is truncated or contains an incomplete record",
+                ),
             )

         transcoder.flush()
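
The transcoding path used by the text exports gets the same treatment: bytes left in the transcoder buffer after the last chunk now produce a `BentoWarning` once all complete records have been written, rather than aborting the export. Below is a sketch of accepting such partial input during a bulk export; it assumes `DBNStore.from_file` and `to_csv`, and the file names are illustrative.

```python
import warnings

import databento as db
from databento.common.error import BentoWarning

store = db.DBNStore.from_file("partial_session.dbn.zst")  # illustrative path

# Keep whichever complete records are present and silence the truncation
# warning for this export only.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=BentoWarning)
    store.to_csv("partial_session.csv")  # illustrative output path
```
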
@@ -1327,28 +1329,38 @@ def __init__(
         self._dtype = np.dtype(dtype)
         self._offset = offset
         self._count = count
+        self._close_on_next = False

         self._reader.seek(offset)

     def __iter__(self) -> NDArrayStreamIterator:
         return self

     def __next__(self) -> np.ndarray[Any, Any]:
+        if self._close_on_next:
+            raise StopIteration
+
         if self._count is None:
             read_size = -1
         else:
             read_size = self._dtype.itemsize * max(self._count, 1)

         if buffer := self._reader.read(read_size):
+            loose_bytes = len(buffer) % self._dtype.itemsize
+            if loose_bytes != 0:
+                warnings.warn(
+                    BentoWarning("DBN file is truncated or contains an incomplete record"),
+                )
+                buffer = buffer[:-loose_bytes]
+                self._close_on_next = True  # decode one more buffer before stopping
+
             try:
                 return np.frombuffer(
                     buffer=buffer,
                     dtype=self._dtype,
                 )
-            except ValueError:
-                raise BentoError(
-                    "DBN file is truncated or contains an incomplete record",
-                )
+            except ValueError as exc:
+                raise BentoError("Cannot decode DBN stream") from exc

         raise StopIteration

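The `loose_bytes` arithmetic is what keeps `np.frombuffer` happy on a short read: `np.frombuffer` raises `ValueError` unless the buffer length is an exact multiple of the dtype's `itemsize`, so any remainder is warned about and trimmed before decoding. A small self-contained illustration with a made-up record layout:

```python
import numpy as np

# Made-up fixed-width record layout: 8-byte timestamp plus 4-byte price.
record_dtype = np.dtype([("ts_event", "<u8"), ("price", "<i4")])  # itemsize == 12

buffer = bytes(12 * 3 + 5)  # three complete records plus 5 stray trailing bytes

loose_bytes = len(buffer) % record_dtype.itemsize  # -> 5
if loose_bytes:
    buffer = buffer[:-loose_bytes]  # drop the incomplete trailing record

records = np.frombuffer(buffer, dtype=record_dtype)
print(len(records))  # -> 3
```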

@@ -1393,10 +1405,8 @@ def __next__(self) -> np.ndarray[Any, Any]:
                 dtype=self._dtype,
                 count=num_records,
             )
-        except ValueError:
-            raise BentoError(
-                "DBN file is truncated or contains an incomplete record",
-            ) from None
+        except ValueError as exc:
+            raise BentoError("Cannot decode DBN stream") from exc


 class DataFrameIterator:
