|
4 | 4 | import decimal |
5 | 5 | import itertools |
6 | 6 | import logging |
| 7 | +import warnings |
7 | 8 | from collections.abc import Generator |
8 | 9 | from collections.abc import Iterator |
9 | 10 | from collections.abc import Mapping |
| 11 | +from io import BufferedReader |
10 | 12 | from io import BytesIO |
11 | 13 | from os import PathLike |
12 | 14 | from pathlib import Path |
|
46 | 48 | from databento.common.constants import SCHEMA_STRUCT_MAP |
47 | 49 | from databento.common.constants import SCHEMA_STRUCT_MAP_V1 |
48 | 50 | from databento.common.error import BentoError |
| 51 | +from databento.common.error import BentoWarning |
49 | 52 | from databento.common.symbology import InstrumentMap |
50 | 53 | from databento.common.types import DBNRecord |
51 | 54 | from databento.common.types import Default |
@@ -150,7 +153,7 @@ def __init__(self, source: PathLike[str] | str): |
150 | 153 | ) |
151 | 154 |
|
152 | 155 | self._name = self._path.name |
153 | | - self.__buffer: IO[bytes] | None = None |
| 156 | + self.__buffer: BufferedReader | None = None |
154 | 157 |
|
155 | 158 | @property |
156 | 159 | def name(self) -> str: |
@@ -189,13 +192,13 @@ def path(self) -> Path: |
189 | 192 | return self._path |
190 | 193 |
|
191 | 194 | @property |
192 | | - def reader(self) -> IO[bytes]: |
| 195 | + def reader(self) -> BufferedReader: |
193 | 196 | """ |
194 | 197 | Return a reader for this file. |
195 | 198 |
|
196 | 199 | Returns |
197 | 200 | ------- |
198 | | - IO |
| 201 | + BufferedReader |
199 | 202 |
|
200 | 203 | """ |
201 | 204 | if self.__buffer is None: |
@@ -259,14 +262,14 @@ def nbytes(self) -> int: |
259 | 262 | return self.__buffer.getbuffer().nbytes |
260 | 263 |
|
261 | 264 | @property |
262 | | - def reader(self) -> IO[bytes]: |
| 265 | + def reader(self) -> BytesIO: |
263 | 266 | """ |
264 | 267 | Return a reader for this buffer. The reader begins at the start of the |
265 | 268 | buffer. |
266 | 269 |
|
267 | 270 | Returns |
268 | 271 | ------- |
269 | | - IO |
| 272 | + BytesIO |
270 | 273 |
|
271 | 274 | """ |
272 | 275 | self.__buffer.seek(0) |
@@ -391,8 +394,8 @@ def __iter__(self) -> Generator[DBNRecord, None, None]: |
391 | 394 | yield record |
392 | 395 | else: |
393 | 396 | if len(decoder.buffer()) > 0: |
394 | | - raise BentoError( |
395 | | - "DBN file is truncated or contains an incomplete record", |
| 397 | + warnings.warn( |
| 398 | + BentoWarning("DBN file is truncated or contains an incomplete record"), |
396 | 399 | ) |
397 | 400 | break |
398 | 401 |
|
@@ -516,21 +519,18 @@ def reader(self) -> IO[bytes]: |
516 | 519 |
|
517 | 520 | Returns |
518 | 521 | ------- |
519 | | - BinaryIO |
| 522 | + IO[bytes] |
520 | 523 |
|
521 | 524 | See Also |
522 | 525 | -------- |
523 | 526 | DBNStore.raw |
524 | 527 |
|
525 | 528 | """ |
526 | 529 | if self.compression == Compression.ZSTD: |
527 | | - reader: IO[bytes] = zstandard.ZstdDecompressor().stream_reader( |
| 530 | + return zstandard.ZstdDecompressor().stream_reader( |
528 | 531 | self._data_source.reader, |
529 | 532 | ) |
530 | | - else: |
531 | | - reader = self._data_source.reader |
532 | | - |
533 | | - return reader |
| 533 | + return self._data_source.reader |
534 | 534 |
|
535 | 535 | @property |
536 | 536 | def schema(self) -> Schema | None: |
@@ -1281,8 +1281,10 @@ def _transcode( |
1281 | 1281 | transcoder.write(byte_chunk) |
1282 | 1282 |
|
1283 | 1283 | if transcoder.buffer(): |
1284 | | - raise BentoError( |
1285 | | - "DBN file is truncated or contains an incomplete record", |
| 1284 | + warnings.warn( |
| 1285 | + BentoWarning( |
| 1286 | + "DBN file is truncated or contains an incomplete record", |
| 1287 | + ), |
1286 | 1288 | ) |
1287 | 1289 |
|
1288 | 1290 | transcoder.flush() |
@@ -1327,28 +1329,38 @@ def __init__( |
1327 | 1329 | self._dtype = np.dtype(dtype) |
1328 | 1330 | self._offset = offset |
1329 | 1331 | self._count = count |
| 1332 | + self._close_on_next = False |
1330 | 1333 |
|
1331 | 1334 | self._reader.seek(offset) |
1332 | 1335 |
|
1333 | 1336 | def __iter__(self) -> NDArrayStreamIterator: |
1334 | 1337 | return self |
1335 | 1338 |
|
1336 | 1339 | def __next__(self) -> np.ndarray[Any, Any]: |
| 1340 | + if self._close_on_next: |
| 1341 | + raise StopIteration |
| 1342 | + |
1337 | 1343 | if self._count is None: |
1338 | 1344 | read_size = -1 |
1339 | 1345 | else: |
1340 | 1346 | read_size = self._dtype.itemsize * max(self._count, 1) |
1341 | 1347 |
|
1342 | 1348 | if buffer := self._reader.read(read_size): |
| 1349 | + loose_bytes = len(buffer) % self._dtype.itemsize |
| 1350 | + if loose_bytes != 0: |
| 1351 | + warnings.warn( |
| 1352 | + BentoWarning("DBN file is truncated or contains an incomplete record"), |
| 1353 | + ) |
| 1354 | + buffer = buffer[:-loose_bytes] |
| 1355 | + self._close_on_next = True # decode one more buffer before stopping |
| 1356 | + |
1343 | 1357 | try: |
1344 | 1358 | return np.frombuffer( |
1345 | 1359 | buffer=buffer, |
1346 | 1360 | dtype=self._dtype, |
1347 | 1361 | ) |
1348 | | - except ValueError: |
1349 | | - raise BentoError( |
1350 | | - "DBN file is truncated or contains an incomplete record", |
1351 | | - ) |
| 1362 | + except ValueError as exc: |
| 1363 | + raise BentoError("Cannot decode DBN stream") from exc |
1352 | 1364 |
|
1353 | 1365 | raise StopIteration |
1354 | 1366 |
|
@@ -1393,10 +1405,8 @@ def __next__(self) -> np.ndarray[Any, Any]: |
1393 | 1405 | dtype=self._dtype, |
1394 | 1406 | count=num_records, |
1395 | 1407 | ) |
1396 | | - except ValueError: |
1397 | | - raise BentoError( |
1398 | | - "DBN file is truncated or contains an incomplete record", |
1399 | | - ) from None |
| 1408 | + except ValueError as exc: |
| 1409 | + raise BentoError("Cannot decode DBN stream") from exc |
1400 | 1410 |
|
1401 | 1411 |
|
1402 | 1412 | class DataFrameIterator: |
|
0 commit comments