Commit c1749aa

MOD: Use Transcoder for DBNStore encoders

1 parent 686ce41

File tree: 8 files changed, +202 −172 lines

CHANGELOG.md

Lines changed: 4 additions & 0 deletions

@@ -4,6 +4,10 @@
 
 #### Enhancements
 - Added `price_type` argument for `DBNStore.to_df` to specify if price fields should be `fixed`, `float` or `decimal.Decimal`
+- Upgraded `databento-dbn` to 0.12.0
+
+#### Breaking Changes
+- Changed outputs of `DBNStore.to_csv` and `DBNStore.to_json` to match the encoding formats from the Databento API
 
 #### Deprecations
 - Deprecated `pretty_px` argument for `DBNStore.to_df` to be removed in a future release; the default `pretty_px=True` is now equivalent to `price_type="float"` and `pretty_px=False` is now equivalent to `price_type="fixed"`
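The deprecation maps one-to-one onto the new argument. A sketch of the equivalence, assuming a `DBNStore` loaded from a local file (the path is illustrative):

from databento import DBNStore

store = DBNStore.from_file("data.dbn")  # illustrative path

df_float = store.to_df(price_type="float")    # was: pretty_px=True (the default)
df_fixed = store.to_df(price_type="fixed")    # was: pretty_px=False
df_dec = store.to_df(price_type="decimal")    # new: prices as decimal.Decimal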

README.md

Lines changed: 1 addition & 1 deletion

@@ -32,7 +32,7 @@ The library is fully compatible with the latest distribution of Anaconda 3.8 and
 The minimum dependencies as found in the `pyproject.toml` are also listed below:
 - python = "^3.8"
 - aiohttp = "^3.8.3"
-- databento-dbn = "0.11.1"
+- databento-dbn = "0.12.0"
 - numpy = ">=1.23.5"
 - pandas = ">=1.5.3"
 - requests = ">=2.24.0"

databento/common/dbnstore.py

Lines changed: 91 additions & 36 deletions

@@ -11,7 +11,7 @@
 from io import BytesIO
 from os import PathLike
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Literal, overload
+from typing import IO, TYPE_CHECKING, Any, BinaryIO, Callable, Literal, overload
 
 import databento_dbn
 import numpy as np
@@ -20,19 +20,24 @@
 from databento_dbn import FIXED_PRICE_SCALE
 from databento_dbn import Compression
 from databento_dbn import DBNDecoder
+from databento_dbn import Encoding
 from databento_dbn import ErrorMsg
 from databento_dbn import Metadata
 from databento_dbn import Schema
 from databento_dbn import SType
 from databento_dbn import SymbolMappingMsg
 from databento_dbn import SystemMsg
+from databento_dbn import Transcoder
 
 from databento.common.data import DEFINITION_TYPE_MAX_MAP
 from databento.common.data import SCHEMA_COLUMNS
 from databento.common.data import SCHEMA_DTYPES_MAP
 from databento.common.data import SCHEMA_STRUCT_MAP
 from databento.common.error import BentoError
+from databento.common.iterator import chunk
 from databento.common.symbology import InstrumentMap
+from databento.common.symbology import SymbolInterval
+from databento.common.validation import validate_enum
 from databento.common.validation import validate_file_write_path
 from databento.common.validation import validate_maybe_enum
 from databento.live import DBNRecord
@@ -763,6 +768,7 @@ def to_csv(
         pretty_px: bool = True,
         pretty_ts: bool = True,
         map_symbols: bool = True,
+        compression: Compression | str = Compression.NONE,
         schema: Schema | str | None = None,
     ) -> None:
         """
@@ -783,6 +789,8 @@ def to_csv(
             If symbology mappings from the metadata should be used to create
             a 'symbol' column, mapping the instrument ID to its requested symbol for
             every record.
+        compression : Compression or str, default `Compression.NONE`
+            The output compression for writing.
         schema : Schema or str, optional
             The schema for the csv.
             This is only required when reading a DBN stream with mixed record types.
@@ -797,24 +805,33 @@
         Requires all the data to be brought up into memory to then be written.
 
         """
-        price_type: Literal["fixed", "float"] = "fixed"
-        if pretty_px is True:
-            price_type = "float"
+        compression = validate_enum(compression, Compression, "compression")
+        schema = validate_maybe_enum(schema, Schema, "schema")
+        if schema is None:
+            if self.schema is None:
+                raise ValueError("a schema must be specified for mixed DBN data")
+            schema = self.schema
 
-        df_iter = self.to_df(
-            price_type=price_type,
-            pretty_ts=pretty_ts,
-            map_symbols=map_symbols,
-            schema=schema,
-            count=2**16,
-        )
+        record_type = SCHEMA_STRUCT_MAP[schema]
+        record_iter = filter(lambda r: isinstance(r, record_type), self)
 
-        with open(path, "x", newline="") as csv_file:
-            for i, frame in enumerate(df_iter):
-                frame.to_csv(
-                    csv_file,
-                    header=(i == 0),
-                )
+        if map_symbols:
+            self._instrument_map.insert_metadata(self.metadata)
+            symbol_map = self._instrument_map._data
+        else:
+            symbol_map = None
+
+        with open(path, "xb") as output:
+            self._transcode(
+                output=output,
+                records_iter=record_iter,
+                encoding=Encoding.CSV,
+                pretty_px=pretty_px,
+                pretty_ts=pretty_ts,
+                symbol_map=symbol_map,
+                compression=compression,
+                schema=schema,
+            )
 
     @overload
     def to_df(
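For context outside the diff, a minimal sketch of the new `to_csv` call path. The file paths are illustrative, and `Compression.ZSTD` assumes the zstd member exposed by the `databento_dbn` `Compression` enum.

from databento import DBNStore
from databento_dbn import Compression

store = DBNStore.from_file("data.dbn")  # illustrative input path

# CSV is now produced by the DBN Transcoder rather than pandas, so the
# output matches the Databento API's CSV encoding.
store.to_csv(
    "data.csv.zst",
    pretty_px=True,    # human-readable prices
    pretty_ts=True,    # human-readable timestamps
    map_symbols=True,  # resolve instrument IDs to symbols
    compression=Compression.ZSTD,  # new argument: compress the output
)

Note the `open(path, "xb")` in the diff: the output file is now opened in binary mode and must not already exist.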
@@ -965,6 +982,7 @@ def to_json(
         pretty_px: bool = True,
         pretty_ts: bool = True,
         map_symbols: bool = True,
+        compression: Compression | str = Compression.NONE,
         schema: Schema | str | None = None,
     ) -> None:
         """
@@ -984,6 +1002,8 @@ def to_json(
             If symbology mappings from the metadata should be used to create
             a 'symbol' column, mapping the instrument ID to its requested symbol for
             every record.
+        compression : Compression or str, default `Compression.NONE`
+            The output compression for writing.
         schema : Schema or str, optional
             The schema for the json.
             This is only required when reading a DBN stream with mixed record types.
@@ -998,27 +1018,33 @@
         Requires all the data to be brought up into memory to then be written.
 
         """
-        price_type: Literal["fixed", "float"] = "fixed"
-        if pretty_px is True:
-            price_type = "float"
+        compression = validate_enum(compression, Compression, "compression")
+        schema = validate_maybe_enum(schema, Schema, "schema")
+        if schema is None:
+            if self.schema is None:
+                raise ValueError("a schema must be specified for mixed DBN data")
+            schema = self.schema
 
-        df_iter = self.to_df(
-            price_type=price_type,
-            pretty_ts=pretty_ts,
-            map_symbols=map_symbols,
-            schema=schema,
-            count=2**16,
-        )
+        record_type = SCHEMA_STRUCT_MAP[schema]
+        record_iter = filter(lambda r: isinstance(r, record_type), self)
 
-        with open(path, "x") as json_path:
-            for frame in df_iter:
-                frame.reset_index(inplace=True)
-                frame.to_json(
-                    json_path,
-                    orient="records",
-                    date_unit="ns",
-                    lines=True,
-                )
+        if map_symbols:
+            self._instrument_map.insert_metadata(self.metadata)
+            symbol_map = self._instrument_map._data
+        else:
+            symbol_map = None
+
+        with open(path, "xb") as output:
+            self._transcode(
+                output=output,
+                records_iter=record_iter,
+                encoding=Encoding.JSON,
+                pretty_px=pretty_px,
+                pretty_ts=pretty_ts,
+                symbol_map=symbol_map,
+                compression=compression,
+                schema=schema,
+            )
 
     @overload
     def to_ndarray(  # type: ignore [misc]
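`to_json` takes the identical path with `Encoding.JSON`. Continuing the sketch above: for a store holding mixed record types, `schema` selects which records are encoded, and omitting it raises `ValueError` per the validation added above.

# Continuing the earlier sketch; "trades" assumes the store actually
# contains trades records.
store.to_json(
    "trades.json",
    schema="trades",
    map_symbols=True,
)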
@@ -1085,6 +1111,35 @@ def to_ndarray(
 
         return ndarray_iter
 
+    def _transcode(
+        self,
+        output: BinaryIO,
+        records_iter: Iterator[DBNRecord],
+        encoding: Encoding,
+        pretty_px: bool,
+        pretty_ts: bool,
+        symbol_map: dict[int, list[SymbolInterval]] | None,
+        compression: Compression,
+        schema: Schema,
+    ) -> None:
+        transcoder = Transcoder(
+            file=output,
+            encoding=encoding,
+            compression=compression,
+            pretty_px=pretty_px,
+            pretty_ts=pretty_ts,
+            has_metadata=True,
+            input_compression=Compression.NONE,
+            symbol_map=symbol_map,  # type: ignore [arg-type]
+            schema=schema,
+        )
+
+        transcoder.write(bytes(self.metadata))
+        for records in chunk(records_iter, 2**16):
+            for record in records:
+                transcoder.write(bytes(record))
+        transcoder.flush()
+
 
 class NDArrayIterator:
     def __init__(
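The `_transcode` helper above is the entire integration: write the stream's `Metadata` first, then feed records through in chunks of 2**16, then flush. A stand-alone sketch of the same flow against an in-memory buffer; it assumes the `Transcoder` keyword arguments not shown here (`symbol_map`, `schema`) are optional, and `raw_dbn` is a placeholder for a complete, uncompressed DBN byte stream.

from io import BytesIO

from databento_dbn import Compression
from databento_dbn import Encoding
from databento_dbn import Transcoder

output = BytesIO()
transcoder = Transcoder(
    file=output,
    encoding=Encoding.CSV,
    compression=Compression.NONE,
    pretty_px=True,
    pretty_ts=True,
    has_metadata=True,  # the stream begins with DBN metadata
    input_compression=Compression.NONE,
)
transcoder.write(raw_dbn)  # placeholder: metadata bytes, then record bytes
transcoder.flush()
print(output.getvalue().decode())  # the transcoded CSV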

databento/common/iterator.py

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+import itertools
+from collections.abc import Iterable
+from typing import TypeVar
+
+
+_C = TypeVar("_C")
+
+
+def chunk(iterable: Iterable[_C], size: int) -> Iterable[tuple[_C, ...]]:
+    """
+    Break an iterable into chunks with a length of at most `size`.
+
+    Parameters
+    ----------
+    iterable: Iterable[_C]
+        The iterable to break up.
+    size : int
+        The maximum size of each chunk.
+
+    Returns
+    -------
+    Iterable[_C]
+
+    Raises
+    ------
+    ValueError
+        If `size` is less than 1.
+
+    """
+    if size < 1:
+        raise ValueError("size must be at least 1")
+
+    it = iter(iterable)
+    return iter(lambda: tuple(itertools.islice(it, size)), ())
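The two-argument form `iter(callable, sentinel)` calls the lambda repeatedly and stops once it returns the sentinel `()`, i.e. once `itertools.islice` finds the underlying iterator exhausted. A quick sketch of the resulting behavior:

from databento.common.iterator import chunk

print(list(chunk(range(7), 3)))  # [(0, 1, 2), (3, 4, 5), (6,)]
print(list(chunk([], 3)))        # [] -- an empty iterable yields no chunks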

databento/live/protocol.py

Lines changed: 1 addition & 33 deletions

@@ -1,19 +1,18 @@
 from __future__ import annotations
 
 import asyncio
-import itertools
 import logging
 from collections.abc import Iterable
 from functools import singledispatchmethod
 from numbers import Number
-from typing import TypeVar
 
 import databento_dbn
 from databento_dbn import Schema
 from databento_dbn import SType
 
 from databento.common import cram
 from databento.common.error import BentoError
+from databento.common.iterator import chunk
 from databento.common.parsing import optional_datetime_to_unix_nanoseconds
 from databento.common.parsing import optional_symbols_list_to_list
 from databento.common.publishers import Dataset
@@ -36,37 +35,6 @@
 logger = logging.getLogger(__name__)
 
 
-_C = TypeVar("_C")
-
-
-def chunk(iterable: Iterable[_C], size: int) -> Iterable[tuple[_C, ...]]:
-    """
-    Break an iterable into chunks with a length of at most `size`.
-
-    Parameters
-    ----------
-    iterable: Iterable[_C]
-        The iterable to break up.
-    size : int
-        The maximum size of each chunk.
-
-    Returns
-    -------
-    Iterable[_C]
-
-    Raises
-    ------
-    ValueError
-        If `size` is less than 1.
-
-    """
-    if size < 1:
-        raise ValueError("size must be at least 1")
-
-    it = iter(iterable)
-    return iter(lambda: tuple(itertools.islice(it, size)), ())
-
-
 class DatabentoLiveProtocol(asyncio.BufferedProtocol):
     """
     A BufferedProtocol implementation for the Databento live subscription

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ repository = "https://github.com/databento/databento-python"
 [tool.poetry.dependencies]
 python = "^3.8"
 aiohttp = "^3.8.3"
-databento-dbn = "0.11.1"
+databento-dbn = "0.12.0"
 numpy = ">=1.23.5"
 pandas = ">=1.5.3"
 requests = ">=2.24.0"

tests/test_common_iterator.py

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import pytest
+from databento.common import iterator
+
+
+@pytest.mark.parametrize(
+    "things, size, expected",
+    [
+        (
+            "abcdefg",
+            2,
+            [
+                ("a", "b"),
+                ("c", "d"),
+                ("e", "f"),
+                ("g",),
+            ],
+        ),
+    ],
+)
+def test_chunk(
+    things: Iterable[object],
+    size: int,
+    expected: Iterable[tuple[object]],
+) -> None:
+    """
+    Test that an iterable is chunked properly.
+    """
+    chunks = [chunk for chunk in iterator.chunk(things, size)]
+    assert chunks == expected
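A companion case this commit does not include, sketched here to cover the `ValueError` branch documented in `chunk`:

import pytest

from databento.common import iterator


@pytest.mark.parametrize("size", [0, -1])
def test_chunk_invalid_size(size: int) -> None:
    """
    Test that a chunk size below 1 raises ValueError.
    """
    with pytest.raises(ValueError):
        iterator.chunk("abc", size)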
