ADD: Add native symbol mapping for dataframe column

cjdsellers · cjdsellers · commit 2be86d2647f3 · 2022-09-13T01:59:01.000Z
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@ Key features include:
 - [Fully normalized](https://docs0.databento.com/knowledge-base/new-users/normalization?historical=python&live=python), i.e. identical message schemas for both live and historical data, across multiple asset classes.
 - Provides mappings between different symbology systems, including [smart symbology](https://docs0.databento.com/reference-historical/basics/symbology?historical=python&live=python) for futures rollovers.
 - [Point-in-time]() instrument definitions, free of look-ahead bias and retroactive adjustments.
-- Reads and stores market data in an extremely efficient file format using [Databento Binary Encoding]().
+- Reads and stores market data in an extremely efficient file format using [Databento Binary Encoding](https://docs0.databento.com/knowledge-base/new-users/dbz-format?historical=python&live=python).
 - Event-driven [market replay](https://docs0.databento.com/reference-historical/helpers/bento-replay?historical=python&live=python), including at high-frequency order book granularity.
 - Support for [batch download](https://docs0.databento.com/knowledge-base/new-users/historical-data-streaming-vs-batch-download?historical=python&live=python) of flat files.
 - Support for [pandas](https://pandas.pydata.org/docs/), CSV, and JSON.
diff --git a/databento/common/bento.py b/databento/common/bento.py
@@ -1,3 +1,4 @@
+import datetime as dt
 import io
 import os.path
 from typing import Any, BinaryIO, Callable, Dict, List, Optional, Tuple
@@ -9,6 +10,7 @@
 from databento.common.enums import Compression, Encoding, Schema, SType
 from databento.common.logging import log_debug
 from databento.common.metadata import MetadataDecoder
+from databento.common.symbology import ProductIdMappingInterval
 
 
 class Bento:
@@ -17,6 +19,7 @@ class Bento:
     def __init__(self):
         self._metadata: Dict[str, Any] = {}
         self._dtype: Optional[np.dtype] = None
+        self._product_id_index: Dict[dt.date, Dict[int, str]] = {}
 
         self._dataset: Optional[str] = None
         self._schema: Optional[Schema] = None
@@ -353,13 +356,13 @@ def shape(self) -> Tuple:
         return self._shape
 
     @property
-    def mappings(self) -> List[Dict[str, List[Dict[str, str]]]]:
+    def mappings(self) -> Dict[str, List[Dict[str, Any]]]:
         """
         Return the symbology mappings for the data.
 
         Returns
         -------
-        List[Dict[str, List[Dict[str, str]]]]
+        Dict[str, List[Dict[str, Any]]]
 
         """
         self._check_metadata()
@@ -369,7 +372,7 @@ def mappings(self) -> List[Dict[str, List[Dict[str, str]]]]:
     @property
     def symbology(self) -> Dict[str, Any]:
         """
-        Return the symbology resolution information for the query.
+        Return the symbology resolution mappings for the data.
 
         Returns
         -------
@@ -378,30 +381,18 @@ def symbology(self) -> Dict[str, Any]:
         """
         self._check_metadata()
 
-        status = 0
-        if self._metadata["partial"]:
-            status = 1
-            message = "Partially resolved"
-        elif self._metadata["not_found"]:
-            status = 2
-            message = "Not found"
-        else:
-            message = "OK"
-
-        response: Dict[str, Any] = {
-            "result": self.mappings,
+        symbology: Dict[str, Any] = {
             "symbols": self.symbols,
             "stype_in": self.stype_in.value,
             "stype_out": self.stype_out.value,
             "start_date": str(self.start.date()),
             "end_date": str(self.end.date()),
             "partial": self._metadata["partial"],
             "not_found": self._metadata["not_found"],
-            "message": message,
-            "status": status,
+            "mappings": self.mappings,
         }
 
-        return response
+        return symbology
 
     def to_ndarray(self) -> np.ndarray:
         """
@@ -415,7 +406,12 @@ def to_ndarray(self) -> np.ndarray:
         data: bytes = self.reader(decompress=True).read()
         return np.frombuffer(data, dtype=DBZ_STRUCT_MAP[self.schema])
 
-    def to_df(self, pretty_ts: bool = False, pretty_px: bool = False) -> pd.DataFrame:
+    def to_df(
+        self,
+        pretty_ts: bool = False,
+        pretty_px: bool = False,
+        map_symbols: bool = False,
+    ) -> pd.DataFrame:
         """
         Return the data as a `pd.DataFrame`.
 
@@ -427,6 +423,10 @@ def to_df(self, pretty_ts: bool = False, pretty_px: bool = False) -> pd.DataFram
         pretty_px : bool, default False
             If all price columns should be converted from `int` to `float` at
             the correct scale (using the fixed precision scalar 1e-9).
+        map_symbols : bool, default False
+            If symbology mappings from the metadata should be used to create
+            a 'symbol' column, mapping the product ID to its native symbol for
+            every record.
 
         Returns
         -------
@@ -467,6 +467,20 @@ def to_df(self, pretty_ts: bool = False, pretty_px: bool = False) -> pd.DataFram
                 ):
                     df[column] = df[column] * 1e-9
 
+        if map_symbols:
+            # Build product ID index
+            if not self._product_id_index:
+                self._product_id_index = self._build_product_id_index()
+
+            # Map product IDs to native symbols
+            if self._product_id_index:
+                df_index = df.index if pretty_ts else pd.to_datetime(df.index, utc=True)
+                dates = [ts.date() for ts in df_index]
+                df["symbol"] = [
+                    self._product_id_index[dates[i]][p]
+                    for i, p in enumerate(df["product_id"])
+                ]
+
         return df
 
     def replay(self, callback: Callable[[Any], None]) -> None:
@@ -643,6 +657,37 @@ def request_full_definitions(
             path=path,
         )
 
+    def _build_product_id_index(self) -> Dict[dt.date, Dict[int, str]]:
+        intervals: List[ProductIdMappingInterval] = []
+        for native, i in self.mappings.items():
+            for row in i:
+                symbol = row["symbol"]
+                if symbol == "":
+                    continue
+                intervals.append(
+                    ProductIdMappingInterval(
+                        start_date=row["start_date"],
+                        end_date=row["end_date"],
+                        native=native,
+                        product_id=int(row["symbol"]),
+                    )
+                )
+
+        product_id_index: Dict[dt.date, Dict[int, str]] = {}
+        for interval in intervals:
+            for ts in pd.date_range(
+                start=interval.start_date,
+                end=interval.end_date,
+                inclusive="left",
+            ):
+                d: dt.date = ts.date()
+                date_map: Dict[int, str] = product_id_index.get(d, {})
+                if not date_map:
+                    product_id_index[d] = date_map
+                date_map[interval.product_id] = interval.native
+
+        return product_id_index
+
 
 class MemoryBento(Bento):
     """
diff --git a/databento/common/metadata.py b/databento/common/metadata.py
@@ -37,6 +37,8 @@ def enum_value(fn):
             "stype_in": enum_value(int_to_stype),
             "stype_out": enum_value(int_to_stype),
         }
+
         for key, conv_fn in conversion_mapping.items():
             metadata[key] = conv_fn(metadata[key])
+
         return metadata
diff --git a/databento/common/symbology.py b/databento/common/symbology.py
@@ -0,0 +1,40 @@
+import datetime as dt
+
+
+class ProductIdMappingInterval:
+    """
+    Represents a mapping from a product ID to its native symbol, valid
+    over the date interval between a start date and an end date.
+
+    Parameters
+    ----------
+    start_date : dt.date
+        The start of the mapping period.
+    end_date : dt.date
+        The end of the mapping period.
+    native : str
+        The native symbol value.
+    product_id : int
+        The product ID value.
+    """
+
+    def __init__(
+        self,
+        start_date: dt.date,
+        end_date: dt.date,
+        native: str,
+        product_id: int,
+    ):
+        self.start_date = start_date
+        self.end_date = end_date
+        self.native = native
+        self.product_id = product_id
+
+    def __repr__(self):
+        return (
+            f"{type(self).__name__}("
+            f"start_date={self.start_date}, "
+            f"end_date={self.end_date}, "
+            f"native='{self.native}', "
+            f"product_id={self.product_id})"
+        )
diff --git a/examples/historical_timeseries_to_df.py b/examples/historical_timeseries_to_df.py
@@ -20,4 +20,4 @@
     )
 
     # Convert to pandas dataframe
-    pprint(data.to_df())
+    pprint(data.to_df(map_symbols=True))
diff --git a/notebooks/symbology.ipynb b/notebooks/symbology.ipynb
diff --git a/tests/test_historical_bento.py b/tests/test_historical_bento.py

Original file line number	Diff line number	Diff line change
`@@ -37,6 +37,8 @@ def enum_value(fn):`
`37`	`37`	`"stype_in": enum_value(int_to_stype),`
`38`	`38`	`"stype_out": enum_value(int_to_stype),`
`39`	`39`	`}`
	`40`	`+`
`40`	`41`	`for key, conv_fn in conversion_mapping.items():`
`41`	`42`	`metadata[key] = conv_fn(metadata[key])`
	`43`	`+`
`42`	`44`	`return metadata`
Original file line number	Diff line number	Diff line change
`@@ -20,4 +20,4 @@`
`20`	`20`	`)`
`21`	`21`
`22`	`22`	`# Convert to pandas dataframe`
`23`		`- pprint(data.to_df())`
	`23`	`+ pprint(data.to_df(map_symbols=True))`