Skip to content

Commit 2be86d2

Browse files
committed
ADD: Add native symbol mapping for dataframe column
1 parent 304e36b commit 2be86d2

File tree

7 files changed

+127
-343
lines changed

7 files changed

+127
-343
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Key features include:
1414
- [Fully normalized](https://docs0.databento.com/knowledge-base/new-users/normalization?historical=python&live=python), i.e. identical message schemas for both live and historical data, across multiple asset classes.
1515
- Provides mappings between different symbology systems, including [smart symbology](https://docs0.databento.com/reference-historical/basics/symbology?historical=python&live=python) for futures rollovers.
1616
- [Point-in-time]() instrument definitions, free of look-ahead bias and retroactive adjustments.
17-
- Reads and stores market data in an extremely efficient file format using [Databento Binary Encoding]().
17+
- Reads and stores market data in an extremely efficient file format using [Databento Binary Encoding](https://docs0.databento.com/knowledge-base/new-users/dbz-format?historical=python&live=python).
1818
- Event-driven [market replay](https://docs0.databento.com/reference-historical/helpers/bento-replay?historical=python&live=python), including at high-frequency order book granularity.
1919
- Support for [batch download](https://docs0.databento.com/knowledge-base/new-users/historical-data-streaming-vs-batch-download?historical=python&live=python) of flat files.
2020
- Support for [pandas](https://pandas.pydata.org/docs/), CSV, and JSON.

databento/common/bento.py

Lines changed: 64 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import datetime as dt
12
import io
23
import os.path
34
from typing import Any, BinaryIO, Callable, Dict, List, Optional, Tuple
@@ -9,6 +10,7 @@
910
from databento.common.enums import Compression, Encoding, Schema, SType
1011
from databento.common.logging import log_debug
1112
from databento.common.metadata import MetadataDecoder
13+
from databento.common.symbology import ProductIdMappingInterval
1214

1315

1416
class Bento:
@@ -17,6 +19,7 @@ class Bento:
1719
def __init__(self):
1820
self._metadata: Dict[str, Any] = {}
1921
self._dtype: Optional[np.dtype] = None
22+
self._product_id_index: Dict[dt.date, Dict[int, str]] = {}
2023

2124
self._dataset: Optional[str] = None
2225
self._schema: Optional[Schema] = None
@@ -353,13 +356,13 @@ def shape(self) -> Tuple:
353356
return self._shape
354357

355358
@property
356-
def mappings(self) -> List[Dict[str, List[Dict[str, str]]]]:
359+
def mappings(self) -> Dict[str, List[Dict[str, Any]]]:
357360
"""
358361
Return the symbology mappings for the data.
359362
360363
Returns
361364
-------
362-
List[Dict[str, List[Dict[str, str]]]]
365+
Dict[str, List[Dict[str, Any]]]
363366
364367
"""
365368
self._check_metadata()
@@ -369,7 +372,7 @@ def mappings(self) -> List[Dict[str, List[Dict[str, str]]]]:
369372
@property
370373
def symbology(self) -> Dict[str, Any]:
371374
"""
372-
Return the symbology resolution information for the query.
375+
Return the symbology resolution mappings for the data.
373376
374377
Returns
375378
-------
@@ -378,30 +381,18 @@ def symbology(self) -> Dict[str, Any]:
378381
"""
379382
self._check_metadata()
380383

381-
status = 0
382-
if self._metadata["partial"]:
383-
status = 1
384-
message = "Partially resolved"
385-
elif self._metadata["not_found"]:
386-
status = 2
387-
message = "Not found"
388-
else:
389-
message = "OK"
390-
391-
response: Dict[str, Any] = {
392-
"result": self.mappings,
384+
symbology: Dict[str, Any] = {
393385
"symbols": self.symbols,
394386
"stype_in": self.stype_in.value,
395387
"stype_out": self.stype_out.value,
396388
"start_date": str(self.start.date()),
397389
"end_date": str(self.end.date()),
398390
"partial": self._metadata["partial"],
399391
"not_found": self._metadata["not_found"],
400-
"message": message,
401-
"status": status,
392+
"mappings": self.mappings,
402393
}
403394

404-
return response
395+
return symbology
405396

406397
def to_ndarray(self) -> np.ndarray:
407398
"""
@@ -415,7 +406,12 @@ def to_ndarray(self) -> np.ndarray:
415406
data: bytes = self.reader(decompress=True).read()
416407
return np.frombuffer(data, dtype=DBZ_STRUCT_MAP[self.schema])
417408

418-
def to_df(self, pretty_ts: bool = False, pretty_px: bool = False) -> pd.DataFrame:
409+
def to_df(
410+
self,
411+
pretty_ts: bool = False,
412+
pretty_px: bool = False,
413+
map_symbols: bool = False,
414+
) -> pd.DataFrame:
419415
"""
420416
Return the data as a `pd.DataFrame`.
421417
@@ -427,6 +423,10 @@ def to_df(self, pretty_ts: bool = False, pretty_px: bool = False) -> pd.DataFram
427423
pretty_px : bool, default False
428424
If all price columns should be converted from `int` to `float` at
429425
the correct scale (using the fixed precision scalar 1e-9).
426+
map_symbols : bool, default False
427+
If symbology mappings from the metadata should be used to create
428+
a 'symbol' column, mapping the product ID to its native symbol for
429+
every record.
430430
431431
Returns
432432
-------
@@ -467,6 +467,20 @@ def to_df(self, pretty_ts: bool = False, pretty_px: bool = False) -> pd.DataFram
467467
):
468468
df[column] = df[column] * 1e-9
469469

470+
if map_symbols:
471+
# Build product ID index
472+
if not self._product_id_index:
473+
self._product_id_index = self._build_product_id_index()
474+
475+
# Map product IDs to native symbols
476+
if self._product_id_index:
477+
df_index = df.index if pretty_ts else pd.to_datetime(df.index, utc=True)
478+
dates = [ts.date() for ts in df_index]
479+
df["symbol"] = [
480+
self._product_id_index[dates[i]][p]
481+
for i, p in enumerate(df["product_id"])
482+
]
483+
470484
return df
471485

472486
def replay(self, callback: Callable[[Any], None]) -> None:
@@ -643,6 +657,37 @@ def request_full_definitions(
643657
path=path,
644658
)
645659

660+
def _build_product_id_index(self) -> Dict[dt.date, Dict[int, str]]:
    """
    Build a per-date lookup from product ID to native symbol.

    Every row of `self.mappings` with a non-empty ``symbol`` is turned into
    a `ProductIdMappingInterval`. Each interval is then expanded into one
    entry per day from its start date up to, but not including, its end
    date.

    Returns
    -------
    Dict[dt.date, Dict[int, str]]
        Keyed by date; each value maps the product IDs seen on that date
        to their native symbol. If two intervals cover the same date and
        product ID, the one processed last wins.

    """
    intervals: List[ProductIdMappingInterval] = []
    for native, rows in self.mappings.items():
        for row in rows:
            symbol = row["symbol"]
            if symbol == "":
                # Rows with an empty symbol carry no product ID to index
                continue
            intervals.append(
                ProductIdMappingInterval(
                    start_date=row["start_date"],
                    end_date=row["end_date"],
                    native=native,
                    product_id=int(symbol),
                )
            )

    product_id_index: Dict[dt.date, Dict[int, str]] = {}
    for interval in intervals:
        # inclusive="left" keeps the start date and drops the end date
        for ts in pd.date_range(
            start=interval.start_date,
            end=interval.end_date,
            inclusive="left",
        ):
            date_map = product_id_index.setdefault(ts.date(), {})
            date_map[interval.product_id] = interval.native

    return product_id_index
690+
646691

647692
class MemoryBento(Bento):
648693
"""

databento/common/metadata.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ def enum_value(fn):
3737
"stype_in": enum_value(int_to_stype),
3838
"stype_out": enum_value(int_to_stype),
3939
}
40+
4041
for key, conv_fn in conversion_mapping.items():
4142
metadata[key] = conv_fn(metadata[key])
43+
4244
return metadata

databento/common/symbology.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import datetime as dt
2+
3+
4+
class ProductIdMappingInterval:
    """
    A mapping between one product ID and one native symbol, together with
    the start and end dates of the interval it is recorded for.

    Parameters
    ----------
    start_date : dt.date
        The start of the mapping period.
    end_date : dt.date
        The end of the mapping period.
    native : str
        The native symbol value.
    product_id : int
        The product ID value.
    """

    def __init__(
        self,
        start_date: dt.date,
        end_date: dt.date,
        native: str,
        product_id: int,
    ):
        # Date range of the interval
        self.start_date, self.end_date = start_date, end_date
        # The two identifiers being linked
        self.native, self.product_id = native, product_id

    def __repr__(self):
        # Dates are rendered with str(); only the native symbol is quoted
        fields = ", ".join(
            (
                f"start_date={self.start_date}",
                f"end_date={self.end_date}",
                f"native='{self.native}'",
                f"product_id={self.product_id}",
            )
        )
        return f"{type(self).__name__}({fields})"

examples/historical_timeseries_to_df.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@
2020
)
2121

2222
# Convert to pandas dataframe
23-
pprint(data.to_df())
23+
pprint(data.to_df(map_symbols=True))

0 commit comments

Comments
 (0)