ADD: Add pretty_px, pretty_ts and split_symbols customizations

cjdsellers · cjdsellers · commit e4135a921925 · 2023-09-20T00:08:59.000Z
diff --git a/databento/common/dbnstore.py b/databento/common/dbnstore.py
@@ -113,14 +113,14 @@ def format_dataframe(
             if column in df.columns:
                 df[column] = df[column].where(df[column] != type_max, np.nan)
 
-    if pretty_ts:
-        for ts_field in struct._timestamp_fields:
-            df[ts_field] = pd.to_datetime(df[ts_field], errors="coerce", utc=True)
-
     if pretty_px:
         for px_field in struct._price_fields:
             df[px_field] = df[px_field].replace(INT64_NULL, np.nan) / FIXED_PRICE_SCALE
 
+    if pretty_ts:
+        for ts_field in struct._timestamp_fields:
+            df[ts_field] = pd.to_datetime(df[ts_field], errors="coerce", utc=True)
+
     for column, dtype in SCHEMA_DTYPES_MAP[schema]:
         if dtype.startswith("S") and column not in struct._hidden_fields:
             df[column] = df[column].str.decode("utf-8")
@@ -826,8 +826,8 @@ def request_symbology(self, client: Historical) -> dict[str, Any]:
     def to_csv(
         self,
         path: Path | str,
-        pretty_ts: bool = True,
         pretty_px: bool = True,
+        pretty_ts: bool = True,
         map_symbols: bool = True,
         schema: Schema | str | None = None,
     ) -> None:
@@ -838,13 +838,13 @@ def to_csv(
         ----------
         path : Path or str
             The file path to write to.
-        pretty_ts : bool, default True
-            If all timestamp columns should be converted from UNIX nanosecond
-            `int` to `pd.Timestamp` tz-aware (UTC).
         pretty_px : bool, default True
             If all price columns should be converted from `int` to `float` at
-            the correct scale (using the fixed precision scalar 1e-9). Null
+            the correct scale (using the fixed-precision scalar 1e-9). Null
             prices are replaced with an empty string.
+        pretty_ts : bool, default True
+            If all timestamp columns should be converted from UNIX nanosecond
+            `int` to tz-aware UTC `pd.Timestamp`.
         map_symbols : bool, default True
             If symbology mappings from the metadata should be used to create
             a 'symbol' column, mapping the instrument ID to its requested symbol for
@@ -864,8 +864,8 @@ def to_csv(
 
         """
         df_iter = self.to_df(
-            pretty_ts=pretty_ts,
             pretty_px=pretty_px,
+            pretty_ts=pretty_ts,
             map_symbols=map_symbols,
             schema=schema,
             count=2**16,
@@ -881,8 +881,8 @@ def to_csv(
     @overload
     def to_df(
         self,
-        pretty_ts: bool = ...,
         pretty_px: bool = ...,
+        pretty_ts: bool = ...,
         map_symbols: bool = ...,
         schema: Schema | str | None = ...,
         count: None = ...,
@@ -892,8 +892,8 @@ def to_df(
     @overload
     def to_df(
         self,
-        pretty_ts: bool = ...,
         pretty_px: bool = ...,
+        pretty_ts: bool = ...,
         map_symbols: bool = ...,
         schema: Schema | str | None = ...,
         count: int = ...,
@@ -902,8 +902,8 @@ def to_df(
 
     def to_df(
         self,
-        pretty_ts: bool = True,
         pretty_px: bool = True,
+        pretty_ts: bool = True,
         map_symbols: bool = True,
         schema: Schema | str | None = None,
         count: int | None = None,
@@ -913,13 +913,13 @@ def to_df(
 
         Parameters
         ----------
-        pretty_ts : bool, default True
-            If all timestamp columns should be converted from UNIX nanosecond
-            `int` to `pd.Timestamp` tz-aware (UTC).
         pretty_px : bool, default True
             If all price columns should be converted from `int` to `float` at
-            the correct scale (using the fixed precision scalar 1e-9). Null
+            the correct scale (using the fixed-precision scalar 1e-9). Null
             prices are replaced with NaN.
+        pretty_ts : bool, default True
+            If all timestamp columns should be converted from UNIX nanosecond
+            `int` to tz-aware UTC `pd.Timestamp`.
         map_symbols : bool, default True
             If symbology mappings from the metadata should be used to create
             a 'symbol' column, mapping the instrument ID to its requested symbol for
@@ -1000,8 +1000,8 @@ def to_file(self, path: Path | str) -> None:
     def to_json(
         self,
         path: Path | str,
-        pretty_ts: bool = True,
         pretty_px: bool = True,
+        pretty_ts: bool = True,
         map_symbols: bool = True,
         schema: Schema | str | None = None,
     ) -> None:
@@ -1012,12 +1012,12 @@ def to_json(
         ----------
         path : Path or str
             The file path to write to.
-        pretty_ts : bool, default True
-            If all timestamp columns should be converted from UNIX nanosecond
-            `int` to `pd.Timestamp` tz-aware (UTC).
         pretty_px : bool, default True
             If all price columns should be converted from `int` to `float` at
-            the correct scale (using the fixed precision scalar 1e-9).
+            the correct scale (using the fixed-precision scalar 1e-9).
+        pretty_ts : bool, default True
+            If all timestamp columns should be converted from UNIX nanosecond
+            `int` to tz-aware UTC `pd.Timestamp`.
         map_symbols : bool, default True
             If symbology mappings from the metadata should be used to create
             a 'symbol' column, mapping the instrument ID to its requested symbol for
@@ -1037,8 +1037,8 @@ def to_json(
 
         """
         df_iter = self.to_df(
-            pretty_ts=pretty_ts,
             pretty_px=pretty_px,
+            pretty_ts=pretty_ts,
             map_symbols=map_symbols,
             schema=schema,
             count=2**16,
diff --git a/databento/historical/api/batch.py b/databento/historical/api/batch.py
@@ -54,7 +54,10 @@ def submit_job(
         end: pd.Timestamp | date | str | int | None = None,
         encoding: Encoding | str = "dbn",
         compression: Compression | str = "zstd",
+        pretty_px: bool = False,
+        pretty_ts: bool = False,
         map_symbols: bool = False,
+        split_symbols: bool = False,
         split_duration: SplitDuration | str = "day",
         split_size: int | None = None,
         packaging: Packaging | str | None = None,
@@ -92,9 +95,17 @@ def submit_job(
             The data encoding.
         compression : Compression or str {'none', 'zstd'}, default 'zstd'
             The data compression format (if any).
+        pretty_px : bool, default False
+            If prices should be formatted to the correct scale (using the fixed-precision scalar 1e-9).
+            Only applicable for 'csv' or 'json' encodings.
+        pretty_ts : bool, default False
+            If timestamps should be formatted as ISO 8601 strings.
+            Only applicable for 'csv' or 'json' encodings.
         map_symbols : bool, default False
-            If the raw symbol should be appended to every text encoded record.
-            Must be requested with either 'csv' or 'json' encoding.
+            If the requested symbol should be appended to every text-encoded record.
+            Only applicable for 'csv' or 'json' encodings.
+        split_symbols : bool, default False
+            If files should be split by raw symbol. Cannot be requested with `'ALL_SYMBOLS'`.
         split_duration : SplitDuration or str {'day', 'week', 'month', 'none'}, default 'day'
             The maximum time duration before batched data is split into multiple files.
             A week starts on Sunday UTC.
@@ -135,7 +146,10 @@ def submit_job(
             "compression": str(validate_enum(compression, Compression, "compression"))
             if compression
             else None,
+            "pretty_px": pretty_px,
+            "pretty_ts": pretty_ts,
             "map_symbols": map_symbols,
+            "split_symbols": split_symbols,
             "split_duration": str(
                 validate_enum(split_duration, SplitDuration, "split_duration"),
             ),
diff --git a/tests/test_historical_batch.py b/tests/test_historical_batch.py
@@ -93,7 +93,10 @@ def test_batch_submit_job_sends_expected_request(
         "stype_out": "instrument_id",
         "encoding": "csv",
         "compression": "zstd",
+        "pretty_px": False,
+        "pretty_ts": False,
         "map_symbols": False,
+        "split_symbols": False,
         "split_duration": "day",
         "packaging": "none",
         "delivery": "download",