
Commit 8d4815f

DOC: Improve and document to_parquet kwargs
1 parent faa1e89 commit 8d4815f

3 files changed: +40 -5 lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -2,6 +2,9 @@
 
 ## 0.43.1 - TBD
 
+#### Enhancements
+- Keyword arguments to `DBNStore.to_parquet` will now allow `where` and `schema` to be specified
+
 #### Bug fixes
 - Fixed an issue where validating the checksum of a batch file loaded the entire file into memory
 
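For context, the forwarded keyword arguments land on `pyarrow.parquet.ParquetWriter`, so standard writer options can be supplied alongside the overridable `where` and `schema`. A minimal usage sketch (the input file name is hypothetical; `compression` is a standard `ParquetWriter` option, exercised by the new test below):

import databento as db

# Hypothetical input file; any DBNStore source works the same way.
store = db.DBNStore.from_file("glbx-mdp3.mbo.dbn.zst")

# Extra keyword arguments are forwarded to pyarrow.parquet.ParquetWriter.
store.to_parquet(
    "output.parquet",
    compression="zstd",
)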

databento/common/dbnstore.py

Lines changed: 10 additions & 5 deletions
@@ -963,7 +963,7 @@ def to_df(
     def to_parquet(
         self,
         path: PathLike[str] | str,
-        price_type: Literal["fixed", "float"] = "float",
+        price_type: PriceType | str = PriceType.FLOAT,
         pretty_ts: bool = True,
         map_symbols: bool = True,
         schema: Schema | str | None = None,
@@ -992,6 +992,9 @@ def to_parquet(
             This is only required when reading a DBN stream with mixed record types.
         mode : str, default "w"
             The file write mode to use, either "x" or "w".
+        **kwargs : Any
+            Keyword arguments to pass to the `pyarrow.parquet.ParquetWriter`.
+            These can be used to override the default behavior of the writer.
 
         Raises
         ------
@@ -1000,10 +1003,12 @@ def to_parquet(
             If the DBN schema is unspecified and cannot be determined.
 
         """
-        if price_type == "decimal":
+        file_path = validate_file_write_path(path, "path", exist_ok=mode == "w")
+        price_type = validate_enum(price_type, PriceType, "price_type")
+
+        if price_type == PriceType.DECIMAL:
             raise ValueError("the 'decimal' price type is not currently supported")
 
-        file_path = validate_file_write_path(path, "path", exist_ok=mode == "w")
         schema = validate_maybe_enum(schema, Schema, "schema")
         if schema is None:
             if self.schema is None:
@@ -1025,8 +1030,8 @@ def to_parquet(
         # Initialize the writer using the first DataFrame
         parquet_schema = pa.Schema.from_pandas(frame)
         writer = pq.ParquetWriter(
-            where=file_path,
-            schema=parquet_schema,
+            where=kwargs.pop("where", file_path),
+            schema=kwargs.pop("schema", parquet_schema),
             **kwargs,
         )
         writer.write_table(
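The `kwargs.pop(...)` calls are what make `where` and `schema` overridable: a caller-supplied value wins, otherwise the computed default is used, and whatever remains in `kwargs` passes through to the writer untouched. A standalone sketch of the same pattern (this helper is illustrative, not part of the library):

import pyarrow as pa
import pyarrow.parquet as pq

def write_frames(frames, default_path, **kwargs):
    # Caller-supplied "where"/"schema" take precedence over computed defaults.
    writer = pq.ParquetWriter(
        where=kwargs.pop("where", default_path),
        schema=kwargs.pop("schema", pa.Schema.from_pandas(frames[0])),
        **kwargs,  # remaining options (e.g. compression) pass through as-is
    )
    try:
        for frame in frames:
            writer.write_table(pa.Table.from_pandas(frame))
    finally:
        writer.close()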

tests/test_historical_bento.py

Lines changed: 27 additions & 0 deletions
@@ -731,6 +731,33 @@ def test_to_parquet(
     pd.testing.assert_frame_equal(actual, expected)
 
 
+def test_to_parquet_kwargs(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+    test_data: Callable[[Dataset, Schema], bytes],
+) -> None:
+    # Arrange
+    monkeypatch.setattr(databento.common.dbnstore, "PARQUET_CHUNK_SIZE", 1)
+    stub_data = test_data(Dataset.GLBX_MDP3, Schema.MBO)
+    data = DBNStore.from_bytes(data=stub_data)
+    parquet_file = tmp_path / "test.parquet"
+
+    # Act
+    expected = data.to_df()
+    data.to_parquet(
+        parquet_file,
+        compression="zstd",
+        write_statistics="false",
+    )
+    actual = pd.read_parquet(parquet_file)
+
+    # Replace None values with np.nan
+    actual.fillna(value=np.nan)
+
+    # Assert
+    pd.testing.assert_frame_equal(actual, expected)
+
+
 @pytest.mark.parametrize(
     "expected_schema",
     [pytest.param(schema, id=str(schema)) for schema in Schema.variants()],
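The test asserts round-trip equality of the data; one way to also confirm that the forwarded writer options took effect is to inspect the file's metadata after writing. A sketch under the same setup (the expected codec string assumes `compression="zstd"` was forwarded):

import pyarrow.parquet as pq

metadata = pq.ParquetFile("test.parquet").metadata
# Each column chunk records its codec; expected "ZSTD" here.
print(metadata.row_group(0).column(0).compression)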
