Commit 90e9a64

MOD: Improve DBNStore file writing
1 parent 19b852d

2 files changed: +37 −35 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -4,6 +4,7 @@
 
 #### Enhancements
 - Added `count` parameter to `DBNStore.to_df` and `DBNStore.to_ndarray` to help process large files incrementally
+- Improved memory usage of `DBNStore.to_csv` and `DBNStore.to_json`
 - Added the `Publisher`, `Venue`, and `Dataset` enums
 - Replace null prices with `NaN` when `pretty_px=True` in `DBNStore.to_df()`
 - Upgraded `databento-dbn` to 0.8.3
```
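
For context, the `count` parameter mentioned above turns `DBNStore.to_df` into a chunked iterator. A rough usage sketch, assuming a DBN file on disk (the filename is hypothetical) and the iterator behaviour implied by the overloads in the diff below:

```python
from databento import DBNStore

# Hypothetical input file; any DBN/DBN.ZST file written by the client would do.
store = DBNStore.from_file("glbx-mdp3-20230614.trades.dbn.zst")

total_rows = 0
for frame in store.to_df(count=2**16):  # each DataFrame holds at most 65,536 records
    total_rows += len(frame)

print(f"processed {total_rows} records without building one giant DataFrame")
```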

databento/common/dbnstore.py

Lines changed: 36 additions & 35 deletions
```diff
@@ -863,39 +863,40 @@ def to_csv(
         Requires all the data to be brought up into memory to then be written.
 
         """
-        self.to_df(
+        df_iter = self.to_df(
             pretty_ts=pretty_ts,
             pretty_px=pretty_px,
             map_symbols=map_symbols,
             schema=schema,
-        ).to_csv(path)
+            count=2**16,
+        )
 
-    @overload
-    def to_df(
-        self,
-        pretty_ts: bool = True,
-        pretty_px: bool = True,
-        map_symbols: bool = True,
-        schema: Schema | str | None = None,
-        count: None = None,
-    ) -> pd.DataFrame:
-        ...
+        with open(path, "x") as csv_file:
+            for i, frame in enumerate(df_iter):
+                frame.to_csv(
+                    csv_file,
+                    header=(i == 0),
+                )
 
     @overload
     def to_df(
         self,
-        *,
-        schema: Schema | str | None,
-        count: int,
-    ) -> DataFrameIterator:
+        pretty_ts: bool = ...,
+        pretty_px: bool = ...,
+        map_symbols: bool = ...,
+        schema: Schema | str | None = ...,
+        count: None = ...,
+    ) -> pd.DataFrame:
         ...
 
-    # Required to handle default schema but set count.
     @overload
     def to_df(
         self,
-        *,
-        count: int,
+        pretty_ts: bool = ...,
+        pretty_px: bool = ...,
+        map_symbols: bool = ...,
+        schema: Schema | str | None = ...,
+        count: int = ...,
     ) -> DataFrameIterator:
         ...
```
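
The rewritten `to_csv` above streams each chunk into a single open file handle rather than materializing the whole DataFrame first. A minimal standalone sketch of the same pandas pattern, with a toy generator standing in for the `DataFrameIterator`:

```python
import pandas as pd

# Toy stand-in for to_df(count=...); each chunk is a small DataFrame.
chunks = (pd.DataFrame({"price": [i, i + 1.0]}) for i in range(0, 6, 2))

# Mode "x" matches the diff: fail rather than overwrite an existing file.
with open("example.csv", "x") as csv_file:
    for i, frame in enumerate(chunks):
        # Write the header only for the first chunk so column names
        # do not repeat in the middle of the file.
        frame.to_csv(csv_file, header=(i == 0))
```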

```diff
@@ -1035,35 +1036,35 @@ def to_json(
         Requires all the data to be brought up into memory to then be written.
 
         """
-        self.to_df(
+        df_iter = self.to_df(
            pretty_ts=pretty_ts,
            pretty_px=pretty_px,
            map_symbols=map_symbols,
            schema=schema,
-        ).to_json(path, orient="records", lines=True)
+            count=2**16,
+        )
 
-    @overload
-    def to_ndarray(
-        self,
-        schema: Schema | str | None = None,
-        count: None = None,
-    ) -> np.ndarray[Any, Any]:
-        ...
+        with open(path, "x") as json_path:
+            for frame in df_iter:
+                frame.to_json(
+                    json_path,
+                    orient="records",
+                    lines=True,
+                )
 
     @overload
-    def to_ndarray(
+    def to_ndarray(  # type: ignore [misc]
         self,
-        schema: Schema | str | None,
-        count: int,
-    ) -> NDArrayIterator:
+        schema: Schema | str | None = ...,
+        count: None = ...,
+    ) -> np.ndarray[Any, Any]:
         ...
 
-    # Required to handle default schema but set count.
     @overload
     def to_ndarray(
         self,
-        *,
-        count: int,
+        schema: Schema | str | None = ...,
+        count: int = ...,
     ) -> NDArrayIterator:
         ...
```
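
`to_json` gets the analogous treatment: each chunk is appended to the open file as newline-delimited JSON records, so memory stays bounded by the chunk size. A standalone sketch of that pattern, again with a toy generator in place of the real iterator:

```python
import pandas as pd

# Toy stand-in for to_df(count=...).
chunks = (pd.DataFrame({"price": [i, i + 1.0]}) for i in range(0, 6, 2))

with open("example.json", "x") as json_file:
    for frame in chunks:
        # One JSON object per record, one record per line.
        frame.to_json(json_file, orient="records", lines=True)
```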
