Commit 90e9a64

MOD: Improve DBNStore file writing
1 parent 19b852d

2 files changed: +37 −35 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -4,6 +4,7 @@
 
 #### Enhancements
 - Added `count` parameter to `DBNStore.to_df` and `DBNStore.to_ndarray` to help process large files incrementally
+- Improved memory usage of `DBNStore.to_csv` and `DBNStore.to_json`
 - Added the `Publisher`, `Venue`, and `Dataset` enums
 - Replace null prices with `NaN` when `pretty_px=True` in `DBNStore.to_df()`
 - Upgraded `databento-dbn` to 0.8.3
```
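
For context, the `count` parameter mentioned above turns `DBNStore.to_df` into a chunked iterator. A rough usage sketch, assuming a DBN file on disk (the filename is hypothetical) and the iterator behaviour implied by the overloads in the diff below:

```python
from databento import DBNStore

# Hypothetical input file; any DBN/DBN.ZST file written by the client would do.
store = DBNStore.from_file("glbx-mdp3-20230614.trades.dbn.zst")

total_rows = 0
for frame in store.to_df(count=2**16):  # each DataFrame holds at most 65,536 records
    total_rows += len(frame)

print(f"processed {total_rows} records without building one giant DataFrame")
```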

databento/common/dbnstore.py

Lines changed: 36 additions & 35 deletions
```diff
@@ -863,39 +863,40 @@ def to_csv(
         Requires all the data to be brought up into memory to then be written.
 
         """
-        self.to_df(
+        df_iter = self.to_df(
             pretty_ts=pretty_ts,
             pretty_px=pretty_px,
             map_symbols=map_symbols,
             schema=schema,
-        ).to_csv(path)
+            count=2**16,
+        )
 
-    @overload
-    def to_df(
-        self,
-        pretty_ts: bool = True,
-        pretty_px: bool = True,
-        map_symbols: bool = True,
-        schema: Schema | str | None = None,
-        count: None = None,
-    ) -> pd.DataFrame:
-        ...
+        with open(path, "x") as csv_file:
+            for i, frame in enumerate(df_iter):
+                frame.to_csv(
+                    csv_file,
+                    header=(i == 0),
+                )
 
     @overload
     def to_df(
         self,
-        *,
-        schema: Schema | str | None,
-        count: int,
-    ) -> DataFrameIterator:
+        pretty_ts: bool = ...,
+        pretty_px: bool = ...,
+        map_symbols: bool = ...,
+        schema: Schema | str | None = ...,
+        count: None = ...,
+    ) -> pd.DataFrame:
         ...
 
-    # Required to handle default schema but set count.
     @overload
     def to_df(
         self,
-        *,
-        count: int,
+        pretty_ts: bool = ...,
+        pretty_px: bool = ...,
+        map_symbols: bool = ...,
+        schema: Schema | str | None = ...,
+        count: int = ...,
     ) -> DataFrameIterator:
         ...
```
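
The rewritten `to_csv` above streams each chunk into a single open file handle rather than materializing the whole DataFrame first. A minimal standalone sketch of the same pandas pattern, with a toy generator standing in for the `DataFrameIterator`:

```python
import pandas as pd

# Toy stand-in for to_df(count=...); each chunk is a small DataFrame.
chunks = (pd.DataFrame({"price": [i, i + 1.0]}) for i in range(0, 6, 2))

# Mode "x" matches the diff: fail rather than overwrite an existing file.
with open("example.csv", "x") as csv_file:
    for i, frame in enumerate(chunks):
        # Write the header only for the first chunk so column names
        # do not repeat in the middle of the file.
        frame.to_csv(csv_file, header=(i == 0))
```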

```diff
@@ -1035,35 +1036,35 @@ def to_json(
         Requires all the data to be brought up into memory to then be written.
 
         """
-        self.to_df(
+        df_iter = self.to_df(
            pretty_ts=pretty_ts,
            pretty_px=pretty_px,
            map_symbols=map_symbols,
            schema=schema,
-        ).to_json(path, orient="records", lines=True)
+            count=2**16,
+        )
 
-    @overload
-    def to_ndarray(
-        self,
-        schema: Schema | str | None = None,
-        count: None = None,
-    ) -> np.ndarray[Any, Any]:
-        ...
+        with open(path, "x") as json_path:
+            for frame in df_iter:
+                frame.to_json(
+                    json_path,
+                    orient="records",
+                    lines=True,
+                )
 
     @overload
-    def to_ndarray(
+    def to_ndarray(  # type: ignore [misc]
         self,
-        schema: Schema | str | None,
-        count: int,
-    ) -> NDArrayIterator:
+        schema: Schema | str | None = ...,
+        count: None = ...,
+    ) -> np.ndarray[Any, Any]:
         ...
 
-    # Required to handle default schema but set count.
     @overload
     def to_ndarray(
         self,
-        *,
-        count: int,
+        schema: Schema | str | None = ...,
+        count: int = ...,
     ) -> NDArrayIterator:
         ...
```
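
`to_json` gets the analogous treatment: each chunk is appended to the open file as newline-delimited JSON records, so memory stays bounded by the chunk size. A standalone sketch of that pattern, again with a toy generator in place of the real iterator:

```python
import pandas as pd

# Toy stand-in for to_df(count=...).
chunks = (pd.DataFrame({"price": [i, i + 1.0]}) for i in range(0, 6, 2))

with open("example.json", "x") as json_file:
    for frame in chunks:
        # One JSON object per record, one record per line.
        frame.to_json(json_file, orient="records", lines=True)
```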
