Skip to content

Commit 348dc41

Browse files
authored
Merge pull request #131 from rstudio/feat-feather-parquet
feat: add support for parquet and arrow storage
2 parents f5cd3a3 + 40e1ee6 commit 348dc41

File tree

5 files changed

+56
-18
lines changed

5 files changed

+56
-18
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,7 @@ jobs:
7171
run: |
7272
python -m pip install --upgrade pip
7373
python -m pip install -r requirements/dev.txt
74-
python -m pip install -e .
74+
python -m pip install -e .[test]
7575
- name: run RStudio Connect
7676
run: |
7777
docker-compose up --build -d

pins/constructors.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -255,6 +255,8 @@ def board_github(
255255
Examples
256256
--------
257257
258+
>>> import pytest; pytest.skip()
259+
258260
>>> import os
259261
>>> board = board_github("machow", "pins-python", "pins/tests/pins-compat")
260262
>>> board.pin_list()

pins/drivers.py

Lines changed: 39 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,15 @@
1414
REQUIRES_SINGLE_FILE = frozenset(["csv", "joblib", "file"])
1515

1616

17+
def _assert_is_pandas_df(x, file_type="CSV"):
    """Raise unless ``x`` is a ``pandas.DataFrame``.

    Parameters
    ----------
    x:
        The object that is about to be written out.
    file_type:
        Name of the target format, used only to build the error message.
        Defaults to ``"CSV"`` so that one-argument calls keep producing the
        original message; callers saving to another format (e.g. feather or
        parquet) can pass its name to get an accurate message.

    Raises
    ------
    NotImplementedError
        If ``x`` is not a ``pandas.DataFrame``.
    """
    # pandas is imported lazily so that importing this module does not
    # require pandas to be loaded up front.
    import pandas as pd

    if not isinstance(x, pd.DataFrame):
        raise NotImplementedError(
            f"Currently only pandas.DataFrame can be saved to a {file_type}."
        )
24+
25+
1726
def load_data(
1827
meta: Meta,
1928
fs,
@@ -67,6 +76,16 @@ def load_data(
6776

6877
return pd.read_csv(fs.open(path_to_file))
6978

79+
elif meta.type == "feather":
80+
import pandas as pd
81+
82+
return pd.read_feather(fs.open(path_to_file))
83+
84+
elif meta.type == "parquet":
85+
import pandas as pd
86+
87+
return pd.read_parquet(fs.open(path_to_file))
88+
7089
elif meta.type == "table":
7190
import pandas as pd
7291

@@ -93,28 +112,35 @@ def save_data(
93112
# TODO: would be useful to have singledispatch func for a "default saver"
94113
# as argument to board, and then type dispatchers for explicit cases
95114
# of saving / loading objects different ways.
115+
116+
if apply_suffix:
117+
final_name = f"{fname}.{type}"
118+
else:
119+
final_name = fname
120+
96121
if type == "csv":
97-
import pandas as pd
122+
_assert_is_pandas_df(obj)
98123

99-
if apply_suffix:
100-
fname = f"{fname}.{type}"
124+
obj.to_csv(final_name, index=False)
125+
126+
elif type == "feather":
127+
_assert_is_pandas_df(obj)
128+
129+
obj.to_feather(final_name)
130+
131+
elif type == "parquet":
132+
_assert_is_pandas_df(obj)
133+
134+
obj.to_parquet(final_name)
101135

102-
if not isinstance(obj, pd.DataFrame):
103-
raise NotImplementedError(
104-
"Currently only pandas.DataFrame can be saved to a CSV."
105-
)
106-
obj.to_csv(fname, index=False)
107136
elif type == "joblib":
108137
import joblib
109138

110-
if apply_suffix:
111-
fname = f"{fname}.{type}"
112-
113-
joblib.dump(obj, fname)
139+
joblib.dump(obj, final_name)
114140
else:
115141
raise NotImplementedError(f"Cannot save type: {type}")
116142

117-
return fname
143+
return final_name
118144

119145

120146
def default_title(obj, name):

pins/tests/test_drivers.py

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,15 @@ def test_default_title(obj, dst_title):
4545
assert res == dst_title
4646

4747

48-
def test_driver_roundtrip_csv(tmp_dir2):
48+
@pytest.mark.parametrize(
49+
"type_",
50+
[
51+
"csv",
52+
"feather",
53+
"parquet",
54+
],
55+
)
56+
def test_driver_roundtrip(tmp_dir2, type_):
4957
# TODO: I think this test highlights the challenge of getting the flow
5058
# between metadata, drivers, and the metafactory right.
5159
# There is the name of the data (relative to the pin directory), and the full
@@ -55,14 +63,14 @@ def test_driver_roundtrip_csv(tmp_dir2):
5563
df = pd.DataFrame({"x": [1, 2, 3]})
5664

5765
fname = "some_df"
58-
type_ = "csv"
66+
full_file = f"{fname}.{type_}"
5967

6068
p_obj = tmp_dir2 / fname
6169
res_fname = save_data(df, p_obj, type_)
6270

63-
assert Path(res_fname).name == f"{fname}.csv"
71+
assert Path(res_fname).name == full_file
6472

65-
meta = MetaRaw(f"{fname}.csv", type_, "my_pin")
73+
meta = MetaRaw(full_file, type_, "my_pin")
6674
obj = load_data(meta, fsspec.filesystem("file"), tmp_dir2)
6775

6876
assert df.equals(obj)

setup.cfg

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,8 @@ test =
5353
pytest-dotenv
5454
s3fs
5555
adlfs
56+
fastparquet
57+
pyarrow
5658

5759

5860
[bdist_wheel]

0 commit comments

Comments
 (0)