Skip to content

Commit 2f0272c

Browse files
ENH (string dtype): convert string_view columns to future string dtype instead of object dtype in Parquet IO
1 parent f9d2e50 commit 2f0272c

File tree

2 files changed

+28
-2
lines changed

2 files changed

+28
-2
lines changed

pandas/io/_util.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
import numpy as np
66

7+
from pandas.compat import pa_version_under18p0
78
from pandas.compat._optional import import_optional_dependency
89

910
import pandas as pd
@@ -35,7 +36,11 @@ def _arrow_dtype_mapping() -> dict:
3536
def arrow_string_types_mapper() -> Callable:
3637
pa = import_optional_dependency("pyarrow")
3738

38-
return {
39+
mapping = {
3940
pa.string(): pd.StringDtype(na_value=np.nan),
4041
pa.large_string(): pd.StringDtype(na_value=np.nan),
41-
}.get
42+
}
43+
if not pa_version_under18p0:
44+
mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)
45+
46+
return mapping.get

pandas/tests/io/test_parquet.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
pa_version_under13p0,
1818
pa_version_under15p0,
1919
pa_version_under17p0,
20+
pa_version_under18p0,
2021
)
2122

2223
import pandas as pd
@@ -1144,6 +1145,26 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
11441145
)
11451146
tm.assert_frame_equal(result, expected)
11461147

1148+
@pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
1149+
def test_infer_string_string_view_type(self, tmp_path, pa):
1150+
# GH#54798
1151+
import pyarrow as pa
1152+
import pyarrow.parquet as pq
1153+
1154+
path = tmp_path / "string_view.parquet"
1155+
1156+
table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())})
1157+
pq.write_table(table, path)
1158+
1159+
with pd.option_context("future.infer_string", True):
1160+
result = read_parquet(path)
1161+
expected = pd.DataFrame(
1162+
data={"a": [None, "b", "c"]},
1163+
dtype=pd.StringDtype(na_value=np.nan),
1164+
columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
1165+
)
1166+
tm.assert_frame_equal(result, expected)
1167+
11471168
# NOTE: this test is not run by default, because it requires a lot of memory (>5GB)
11481169
# @pytest.mark.slow
11491170
# def test_string_column_above_2GB(self, tmp_path, pa):

0 commit comments

Comments (0)