Skip to content

Commit 3e5f40a

Browse files
authored
ENH: raise specific error when trying to read non-UTF-8 file with use_arrow (#490)
1 parent f1b6994 commit 3e5f40a

File tree

3 files changed

+62
-16
lines changed

3 files changed

+62
-16
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
### Improvements
2626

2727
- Add support to read, write, list, and remove `/vsimem/` files (#457).
28+
- Raise specific error when trying to read non-UTF-8 file with
29+
`use_arrow=True` (#490).
2830

2931
### Bug fixes
3032

pyogrio/geopandas.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,15 @@ def read_dataframe(
319319
# }.get
320320
if arrow_to_pandas_kwargs is not None:
321321
kwargs.update(arrow_to_pandas_kwargs)
322-
df = table.to_pandas(**kwargs)
322+
323+
try:
324+
df = table.to_pandas(**kwargs)
325+
except UnicodeDecodeError as ex:
326+
# Arrow does not support reading data in a non-UTF-8 encoding
327+
raise DataSourceError(
328+
"The file being read is not encoded in UTF-8; please use_arrow=False"
329+
) from ex
330+
323331
del table
324332

325333
if fid_as_index:

pyogrio/tests/test_geopandas_io.py

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,20 @@ def spatialite_available(path):
103103
return False
104104

105105

106-
@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
107-
def test_read_csv_encoding(tmp_path, encoding):
106+
@pytest.mark.parametrize(
107+
"encoding, arrow",
108+
[
109+
("utf-8", False),
110+
pytest.param("utf-8", True, marks=requires_pyarrow_api),
111+
("cp1252", False),
112+
(None, False),
113+
],
114+
)
115+
def test_read_csv_encoding(tmp_path, encoding, arrow):
116+
""" "Test reading CSV files with different encodings.
117+
118+
Arrow only supports utf-8 encoding.
119+
"""
108120
# Write csv test file. Depending on the os this will be written in a different
109121
# encoding: for linux and macos this is utf-8, for windows it is cp1252.
110122
csv_path = tmp_path / "test.csv"
@@ -115,7 +127,7 @@ def test_read_csv_encoding(tmp_path, encoding):
115127
# Read csv. The data should be read with the same default encoding as the csv file
116128
# was written in, but should have been converted to utf-8 in the dataframe returned.
117129
# Hence, the asserts below, with strings in utf-8, be OK.
118-
df = read_dataframe(csv_path, encoding=encoding)
130+
df = read_dataframe(csv_path, encoding=encoding, use_arrow=arrow)
119131

120132
assert len(df) == 1
121133
assert df.columns.tolist() == ["näme", "city"]
@@ -127,19 +139,29 @@ def test_read_csv_encoding(tmp_path, encoding):
127139
locale.getpreferredencoding().upper() == "UTF-8",
128140
reason="test requires non-UTF-8 default platform",
129141
)
130-
def test_read_csv_platform_encoding(tmp_path):
131-
"""verify that read defaults to platform encoding; only works on Windows (CP1252)"""
142+
def test_read_csv_platform_encoding(tmp_path, use_arrow):
143+
"""Verify that read defaults to platform encoding; only works on Windows (CP1252).
144+
145+
When use_arrow=True, reading a non-UTF-8 file fails.
146+
"""
132147
csv_path = tmp_path / "test.csv"
133148
with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv:
134149
csv.write("näme,city\n")
135150
csv.write("Wilhelm Röntgen,Zürich\n")
136151

137-
df = read_dataframe(csv_path)
152+
if use_arrow:
153+
with pytest.raises(
154+
DataSourceError,
155+
match="; please use_arrow=False",
156+
):
157+
df = read_dataframe(csv_path, use_arrow=use_arrow)
158+
else:
159+
df = read_dataframe(csv_path, use_arrow=use_arrow)
138160

139-
assert len(df) == 1
140-
assert df.columns.tolist() == ["näme", "city"]
141-
assert df.city.tolist() == ["Zürich"]
142-
assert df.näme.tolist() == ["Wilhelm Röntgen"]
161+
assert len(df) == 1
162+
assert df.columns.tolist() == ["näme", "city"]
163+
assert df.city.tolist() == ["Zürich"]
164+
assert df.näme.tolist() == ["Wilhelm Röntgen"]
143165

144166

145167
def test_read_dataframe(naturalearth_lowres_all_ext):
@@ -983,9 +1005,20 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
9831005
assert df.iloc[0].geometry.area > area_canada
9841006

9851007

986-
@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
987-
def test_write_csv_encoding(tmp_path, encoding):
988-
"""Test if write_dataframe uses the default encoding correctly."""
1008+
@pytest.mark.parametrize(
1009+
"encoding, arrow",
1010+
[
1011+
("utf-8", False),
1012+
pytest.param("utf-8", True, marks=requires_arrow_write_api),
1013+
("cp1252", False),
1014+
(None, False),
1015+
],
1016+
)
1017+
def test_write_csv_encoding(tmp_path, encoding, arrow):
1018+
"""Test if write_dataframe uses the default encoding correctly.
1019+
1020+
Arrow only supports utf-8 encoding.
1021+
"""
9891022
# Write csv test file. Depending on the os this will be written in a different
9901023
# encoding: for linux and macos this is utf-8, for windows it is cp1252.
9911024
csv_path = tmp_path / "test.csv"
@@ -998,7 +1031,7 @@ def test_write_csv_encoding(tmp_path, encoding):
9981031
# same encoding as above.
9991032
df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
10001033
csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
1001-
write_dataframe(df, csv_pyogrio_path, encoding=encoding)
1034+
write_dataframe(df, csv_pyogrio_path, encoding=encoding, use_arrow=arrow)
10021035

10031036
# Check if the text files written both ways can be read again and give same result.
10041037
with open(csv_path, encoding=encoding) as csv:
@@ -2325,7 +2358,10 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow):
23252358

23262359
if use_arrow:
23272360
# pyarrow cannot decode column name with incorrect encoding
2328-
with pytest.raises(UnicodeDecodeError):
2361+
with pytest.raises(
2362+
DataSourceError,
2363+
match="The file being read is not encoded in UTF-8; please use_arrow=False",
2364+
):
23292365
read_dataframe(output_path, use_arrow=True)
23302366
else:
23312367
bad = read_dataframe(output_path, use_arrow=False)

0 commit comments

Comments
 (0)