@@ -103,8 +103,20 @@ def spatialite_available(path):
     return False
 
 
-@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
-def test_read_csv_encoding(tmp_path, encoding):
+@pytest.mark.parametrize(
+    "encoding, arrow",
+    [
+        ("utf-8", False),
+        pytest.param("utf-8", True, marks=requires_pyarrow_api),
+        ("cp1252", False),
+        (None, False),
+    ],
+)
+def test_read_csv_encoding(tmp_path, encoding, arrow):
+    """Test reading CSV files with different encodings.
+
+    Arrow only supports utf-8 encoding.
+    """
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
     csv_path = tmp_path / "test.csv"
@@ -115,7 +127,7 @@ def test_read_csv_encoding(tmp_path, encoding):
     # Read csv. The data should be read with the same default encoding as the csv file
     # was written in, but should have been converted to utf-8 in the dataframe returned.
     # Hence, the asserts below, with strings in utf-8, should be OK.
-    df = read_dataframe(csv_path, encoding=encoding)
+    df = read_dataframe(csv_path, encoding=encoding, use_arrow=arrow)
 
     assert len(df) == 1
     assert df.columns.tolist() == ["näme", "city"]
@@ -127,19 +139,29 @@ def test_read_csv_encoding(tmp_path, encoding):
     locale.getpreferredencoding().upper() == "UTF-8",
     reason="test requires non-UTF-8 default platform",
 )
-def test_read_csv_platform_encoding(tmp_path):
-    """verify that read defaults to platform encoding; only works on Windows (CP1252)"""
+def test_read_csv_platform_encoding(tmp_path, use_arrow):
+    """Verify that read defaults to platform encoding; only works on Windows (CP1252).
+
+    When use_arrow=True, reading a non-UTF-8 file fails.
+    """
     csv_path = tmp_path / "test.csv"
     with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv:
         csv.write("näme,city\n")
         csv.write("Wilhelm Röntgen,Zürich\n")
 
-    df = read_dataframe(csv_path)
+    if use_arrow:
+        with pytest.raises(
+            DataSourceError,
+            match="; please use_arrow=False",
+        ):
+            df = read_dataframe(csv_path, use_arrow=use_arrow)
+    else:
+        df = read_dataframe(csv_path, use_arrow=use_arrow)
 
-    assert len(df) == 1
-    assert df.columns.tolist() == ["näme", "city"]
-    assert df.city.tolist() == ["Zürich"]
-    assert df.näme.tolist() == ["Wilhelm Röntgen"]
+        assert len(df) == 1
+        assert df.columns.tolist() == ["näme", "city"]
+        assert df.city.tolist() == ["Zürich"]
+        assert df.näme.tolist() == ["Wilhelm Röntgen"]
 
 
 def test_read_dataframe(naturalearth_lowres_all_ext):
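
Outside pytest, a minimal sketch of the round-trip these parametrized tests pin down (file name and data are illustrative, not part of the suite): reading a cp1252 file with an explicit encoding yields UTF-8 strings in the returned dataframe, but only on the non-Arrow path, since Arrow accepts UTF-8 input only.

    from pyogrio import read_dataframe

    csv_path = "people.csv"  # hypothetical path, for illustration only
    with open(csv_path, "w", encoding="cp1252") as f:
        f.write("näme,city\n")
        f.write("Wilhelm Röntgen,Zürich\n")

    # Passing the source encoding lets GDAL re-encode to UTF-8 on read;
    # use_arrow must stay False here because Arrow only accepts UTF-8.
    df = read_dataframe(csv_path, encoding="cp1252", use_arrow=False)
    assert df.city.tolist() == ["Zürich"]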
@@ -983,9 +1005,20 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
     assert df.iloc[0].geometry.area > area_canada
 
 
-@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
-def test_write_csv_encoding(tmp_path, encoding):
-    """Test if write_dataframe uses the default encoding correctly."""
+@pytest.mark.parametrize(
+    "encoding, arrow",
+    [
+        ("utf-8", False),
+        pytest.param("utf-8", True, marks=requires_arrow_write_api),
+        ("cp1252", False),
+        (None, False),
+    ],
+)
+def test_write_csv_encoding(tmp_path, encoding, arrow):
+    """Test if write_dataframe uses the default encoding correctly.
+
+    Arrow only supports utf-8 encoding.
+    """
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
     csv_path = tmp_path / "test.csv"
@@ -998,7 +1031,7 @@ def test_write_csv_encoding(tmp_path, encoding):
     # same encoding as above.
     df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
     csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
-    write_dataframe(df, csv_pyogrio_path, encoding=encoding)
+    write_dataframe(df, csv_pyogrio_path, encoding=encoding, use_arrow=arrow)
 
     # Check if the text files written both ways can be read again and give same result.
     with open(csv_path, encoding=encoding) as csv:
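
The write side is the mirror image; a hedged sketch under the same assumptions (hypothetical output path): write_dataframe with an explicit encoding should produce a file whose raw bytes decode with that codec.

    import pandas as pd

    from pyogrio import write_dataframe

    df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
    # Attribute-only dataframes can be written to CSV; encoding picks the codec.
    write_dataframe(df, "out.csv", encoding="cp1252")  # illustrative path

    with open("out.csv", encoding="cp1252") as f:
        assert "Zürich" in f.read()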
@@ -2325,7 +2358,10 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow):
 
     if use_arrow:
         # pyarrow cannot decode column name with incorrect encoding
-        with pytest.raises(UnicodeDecodeError):
+        with pytest.raises(
+            DataSourceError,
+            match="The file being read is not encoded in UTF-8; please use_arrow=False",
+        ):
             read_dataframe(output_path, use_arrow=True)
     else:
         bad = read_dataframe(output_path, use_arrow=False)
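
The new error message points at a natural calling pattern for downstream code: try the fast Arrow path first and retry without Arrow when the file is not UTF-8. A sketch, with a hypothetical helper name:

    from pyogrio import read_dataframe
    from pyogrio.errors import DataSourceError

    def read_any_encoding(path, **kwargs):  # hypothetical helper
        """Try the Arrow path; fall back for non-UTF-8 files."""
        try:
            return read_dataframe(path, use_arrow=True, **kwargs)
        except DataSourceError:
            # Arrow only supports UTF-8; the non-Arrow path re-encodes via GDAL.
            return read_dataframe(path, use_arrow=False, **kwargs)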