1
1
""" test parquet compat """
2
2
import datetime
3
3
from distutils .version import LooseVersion
4
- import locale
5
4
import os
6
5
from warnings import catch_warnings
7
6
@@ -131,6 +130,7 @@ def check_round_trip(
131
130
read_kwargs = None ,
132
131
expected = None ,
133
132
check_names = True ,
133
+ check_like = False ,
134
134
repeat = 2 ,
135
135
):
136
136
"""Verify parquet serializer and deserializer produce the same results.
@@ -150,6 +150,8 @@ def check_round_trip(
150
150
Expected deserialization result, otherwise will be equal to `df`
151
151
check_names: bool, optional
152
152
Whether to compare index/column names; forwarded to `assert_frame_equal`
153
+ check_like: bool, optional
154
+ If True, ignore the order of index & columns.
153
155
repeat: int, optional
154
156
How many times to repeat the test
155
157
"""
@@ -169,7 +171,9 @@ def compare(repeat):
169
171
with catch_warnings (record = True ):
170
172
actual = read_parquet (path , ** read_kwargs )
171
173
172
- tm .assert_frame_equal (expected , actual , check_names = check_names )
174
+ tm .assert_frame_equal (
175
+ expected , actual , check_names = check_names , check_like = check_like
176
+ )
173
177
174
178
if path is None :
175
179
with tm .ensure_clean () as path :
@@ -532,15 +536,37 @@ def test_categorical(self, pa):
532
536
expected = df .astype (object )
533
537
check_round_trip (df , pa , expected = expected )
534
538
535
- # GH#33077 2020-03-27
536
- @pytest .mark .xfail (
537
- locale .getlocale ()[0 ] == "zh_CN" ,
538
- reason = "dateutil cannot parse e.g. '五, 27 3月 2020 21:45:38 GMT'" ,
539
- )
540
539
def test_s3_roundtrip (self , df_compat , s3_resource , pa ):
541
540
# GH #19134
542
541
check_round_trip (df_compat , pa , path = "s3://pandas-test/pyarrow.parquet" )
543
542
543
+ @td .skip_if_no ("s3fs" )
544
+ @pytest .mark .parametrize ("partition_col" , [["A" ], []])
545
+ def test_s3_roundtrip_for_dir (self , df_compat , s3_resource , pa , partition_col ):
546
+ from pandas .io .s3 import get_fs as get_s3_fs
547
+
548
+ # GH #26388
549
+ # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716
550
+ # As per pyarrow, partitioned columns become 'categorical' dtypes
551
+ # and are appended to the back of the dataframe on read
552
+
553
+ expected_df = df_compat .copy ()
554
+ if partition_col :
555
+ expected_df [partition_col ] = expected_df [partition_col ].astype ("category" )
556
+ check_round_trip (
557
+ df_compat ,
558
+ pa ,
559
+ expected = expected_df ,
560
+ path = "s3://pandas-test/parquet_dir" ,
561
+ write_kwargs = {
562
+ "partition_cols" : partition_col ,
563
+ "compression" : None ,
564
+ "filesystem" : get_s3_fs (),
565
+ },
566
+ check_like = True ,
567
+ repeat = 1 ,
568
+ )
569
+
544
570
def test_partition_cols_supported (self , pa , df_full ):
545
571
# GH #23283
546
572
partition_cols = ["bool" , "int" ]
0 commit comments