@@ -136,6 +136,7 @@ def test_athena_ctas(path, path2, path3, glue_table, glue_table2, glue_database,
     assert len(wr.s3.list_objects(path=path3)) == 0


+@pytest.mark.modin_index
 def test_athena_read_sql_ctas_bucketing(path, path2, glue_table, glue_table2, glue_database, glue_ctas_database):
     df = pd.DataFrame({"c0": [0, 1], "c1": ["foo", "bar"]})
     wr.s3.to_parquet(
@@ -155,12 +156,14 @@ def test_athena_read_sql_ctas_bucketing(path, path2, glue_table, glue_table2, gl
             bucketing_info=(["c0"], 1),
         ),
         s3_output=path2,
+        pyarrow_additional_kwargs={"ignore_metadata": True},
     )
     df_no_ctas = wr.athena.read_sql_query(
         sql=f"SELECT * FROM {glue_table}",
         ctas_approach=False,
         database=glue_database,
         s3_output=path2,
+        pyarrow_additional_kwargs={"ignore_metadata": True},
     )
     assert df_ctas.equals(df_no_ctas)

@@ -855,6 +858,7 @@ def test_bucketing_catalog_parquet_table(path, glue_database, glue_table):
     assert table["StorageDescriptor"]["BucketColumns"] == bucket_cols


+@pytest.mark.modin_index
 @pytest.mark.parametrize("bucketing_data", [[0, 1, 2], [False, True, False], ["b", "c", "d"]])
 @pytest.mark.parametrize(
     "dtype",
@@ -907,12 +911,12 @@ def test_bucketing_parquet_dataset(path, glue_database, glue_table, bucketing_da
     if isinstance(bucketing_data[0], str):
         dtype = pd.StringDtype()

-    first_bucket_df = wr.s3.read_parquet(path=[r["paths"][0]])
+    first_bucket_df = wr.s3.read_parquet(path=[r["paths"][0]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(first_bucket_df) == 2
     assert pandas_equals(pd.Series([bucketing_data[0], bucketing_data[2]], dtype=dtype), first_bucket_df["c0"])
     assert pandas_equals(pd.Series(["foo", "baz"], dtype=pd.StringDtype()), first_bucket_df["c1"])

-    second_bucket_df = wr.s3.read_parquet(path=[r["paths"][1]])
+    second_bucket_df = wr.s3.read_parquet(path=[r["paths"][1]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(second_bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[1]], dtype=dtype), second_bucket_df["c0"])
     assert pandas_equals(pd.Series(["bar"], dtype=pd.StringDtype()), second_bucket_df["c1"])
@@ -943,6 +947,7 @@ def test_bucketing_catalog_csv_table(path, glue_database, glue_table):
     assert table["StorageDescriptor"]["BucketColumns"] == bucket_cols


+@pytest.mark.modin_index
 @pytest.mark.parametrize("bucketing_data", [[0, 1, 2], [False, True, False], ["b", "c", "d"]])
 @pytest.mark.parametrize(
     "dtype",
@@ -988,12 +993,12 @@ def test_bucketing_csv_dataset(path, glue_database, glue_table, bucketing_data,
     assert r["paths"][0].endswith("bucket-00000.csv")
     assert r["paths"][1].endswith("bucket-00001.csv")

-    first_bucket_df = wr.s3.read_csv(path=[r["paths"][0]], header=None, names=["c0", "c1"])
+    first_bucket_df = wr.s3.read_csv(path=[r["paths"][0]], header=None, names=["c0", "c1"]).reset_index(drop=True)
     assert len(first_bucket_df) == 2
     assert pandas_equals(pd.Series([bucketing_data[0], bucketing_data[2]]), first_bucket_df["c0"])
     assert pandas_equals(pd.Series(["foo", "baz"]), first_bucket_df["c1"])

-    second_bucket_df = wr.s3.read_csv(path=[r["paths"][1]], header=None, names=["c0", "c1"])
+    second_bucket_df = wr.s3.read_csv(path=[r["paths"][1]], header=None, names=["c0", "c1"]).reset_index(drop=True)
     assert len(second_bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[1]]), second_bucket_df["c0"])
     assert pandas_equals(pd.Series(["bar"]), second_bucket_df["c1"])
@@ -1008,6 +1013,7 @@ def test_bucketing_csv_dataset(path, glue_database, glue_table, bucketing_data,
     assert all(x in bucketing_data for x in loaded_df["c0"].to_list())


+@pytest.mark.modin_index
 @pytest.mark.parametrize("bucketing_data", [[0, 1, 2, 3], [False, True, False, True], ["b", "c", "d", "e"]])
 def test_combined_bucketing_partitioning_parquet_dataset(path, glue_database, glue_table, bucketing_data):
     nb_of_buckets = 2
@@ -1045,22 +1051,22 @@ def test_combined_bucketing_partitioning_parquet_dataset(path, glue_database, gl
     if isinstance(bucketing_data[0], str):
         dtype = pd.StringDtype()

-    bucket_df = wr.s3.read_parquet(path=[r["paths"][0]])
+    bucket_df = wr.s3.read_parquet(path=[r["paths"][0]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[0]], dtype=dtype), bucket_df["c0"])
     assert pandas_equals(pd.Series(["foo"], dtype=pd.StringDtype()), bucket_df["c1"])

-    bucket_df = wr.s3.read_parquet(path=[r["paths"][1]])
+    bucket_df = wr.s3.read_parquet(path=[r["paths"][1]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[1]], dtype=dtype), bucket_df["c0"])
     assert pandas_equals(pd.Series(["bar"], dtype=pd.StringDtype()), bucket_df["c1"])

-    bucket_df = wr.s3.read_parquet(path=[r["paths"][2]])
+    bucket_df = wr.s3.read_parquet(path=[r["paths"][2]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[2]], dtype=dtype), bucket_df["c0"])
     assert pandas_equals(pd.Series(["baz"], dtype=pd.StringDtype()), bucket_df["c1"])

-    bucket_df = wr.s3.read_parquet(path=[r["paths"][3]])
+    bucket_df = wr.s3.read_parquet(path=[r["paths"][3]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[3]], dtype=dtype), bucket_df["c0"])
     assert pandas_equals(pd.Series(["boo"], dtype=pd.StringDtype()), bucket_df["c1"])
@@ -1135,6 +1141,7 @@ def test_combined_bucketing_partitioning_csv_dataset(path, glue_database, glue_t
     assert all(x in bucketing_data for x in loaded_df["c0"].to_list())


+@pytest.mark.modin_index
 def test_multiple_bucketing_columns_parquet_dataset(path, glue_database, glue_table):
     nb_of_buckets = 2
     df = pd.DataFrame({"c0": [0, 1, 2, 3], "c1": [4, 6, 5, 7], "c2": ["foo", "bar", "baz", "boo"]})
@@ -1152,13 +1159,13 @@ def test_multiple_bucketing_columns_parquet_dataset(path, glue_database, glue_ta
     assert r["paths"][0].endswith("bucket-00000.snappy.parquet")
     assert r["paths"][1].endswith("bucket-00001.snappy.parquet")

-    first_bucket_df = wr.s3.read_parquet(path=[r["paths"][0]])
+    first_bucket_df = wr.s3.read_parquet(path=[r["paths"][0]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(first_bucket_df) == 2
     assert pandas_equals(pd.Series([0, 3], dtype=pd.Int64Dtype()), first_bucket_df["c0"])
     assert pandas_equals(pd.Series([4, 7], dtype=pd.Int64Dtype()), first_bucket_df["c1"])
     assert pandas_equals(pd.Series(["foo", "boo"], dtype=pd.StringDtype()), first_bucket_df["c2"])

-    second_bucket_df = wr.s3.read_parquet(path=[r["paths"][1]])
+    second_bucket_df = wr.s3.read_parquet(path=[r["paths"][1]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(second_bucket_df) == 2
     assert pandas_equals(pd.Series([1, 2], dtype=pd.Int64Dtype()), second_bucket_df["c0"])
     assert pandas_equals(pd.Series([6, 5], dtype=pd.Int64Dtype()), second_bucket_df["c1"])
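For reference, a minimal sketch (not part of the diff) of the behavior these changes lean on. The S3 path below is a placeholder, and it is assumed that awswrangler forwards pyarrow_additional_kwargs to pyarrow.Table.to_pandas(), which is how the calls above use it: with ignore_metadata=True the Parquet reader skips the pandas index metadata stored at write time and returns a frame with a plain positional index, which keeps the column comparisons aligned when the modin_index-marked tests run on the Modin backend.

import awswrangler as wr
import pandas as pd

path = "s3://example-bucket/example-prefix/"  # hypothetical location

df = pd.DataFrame({"c0": [0, 1], "c1": ["foo", "bar"]})
wr.s3.to_parquet(df=df, path=path, dataset=True)

# ignore_metadata is passed through to pyarrow's to_pandas(), so the stored
# pandas metadata is ignored and the frame comes back with a default RangeIndex.
df_back = wr.s3.read_parquet(
    path=path,
    pyarrow_additional_kwargs={"ignore_metadata": True},
)
print(df_back.index)  # RangeIndex(start=0, stop=2, step=1)

# The CSV reader has no equivalent option, hence the explicit
# .reset_index(drop=True) on the wr.s3.read_csv() results above.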