44
55import boto3
66import numpy as np
7- import pandas as pd
87import pytest
8+ from pandas import DataFrame as PandasDataFrame
99
1010import awswrangler as wr
1111
1919 get_df_list ,
2020 get_df_txt ,
2121 get_time_str_with_random_suffix ,
22+ is_ray_modin ,
23+ pandas_equals ,
2224)
2325
26+ if is_ray_modin :
27+ import modin .pandas as pd
28+ else :
29+ import pandas as pd
30+
2431logging .getLogger ("awswrangler" ).setLevel (logging .DEBUG )
2532
33+ pytestmark = pytest .mark .distributed
34+
2635
2736def test_athena_ctas (path , path2 , path3 , glue_table , glue_table2 , glue_database , glue_ctas_database , kms_key ):
2837 df = get_df_list ()
@@ -203,6 +212,7 @@ def test_athena_create_ctas(path, glue_table, glue_table2, glue_database, glue_c
203212 ensure_athena_ctas_table (ctas_query_info = ctas_query_info , boto3_session = boto3_session )
204213
205214
215+ @pytest .mark .xfail (is_ray_modin , raises = AssertionError , reason = "Index equality regression" )
206216def test_athena (path , glue_database , glue_table , kms_key , workgroup0 , workgroup1 ):
207217 wr .catalog .delete_table_if_exists (database = glue_database , table = glue_table )
208218 wr .s3 .to_parquet (
@@ -821,13 +831,13 @@ def test_bucketing_parquet_dataset(path, glue_database, glue_table, bucketing_da
821831
822832 first_bucket_df = wr .s3 .read_parquet (path = [r ["paths" ][0 ]])
823833 assert len (first_bucket_df ) == 2
824- assert pd .Series ([bucketing_data [0 ], bucketing_data [2 ]], dtype = dtype ). equals ( first_bucket_df ["c0" ])
825- assert pd .Series (["foo" , "baz" ], dtype = pd .StringDtype ()). equals ( first_bucket_df ["c1" ])
834+ assert pandas_equals ( pd .Series ([bucketing_data [0 ], bucketing_data [2 ]], dtype = dtype ), first_bucket_df ["c0" ])
835+ assert pandas_equals ( pd .Series (["foo" , "baz" ], dtype = pd .StringDtype ()), first_bucket_df ["c1" ])
826836
827837 second_bucket_df = wr .s3 .read_parquet (path = [r ["paths" ][1 ]])
828838 assert len (second_bucket_df ) == 1
829- assert pd .Series ([bucketing_data [1 ]], dtype = dtype ). equals ( second_bucket_df ["c0" ])
830- assert pd .Series (["bar" ], dtype = pd .StringDtype ()). equals ( second_bucket_df ["c1" ])
839+ assert pandas_equals ( pd .Series ([bucketing_data [1 ]], dtype = dtype ), second_bucket_df ["c0" ])
840+ assert pandas_equals ( pd .Series (["bar" ], dtype = pd .StringDtype ()), second_bucket_df ["c1" ])
831841
832842 loaded_dfs = [
833843 wr .s3 .read_parquet (path = path ),
@@ -903,13 +913,13 @@ def test_bucketing_csv_dataset(path, glue_database, glue_table, bucketing_data,
903913
904914 first_bucket_df = wr .s3 .read_csv (path = [r ["paths" ][0 ]], header = None , names = ["c0" , "c1" ])
905915 assert len (first_bucket_df ) == 2
906- assert pd .Series ([bucketing_data [0 ], bucketing_data [2 ]]). equals ( first_bucket_df ["c0" ])
907- assert pd .Series (["foo" , "baz" ]). equals ( first_bucket_df ["c1" ])
916+ assert pandas_equals ( pd .Series ([bucketing_data [0 ], bucketing_data [2 ]]), first_bucket_df ["c0" ])
917+ assert pandas_equals ( pd .Series (["foo" , "baz" ]), first_bucket_df ["c1" ])
908918
909919 second_bucket_df = wr .s3 .read_csv (path = [r ["paths" ][1 ]], header = None , names = ["c0" , "c1" ])
910920 assert len (second_bucket_df ) == 1
911- assert pd .Series ([bucketing_data [1 ]]). equals ( second_bucket_df ["c0" ])
912- assert pd .Series (["bar" ]). equals ( second_bucket_df ["c1" ])
921+ assert pandas_equals ( pd .Series ([bucketing_data [1 ]]), second_bucket_df ["c0" ])
922+ assert pandas_equals ( pd .Series (["bar" ]), second_bucket_df ["c1" ])
913923
914924 loaded_dfs = [
915925 wr .s3 .read_csv (path = path , header = None , names = ["c0" , "c1" ]),
@@ -960,23 +970,23 @@ def test_combined_bucketing_partitioning_parquet_dataset(path, glue_database, gl
960970
961971 bucket_df = wr .s3 .read_parquet (path = [r ["paths" ][0 ]])
962972 assert len (bucket_df ) == 1
963- assert pd .Series ([bucketing_data [0 ]], dtype = dtype ). equals ( bucket_df ["c0" ])
964- assert pd .Series (["foo" ], dtype = pd .StringDtype ()). equals ( bucket_df ["c1" ])
973+ assert pandas_equals ( pd .Series ([bucketing_data [0 ]], dtype = dtype ), bucket_df ["c0" ])
974+ assert pandas_equals ( pd .Series (["foo" ], dtype = pd .StringDtype ()), bucket_df ["c1" ])
965975
966976 bucket_df = wr .s3 .read_parquet (path = [r ["paths" ][1 ]])
967977 assert len (bucket_df ) == 1
968- assert pd .Series ([bucketing_data [1 ]], dtype = dtype ). equals ( bucket_df ["c0" ])
969- assert pd .Series (["bar" ], dtype = pd .StringDtype ()). equals ( bucket_df ["c1" ])
978+ assert pandas_equals ( pd .Series ([bucketing_data [1 ]], dtype = dtype ), bucket_df ["c0" ])
979+ assert pandas_equals ( pd .Series (["bar" ], dtype = pd .StringDtype ()), bucket_df ["c1" ])
970980
971981 bucket_df = wr .s3 .read_parquet (path = [r ["paths" ][2 ]])
972982 assert len (bucket_df ) == 1
973- assert pd .Series ([bucketing_data [2 ]], dtype = dtype ). equals ( bucket_df ["c0" ])
974- assert pd .Series (["baz" ], dtype = pd .StringDtype ()). equals ( bucket_df ["c1" ])
983+ assert pandas_equals ( pd .Series ([bucketing_data [2 ]], dtype = dtype ), bucket_df ["c0" ])
984+ assert pandas_equals ( pd .Series (["baz" ], dtype = pd .StringDtype ()), bucket_df ["c1" ])
975985
976986 bucket_df = wr .s3 .read_parquet (path = [r ["paths" ][3 ]])
977987 assert len (bucket_df ) == 1
978- assert pd .Series ([bucketing_data [3 ]], dtype = dtype ). equals ( bucket_df ["c0" ])
979- assert pd .Series (["boo" ], dtype = pd .StringDtype ()). equals ( bucket_df ["c1" ])
988+ assert pandas_equals ( pd .Series ([bucketing_data [3 ]], dtype = dtype ), bucket_df ["c0" ])
989+ assert pandas_equals ( pd .Series (["boo" ], dtype = pd .StringDtype ()), bucket_df ["c1" ])
980990
981991 loaded_dfs = [
982992 wr .s3 .read_parquet (path = path ),
@@ -1020,23 +1030,23 @@ def test_combined_bucketing_partitioning_csv_dataset(path, glue_database, glue_t
10201030
10211031 bucket_df = wr .s3 .read_csv (path = [r ["paths" ][0 ]], header = None , names = ["c0" , "c1" ])
10221032 assert len (bucket_df ) == 1
1023- assert pd .Series ([bucketing_data [0 ]]). equals ( bucket_df ["c0" ])
1024- assert pd .Series (["foo" ]). equals ( bucket_df ["c1" ])
1033+ assert pandas_equals ( pd .Series ([bucketing_data [0 ]]), bucket_df ["c0" ])
1034+ assert pandas_equals ( pd .Series (["foo" ]), bucket_df ["c1" ])
10251035
10261036 bucket_df = wr .s3 .read_csv (path = [r ["paths" ][1 ]], header = None , names = ["c0" , "c1" ])
10271037 assert len (bucket_df ) == 1
1028- assert pd .Series ([bucketing_data [1 ]]). equals ( bucket_df ["c0" ])
1029- assert pd .Series (["bar" ]). equals ( bucket_df ["c1" ])
1038+ assert pandas_equals ( pd .Series ([bucketing_data [1 ]]), bucket_df ["c0" ])
1039+ assert pandas_equals ( pd .Series (["bar" ]), bucket_df ["c1" ])
10301040
10311041 bucket_df = wr .s3 .read_csv (path = [r ["paths" ][2 ]], header = None , names = ["c0" , "c1" ])
10321042 assert len (bucket_df ) == 1
1033- assert pd .Series ([bucketing_data [2 ]]). equals ( bucket_df ["c0" ])
1034- assert pd .Series (["baz" ]). equals ( bucket_df ["c1" ])
1043+ assert pandas_equals ( pd .Series ([bucketing_data [2 ]]), bucket_df ["c0" ])
1044+ assert pandas_equals ( pd .Series (["baz" ]), bucket_df ["c1" ])
10351045
10361046 bucket_df = wr .s3 .read_csv (path = [r ["paths" ][3 ]], header = None , names = ["c0" , "c1" ])
10371047 assert len (bucket_df ) == 1
1038- assert pd .Series ([bucketing_data [3 ]]). equals ( bucket_df ["c0" ])
1039- assert pd .Series (["boo" ]). equals ( bucket_df ["c1" ])
1048+ assert pandas_equals ( pd .Series ([bucketing_data [3 ]]), bucket_df ["c0" ])
1049+ assert pandas_equals ( pd .Series (["boo" ]), bucket_df ["c1" ])
10401050
10411051 loaded_dfs = [
10421052 wr .s3 .read_csv (path = path , header = None , names = ["c0" , "c1" ]),
@@ -1067,15 +1077,15 @@ def test_multiple_bucketing_columns_parquet_dataset(path, glue_database, glue_ta
10671077
10681078 first_bucket_df = wr .s3 .read_parquet (path = [r ["paths" ][0 ]])
10691079 assert len (first_bucket_df ) == 2
1070- assert pd .Series ([0 , 3 ], dtype = pd .Int64Dtype ()). equals ( first_bucket_df ["c0" ])
1071- assert pd .Series ([4 , 7 ], dtype = pd .Int64Dtype ()). equals ( first_bucket_df ["c1" ])
1072- assert pd .Series (["foo" , "boo" ], dtype = pd .StringDtype ()). equals ( first_bucket_df ["c2" ])
1080+ assert pandas_equals ( pd .Series ([0 , 3 ], dtype = pd .Int64Dtype ()), first_bucket_df ["c0" ])
1081+ assert pandas_equals ( pd .Series ([4 , 7 ], dtype = pd .Int64Dtype ()), first_bucket_df ["c1" ])
1082+ assert pandas_equals ( pd .Series (["foo" , "boo" ], dtype = pd .StringDtype ()), first_bucket_df ["c2" ])
10731083
10741084 second_bucket_df = wr .s3 .read_parquet (path = [r ["paths" ][1 ]])
10751085 assert len (second_bucket_df ) == 2
1076- assert pd .Series ([1 , 2 ], dtype = pd .Int64Dtype ()). equals ( second_bucket_df ["c0" ])
1077- assert pd .Series ([6 , 5 ], dtype = pd .Int64Dtype ()). equals ( second_bucket_df ["c1" ])
1078- assert pd .Series (["bar" , "baz" ], dtype = pd .StringDtype ()). equals ( second_bucket_df ["c2" ])
1086+ assert pandas_equals ( pd .Series ([1 , 2 ], dtype = pd .Int64Dtype ()), second_bucket_df ["c0" ])
1087+ assert pandas_equals ( pd .Series ([6 , 5 ], dtype = pd .Int64Dtype ()), second_bucket_df ["c1" ])
1088+ assert pandas_equals ( pd .Series (["bar" , "baz" ], dtype = pd .StringDtype ()), second_bucket_df ["c2" ])
10791089
10801090
10811091@pytest .mark .parametrize ("dtype" , ["int" , "str" , "bool" ])
@@ -1216,14 +1226,14 @@ def test_get_query_results(path, glue_table, glue_database):
12161226 )
12171227 query_id_ctas = df_ctas .query_metadata ["QueryExecutionId" ]
12181228 df_get_query_results_ctas = wr .athena .get_query_results (query_execution_id = query_id_ctas )
1219- pd . testing . assert_frame_equal (df_get_query_results_ctas , df_ctas )
1229+ assert pandas_equals (df_get_query_results_ctas , df_ctas )
12201230
12211231 df_unload : pd .DataFrame = wr .athena .read_sql_query (
12221232 sql = sql , database = glue_database , ctas_approach = False , unload_approach = True , s3_output = path
12231233 )
12241234 query_id_unload = df_unload .query_metadata ["QueryExecutionId" ]
12251235 df_get_query_results_df_unload = wr .athena .get_query_results (query_execution_id = query_id_unload )
1226- pd . testing . assert_frame_equal (df_get_query_results_df_unload , df_unload )
1236+ assert pandas_equals (df_get_query_results_df_unload , df_unload )
12271237
12281238 wr .catalog .delete_table_if_exists (database = glue_database , table = glue_table )
12291239 wr .s3 .to_parquet (
@@ -1245,7 +1255,7 @@ def test_get_query_results(path, glue_table, glue_database):
12451255 )
12461256 query_id_regular = df_regular .query_metadata ["QueryExecutionId" ]
12471257 df_get_query_results_df_regular = wr .athena .get_query_results (query_execution_id = query_id_regular )
1248- pd . testing . assert_frame_equal (df_get_query_results_df_regular , df_regular )
1258+ assert pandas_equals (df_get_query_results_df_regular , df_regular )
12491259
12501260
12511261def test_athena_generate_create_query (path , glue_database , glue_table ):
@@ -1326,13 +1336,13 @@ def test_get_query_execution(workgroup0, workgroup1):
13261336 assert query_execution_ids
13271337 query_execution_detail = wr .athena .get_query_execution (query_execution_id = query_execution_ids [0 ])
13281338 query_executions_df = wr .athena .get_query_executions (query_execution_ids )
1329- assert isinstance (query_executions_df , pd . DataFrame )
1339+ assert isinstance (query_executions_df , PandasDataFrame )
13301340 assert isinstance (query_execution_detail , dict )
13311341 assert set (query_execution_ids ).intersection (set (query_executions_df ["QueryExecutionId" ].values .tolist ()))
13321342 query_execution_ids1 = query_execution_ids + ["aaa" , "bbb" ]
13331343 query_executions_df , unprocessed_query_executions_df = wr .athena .get_query_executions (
13341344 query_execution_ids1 , return_unprocessed = True
13351345 )
1336- assert isinstance (unprocessed_query_executions_df , pd . DataFrame )
1346+ assert isinstance (unprocessed_query_executions_df , PandasDataFrame )
13371347 assert set (query_execution_ids ).intersection (set (query_executions_df ["QueryExecutionId" ].values .tolist ()))
13381348 assert {"aaa" , "bbb" }.intersection (set (unprocessed_query_executions_df ["QueryExecutionId" ].values .tolist ()))
0 commit comments