@@ -499,7 +499,7 @@ def read_sql_athena(self,
499499 workgroup: Optional[str] = None,
500500 encryption: Optional[str] = None,
501501 kms_key: Optional[str] = None,
502- ctas_approach: bool = False,
502+ ctas_approach: bool = None,
503503 procs_cpu_bound: Optional[int] = None,
504504 max_result_size: Optional[int] = None):
505505 """
@@ -523,11 +523,12 @@ def read_sql_athena(self,
523523 :param workgroup: The name of the workgroup in which the query is being started. (By default uses the Session() workgroup)
524524 :param encryption: None|'SSE_S3'|'SSE_KMS'|'CSE_KMS'
525525 :param kms_key: For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
526- :param ctas_approach: Wraps the query with a CTAS
526+ :param ctas_approach: Wraps the query with a CTAS (the Session's default is False)
527527 :param procs_cpu_bound: Number of cores used for CPU bound tasks
528528 :param max_result_size: Max number of bytes on each request to S3 (VALID ONLY FOR ctas_approach=False)
529529 :return: Pandas Dataframe or Iterator of Pandas Dataframes if max_result_size was passed
530530 """
531+ ctas_approach = ctas_approach if ctas_approach is not None else self._session.ctas_approach if self._session.ctas_approach is not None else False
531532 if ctas_approach is True and max_result_size is not None:
532533 raise InvalidParameters("ctas_approach can't use max_result_size!")
533534 if s3_output is None:
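With this change ctas_approach resolves in three steps: an explicit argument wins, otherwise the Session-level ctas_approach setting applies, otherwise it falls back to False. A rough usage sketch under that assumption (the Session keyword and the pandas accessor follow this codebase's 0.x API, but treat the names as illustrative rather than authoritative):

    import awswrangler

    # Assumed session-wide default: wrap every Athena query with a CTAS.
    session = awswrangler.Session(ctas_approach=True)

    # Uses the session default (CTAS, results read back as Parquet).
    df = session.pandas.read_sql_athena(sql="SELECT col_a, col_b FROM my_table",
                                        database="my_database")

    # An explicit argument overrides the session default for a single call.
    df = session.pandas.read_sql_athena(sql="SELECT col_a, col_b FROM my_table",
                                        database="my_database",
                                        ctas_approach=False)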
@@ -580,7 +581,10 @@ def _read_sql_athena_ctas(self,
580581 kms_key=kms_key)
581582 self._session.athena.wait_query(query_execution_id=query_id)
582583 self._session.glue.delete_table_if_exists(database=database, table=name)
583- return self.read_parquet(path=path, procs_cpu_bound=procs_cpu_bound)
584+ manifest_path: str = f"{s3_output}/tables/{query_id}-manifest.csv"
585+ paths: List[str] = self._session.athena.extract_manifest_paths(path=manifest_path)
586+ logger.debug(f"paths: {paths}")
587+ return self.read_parquet(path=paths, procs_cpu_bound=procs_cpu_bound)
584588
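The CTAS branch now resolves its output files through the query's -manifest.csv object instead of reading the output prefix directly; extract_manifest_paths is defined elsewhere in this change. A minimal sketch of what such a helper could do, assuming the manifest is a plain newline-delimited list of S3 object paths (which is how Athena writes CTAS manifests); the function name and boto3 usage below are illustrative only:

    from typing import List
    import boto3

    def extract_manifest_paths_sketch(path: str) -> List[str]:
        """Hypothetical stand-in for Session.athena.extract_manifest_paths()."""
        bucket, key = path.replace("s3://", "").split("/", 1)
        body = boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read()
        # One S3 object path per line; ignore empty lines.
        return [line.strip() for line in body.decode("utf-8").split("\n") if line.strip()]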
585589 def _read_sql_athena_regular(self,
586590 sql: str,
@@ -1209,30 +1213,150 @@ def drop_duplicated_columns(dataframe: pd.DataFrame, inplace: bool = True) -> pd
12091213 return dataframe.loc[:, ~duplicated_cols]
12101214
12111215 def read_parquet(self,
1212- path: str,
1216+ path: Union[str, List[str]],
12131217 columns: Optional[List[str]] = None,
12141218 filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
12151219 procs_cpu_bound: Optional[int] = None) -> pd.DataFrame:
12161220 """
12171221 Read parquet data from S3
12181222
1223+ :param path: AWS S3 path or List of paths (E.g. s3://bucket-name/folder_name/)
1224+ :param columns: Names of columns to read from the file
1225+ :param filters: List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
1226+ :param procs_cpu_bound: Number of cores used for CPU bound tasks
1227+ """
1228+ procs_cpu_bound = procs_cpu_bound if procs_cpu_bound is not None else self._session.procs_cpu_bound if self._session.procs_cpu_bound is not None else 1
1229+ logger.debug(f"procs_cpu_bound: {procs_cpu_bound}")
1230+ df: Optional[pd.DataFrame] = None
1231+ session_primitives = self._session.primitives
1232+ path = [path] if type(path) == str else path  # type: ignore
1233+ bounders = calculate_bounders(len(path), procs_cpu_bound)
1234+ logger.debug(f"len(bounders): {len(bounders)}")
1235+ if len(bounders) == 1:
1236+ df = Pandas._read_parquet_paths(session_primitives=session_primitives,
1237+ path=path,
1238+ columns=columns,
1239+ filters=filters,
1240+ procs_cpu_bound=procs_cpu_bound)
1241+ else:
1242+ procs = []
1243+ receive_pipes = []
1244+ for bounder in bounders:
1245+ receive_pipe, send_pipe = mp.Pipe()
1246+ logger.debug(f"bounder: {bounder}")
1247+ proc = mp.Process(
1248+ target=self._read_parquet_paths_remote,
1249+ args=(
1250+ send_pipe,
1251+ session_primitives,
1252+ path[bounder[0]:bounder[1]],
1253+ columns,
1254+ filters,
1255+ 1  # procs_cpu_bound
1256+ ),
1257+ )
1258+ proc.daemon = False
1259+ proc.start()
1260+ procs.append(proc)
1261+ receive_pipes.append(receive_pipe)
1262+ logger.debug(f"len(procs): {len(bounders)}")
1263+ for i in range(len(procs)):
1264+ logger.debug(f"Waiting pipe number: {i}")
1265+ df_received = receive_pipes[i].recv()
1266+ if df is None:
1267+ df = df_received
1268+ else:
1269+ df = pd.concat(objs=[df, df_received], ignore_index=True)
1270+ logger.debug(f"Waiting proc number: {i}")
1271+ procs[i].join()
1272+ logger.debug(f"Closing proc number: {i}")
1273+ receive_pipes[i].close()
1274+ return df
1275+
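read_parquet splits the list of paths across processes with calculate_bounders, a helper imported from the package's utils; conceptually it partitions the item indexes into at most procs_cpu_bound contiguous slices. A sketch of that partitioning logic, assuming an even split with the remainder spread over the first slices (illustrative, not the module's actual implementation):

    from typing import List, Tuple

    def calculate_bounders_sketch(num_items: int, num_procs: int) -> List[Tuple[int, int]]:
        """Split range(num_items) into at most num_procs contiguous [start, end) slices."""
        num_procs = min(num_items, num_procs) if num_items > 0 else 1
        size, rest = divmod(num_items, num_procs)
        bounders = []
        start = 0
        for i in range(num_procs):
            end = start + size + (1 if i < rest else 0)
            bounders.append((start, end))
            start = end
        return bounders

    # e.g. 10 paths over 3 processes -> [(0, 4), (4, 7), (7, 10)]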
1276+ @staticmethod
1277+ def _read_parquet_paths_remote(send_pipe: mp.connection.Connection,
1278+ session_primitives: Any,
1279+ path: Union[str, List[str]],
1280+ columns: Optional[List[str]] = None,
1281+ filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
1282+ procs_cpu_bound: Optional[int] = None):
1283+ df: pd.DataFrame = Pandas._read_parquet_paths(session_primitives=session_primitives,
1284+ path=path,
1285+ columns=columns,
1286+ filters=filters,
1287+ procs_cpu_bound=procs_cpu_bound)
1288+ send_pipe.send(df)
1289+ send_pipe.close()
1290+
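_read_parquet_paths_remote is the child side of a plain fan-out/fan-in: the parent opens one multiprocessing.Pipe per slice of paths, each child reads its slice and sends the resulting DataFrame back, and the parent receives before joining (recv must come first, otherwise a child blocked on a full pipe can never exit). A self-contained sketch of the same pattern without S3 (all names below are illustrative):

    import multiprocessing as mp
    from multiprocessing.connection import Connection
    import pandas as pd

    def _worker(send_pipe: Connection, chunk: list) -> None:
        # Stand-in for _read_parquet_paths: build a DataFrame from the chunk.
        send_pipe.send(pd.DataFrame({"value": chunk}))
        send_pipe.close()

    def fan_out(items: list, num_procs: int) -> pd.DataFrame:
        num_procs = max(1, min(num_procs, len(items)))
        chunks = [items[i::num_procs] for i in range(num_procs)]
        procs, pipes = [], []
        for chunk in chunks:
            receive_pipe, send_pipe = mp.Pipe()
            proc = mp.Process(target=_worker, args=(send_pipe, chunk))
            proc.start()
            procs.append(proc)
            pipes.append(receive_pipe)
        frames = [pipe.recv() for pipe in pipes]  # recv() before join() to avoid pipe deadlocks
        for proc, pipe in zip(procs, pipes):
            proc.join()
            pipe.close()
        return pd.concat(frames, ignore_index=True)

    if __name__ == "__main__":
        print(fan_out(list(range(10)), 3))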
1291+ @staticmethod
1292+ def _read_parquet_paths(session_primitives: Any,
1293+ path: Union[str, List[str]],
1294+ columns: Optional[List[str]] = None,
1295+ filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
1296+ procs_cpu_bound: Optional[int] = None) -> pd.DataFrame:
1297+ """
1298+ Read parquet data from S3
1299+
1300+ :param session_primitives: SessionPrimitives()
1301+ :param path: AWS S3 path or List of paths (E.g. s3://bucket-name/folder_name/)
1302+ :param columns: Names of columns to read from the file
1303+ :param filters: List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
1304+ :param procs_cpu_bound: Number of cores used for CPU bound tasks
1305+ """
1306+ df: pd.DataFrame
1307+ if (type(path) == str) or (len(path) == 1):
1308+ path = path[0] if type(path) == list else path  # type: ignore
1309+ df = Pandas._read_parquet_path(
1310+ session_primitives=session_primitives,
1311+ path=path,  # type: ignore
1312+ columns=columns,
1313+ filters=filters,
1314+ procs_cpu_bound=procs_cpu_bound)
1315+ else:
1316+ df = Pandas._read_parquet_path(session_primitives=session_primitives,
1317+ path=path[0],
1318+ columns=columns,
1319+ filters=filters,
1320+ procs_cpu_bound=procs_cpu_bound)
1321+ for p in path[1:]:
1322+ df_aux = Pandas._read_parquet_path(session_primitives=session_primitives,
1323+ path=p,
1324+ columns=columns,
1325+ filters=filters,
1326+ procs_cpu_bound=procs_cpu_bound)
1327+ df = pd.concat(objs=[df, df_aux], ignore_index=True)
1328+ return df
1329+
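The filters argument is handed straight to pyarrow.parquet.read_table, so it uses pyarrow's tuple convention: a flat list of tuples is AND-ed, and a list of such lists is OR-ed; with the legacy ParquetDataset reader this typically prunes on partition columns. A hypothetical call (bucket, columns, and partition names are made up):

    # Dataset assumed to be partitioned as s3://my-bucket/sales/year=.../month=.../
    df = session.pandas.read_parquet(path="s3://my-bucket/sales/",
                                     columns=["customer_id", "amount"],
                                     filters=[("year", "=", "2019"), ("month", "=", "12")])

    # OR of two AND-groups:
    df = session.pandas.read_parquet(path="s3://my-bucket/sales/",
                                     filters=[[("year", "=", "2018")], [("year", "=", "2019")]])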
1330+ @staticmethod
1331+ def _read_parquet_path(session_primitives: Any,
1332+ path: str,
1333+ columns: Optional[List[str]] = None,
1334+ filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
1335+ procs_cpu_bound: Optional[int] = None) -> pd.DataFrame:
1336+ """
1337+ Read parquet data from S3
1338+
1339+ :param session_primitives: SessionPrimitives()
12191340 :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
12201341 :param columns: Names of columns to read from the file
12211342 :param filters: List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
12221343 :param procs_cpu_bound: Number of cores used for CPU bound tasks
12231344 """
12241345 path = path[:-1] if path[-1] == "/" else path
1225- procs_cpu_bound = 1 if self._session.procs_cpu_bound is None else self._session.procs_cpu_bound if procs_cpu_bound is None else procs_cpu_bound
1346+ procs_cpu_bound = procs_cpu_bound if procs_cpu_bound is not None else session_primitives.procs_cpu_bound if session_primitives.procs_cpu_bound is not None else 1
12261347 use_threads: bool = True if procs_cpu_bound > 1 else False
1227- fs: S3FileSystem = s3.get_fs(session_primitives=self._session.primitives)
1348+ fs: S3FileSystem = s3.get_fs(session_primitives=session_primitives)
12281349 fs = pa.filesystem._ensure_filesystem(fs)
1350+ logger.debug(f"Reading Parquet table: {path}")
12291351 table = pq.read_table(source=path, columns=columns, filters=filters, filesystem=fs, use_threads=use_threads)
12301352 # Check whether any integer column lost its dtype during the conversion (happens when the column contains null values)
12311353 integers = [field.name for field in table.schema if str(field.type).startswith("int")]
1354+ logger.debug(f"Converting to Pandas: {path}")
12321355 df = table.to_pandas(use_threads=use_threads, integer_object_nulls=True)
12331356 for c in integers:
12341357 if not str(df[c].dtype).startswith("int"):
12351358 df[c] = df[c].astype("Int64")
1359+ logger.debug(f"Done: {path}")
12361360 return df
12371361
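The trailing loop in _read_parquet_path exists because the Arrow-to-pandas conversion cannot keep a plain int64 dtype for a column containing nulls: with integer_object_nulls=True the column comes back as object (Python ints plus None), and it is then recast to pandas' nullable Int64 extension dtype. A small standalone illustration of that effect (no S3 involved):

    import pandas as pd
    import pyarrow as pa

    table = pa.table({"x": pa.array([1, None, 3], type=pa.int64())})

    df = table.to_pandas(integer_object_nulls=True)
    print(df["x"].dtype)   # object -- Python ints and None, integer values preserved

    df["x"] = df["x"].astype("Int64")
    print(df["x"].dtype)   # Int64 -- pandas' nullable integer dtype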
12381362 def read_table(self,