Commit c99dca2

Make ctas_approach work even with eventual consistency issues
1 parent 9903c9e commit c99dca2
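
For context: with ctas_approach enabled, the query is wrapped in a CTAS and the results are read back from the exact Parquet files listed in the CTAS manifest, rather than from an S3 listing of the output prefix, which can lag behind due to eventual consistency. A minimal usage sketch, assuming the 0.x-style API exercised by the tests in this commit (the top-level Session import and the table/database names are placeholders):

import awswrangler

session = awswrangler.Session()  # assumes Session is exported at package level

# Wrap the query in a CTAS; the resulting Parquet files are read directly from
# the paths listed in the "<query-id>-manifest.csv" object that Athena writes.
df = session.pandas.read_sql_athena(ctas_approach=True,
                                    sql="SELECT * FROM my_table",  # placeholder table
                                    database="my_database")        # placeholder database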

6 files changed, +210 −36 lines changed

awswrangler/athena.py

Lines changed: 15 additions & 1 deletion
@@ -16,7 +16,10 @@
 class Athena:
     def __init__(self, session):
         self._session = session
-        self._client_athena = session.boto3_session.client(service_name="athena", config=session.botocore_config)
+        self._client_athena = session.boto3_session.client(service_name="athena",
+                                                           use_ssl=True,
+                                                           config=session.botocore_config)
+        self._client_s3 = session.boto3_session.client(service_name="s3", use_ssl=True, config=session.botocore_config)
 
     def get_query_columns_metadata(self, query_execution_id: str) -> Dict[str, str]:
         """
@@ -256,3 +259,14 @@ def normalize_table_name(name):
         :return: normalized table name (str)
         """
         return Athena._normalize_name(name=name)
+
+    @staticmethod
+    def _parse_path(path):
+        path2 = path.replace("s3://", "")
+        parts = path2.partition("/")
+        return parts[0], parts[2]
+
+    def extract_manifest_paths(self, path: str) -> List[str]:
+        bucket_name, key_path = self._parse_path(path)
+        body: bytes = self._client_s3.get_object(Bucket=bucket_name, Key=key_path)["Body"].read()
+        return [x for x in body.decode('utf-8').split("\n") if x != ""]
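
To illustrate what the new helpers consume: the manifest referenced elsewhere in this commit (f"{s3_output}/tables/{query_id}-manifest.csv") is a plain-text object with one S3 path per line, one for each file the CTAS wrote. A minimal sketch of the parsing done by _parse_path and extract_manifest_paths, using hypothetical manifest contents:

# Hypothetical manifest body; the real one is written by Athena for the CTAS query.
body = (b"s3://my-bucket/output/tables/query-id/file_0\n"
        b"s3://my-bucket/output/tables/query-id/file_1\n")

# Same split as Athena._parse_path(): "s3://bucket/key" -> (bucket, key).
manifest = "s3://my-bucket/output/tables/query-id-manifest.csv"
bucket, _, key = manifest.replace("s3://", "").partition("/")

# Same filtering as extract_manifest_paths(): one non-empty line per output file.
paths = [line for line in body.decode("utf-8").split("\n") if line != ""]
print(bucket, key, paths)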

awswrangler/pandas.py

Lines changed: 130 additions & 6 deletions
@@ -499,7 +499,7 @@ def read_sql_athena(self,
                         workgroup: Optional[str] = None,
                         encryption: Optional[str] = None,
                         kms_key: Optional[str] = None,
-                        ctas_approach: bool = False,
+                        ctas_approach: bool = None,
                         procs_cpu_bound: Optional[int] = None,
                         max_result_size: Optional[int] = None):
         """
@@ -523,11 +523,12 @@ def read_sql_athena(self,
         :param workgroup: The name of the workgroup in which the query is being started. (By default uses the Session() workgroup)
         :param encryption: None|'SSE_S3'|'SSE_KMS'|'CSE_KMS'
         :param kms_key: For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID.
-        :param ctas_approach: Wraps the query with a CTAS
+        :param ctas_approach: Wraps the query with a CTAS (Session's default is False)
         :param procs_cpu_bound: Number of cores used for CPU bound tasks
         :param max_result_size: Max number of bytes on each request to S3 (VALID ONLY FOR ctas_approach=False)
         :return: Pandas Dataframe or Iterator of Pandas Dataframes if max_result_size was passed
         """
+        ctas_approach = ctas_approach if ctas_approach is not None else self._session.athena_ctas_approach if self._session.athena_ctas_approach is not None else False
         if ctas_approach is True and max_result_size is not None:
             raise InvalidParameters("ctas_approach can't use max_result_size!")
         if s3_output is None:
@@ -580,7 +581,10 @@ def _read_sql_athena_ctas(self,
                                                        kms_key=kms_key)
         self._session.athena.wait_query(query_execution_id=query_id)
         self._session.glue.delete_table_if_exists(database=database, table=name)
-        return self.read_parquet(path=path, procs_cpu_bound=procs_cpu_bound)
+        manifest_path: str = f"{s3_output}/tables/{query_id}-manifest.csv"
+        paths: List[str] = self._session.athena.extract_manifest_paths(path=manifest_path)
+        logger.debug(f"paths: {paths}")
+        return self.read_parquet(path=paths, procs_cpu_bound=procs_cpu_bound)
 
     def _read_sql_athena_regular(self,
                                  sql: str,
@@ -1209,30 +1213,150 @@ def drop_duplicated_columns(dataframe: pd.DataFrame, inplace: bool = True) -> pd
         return dataframe.loc[:, ~duplicated_cols]
 
     def read_parquet(self,
-                     path: str,
+                     path: Union[str, List[str]],
                      columns: Optional[List[str]] = None,
                      filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
                      procs_cpu_bound: Optional[int] = None) -> pd.DataFrame:
         """
         Read parquet data from S3
 
+        :param path: AWS S3 path or List of paths (E.g. s3://bucket-name/folder_name/)
+        :param columns: Names of columns to read from the file
+        :param filters: List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
+        :param procs_cpu_bound: Number of cores used for CPU bound tasks
+        """
+        procs_cpu_bound = procs_cpu_bound if procs_cpu_bound is not None else self._session.procs_cpu_bound if self._session.procs_cpu_bound is not None else 1
+        logger.debug(f"procs_cpu_bound: {procs_cpu_bound}")
+        df: Optional[pd.DataFrame] = None
+        session_primitives = self._session.primitives
+        path = [path] if type(path) == str else path  # type: ignore
+        bounders = calculate_bounders(len(path), procs_cpu_bound)
+        logger.debug(f"len(bounders): {len(bounders)}")
+        if len(bounders) == 1:
+            df = Pandas._read_parquet_paths(session_primitives=session_primitives,
+                                            path=path,
+                                            columns=columns,
+                                            filters=filters,
+                                            procs_cpu_bound=procs_cpu_bound)
+        else:
+            procs = []
+            receive_pipes = []
+            for bounder in bounders:
+                receive_pipe, send_pipe = mp.Pipe()
+                logger.debug(f"bounder: {bounder}")
+                proc = mp.Process(
+                    target=self._read_parquet_paths_remote,
+                    args=(
+                        send_pipe,
+                        session_primitives,
+                        path[bounder[0]:bounder[1]],
+                        columns,
+                        filters,
+                        1  # procs_cpu_bound
+                    ),
+                )
+                proc.daemon = False
+                proc.start()
+                procs.append(proc)
+                receive_pipes.append(receive_pipe)
+            logger.debug(f"len(procs): {len(bounders)}")
+            for i in range(len(procs)):
+                logger.debug(f"Waiting pipe number: {i}")
+                df_received = receive_pipes[i].recv()
+                if df is None:
+                    df = df_received
+                else:
+                    df = pd.concat(objs=[df, df_received], ignore_index=True)
+                logger.debug(f"Waiting proc number: {i}")
+                procs[i].join()
+                logger.debug(f"Closing proc number: {i}")
+                receive_pipes[i].close()
+        return df
+
+    @staticmethod
+    def _read_parquet_paths_remote(send_pipe: mp.connection.Connection,
+                                   session_primitives: Any,
+                                   path: Union[str, List[str]],
+                                   columns: Optional[List[str]] = None,
+                                   filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
+                                   procs_cpu_bound: Optional[int] = None):
+        df: pd.DataFrame = Pandas._read_parquet_paths(session_primitives=session_primitives,
+                                                      path=path,
+                                                      columns=columns,
+                                                      filters=filters,
+                                                      procs_cpu_bound=procs_cpu_bound)
+        send_pipe.send(df)
+        send_pipe.close()
+
+    @staticmethod
+    def _read_parquet_paths(session_primitives: Any,
+                            path: Union[str, List[str]],
+                            columns: Optional[List[str]] = None,
+                            filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
+                            procs_cpu_bound: Optional[int] = None) -> pd.DataFrame:
+        """
+        Read parquet data from S3
+
+        :param session_primitives: SessionPrimitives()
+        :param path: AWS S3 path or List of paths (E.g. s3://bucket-name/folder_name/)
+        :param columns: Names of columns to read from the file
+        :param filters: List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
+        :param procs_cpu_bound: Number of cores used for CPU bound tasks
+        """
+        df: pd.DataFrame
+        if (type(path) == str) or (len(path) == 1):
+            path = path[0] if type(path) == list else path  # type: ignore
+            df = Pandas._read_parquet_path(
+                session_primitives=session_primitives,
+                path=path,  # type: ignore
+                columns=columns,
+                filters=filters,
+                procs_cpu_bound=procs_cpu_bound)
+        else:
+            df = Pandas._read_parquet_path(session_primitives=session_primitives,
+                                           path=path[0],
+                                           columns=columns,
+                                           filters=filters,
+                                           procs_cpu_bound=procs_cpu_bound)
+            for p in path[1:]:
+                df_aux = Pandas._read_parquet_path(session_primitives=session_primitives,
+                                                   path=p,
+                                                   columns=columns,
+                                                   filters=filters,
+                                                   procs_cpu_bound=procs_cpu_bound)
+                df = pd.concat(objs=[df, df_aux], ignore_index=True)
+        return df
+
+    @staticmethod
+    def _read_parquet_path(session_primitives: Any,
+                           path: str,
+                           columns: Optional[List[str]] = None,
+                           filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
+                           procs_cpu_bound: Optional[int] = None) -> pd.DataFrame:
+        """
+        Read parquet data from S3
+
+        :param session_primitives: SessionPrimitives()
         :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
         :param columns: Names of columns to read from the file
         :param filters: List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
         :param procs_cpu_bound: Number of cores used for CPU bound tasks
         """
         path = path[:-1] if path[-1] == "/" else path
-        procs_cpu_bound = 1 if self._session.procs_cpu_bound is None else self._session.procs_cpu_bound if procs_cpu_bound is None else procs_cpu_bound
+        procs_cpu_bound = procs_cpu_bound if procs_cpu_bound is not None else session_primitives.procs_cpu_bound if session_primitives.procs_cpu_bound is not None else 1
         use_threads: bool = True if procs_cpu_bound > 1 else False
-        fs: S3FileSystem = s3.get_fs(session_primitives=self._session.primitives)
+        fs: S3FileSystem = s3.get_fs(session_primitives=session_primitives)
         fs = pa.filesystem._ensure_filesystem(fs)
+        logger.debug(f"Reading Parquet table: {path}")
         table = pq.read_table(source=path, columns=columns, filters=filters, filesystem=fs, use_threads=use_threads)
         # Check if we lose some integer during the conversion (Happens when has some null value)
         integers = [field.name for field in table.schema if str(field.type).startswith("int")]
+        logger.debug(f"Converting to Pandas: {path}")
         df = table.to_pandas(use_threads=use_threads, integer_object_nulls=True)
         for c in integers:
             if not str(df[c].dtype).startswith("int"):
                 df[c] = df[c].astype("Int64")
+        logger.debug(f"Done: {path}")
         return df
 
     def read_table(self,
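
The new read_parquet fan-out follows a simple pattern: split the list of paths into per-process chunks, have each worker build a partial DataFrame and send it back through a multiprocessing Pipe, then concatenate the pieces in order. A stripped-down, self-contained sketch of that pattern (chunk_bounds below is a stand-in for awswrangler's calculate_bounders, and the worker just fabricates a small frame instead of reading Parquet):

import multiprocessing as mp
from typing import List, Tuple

import pandas as pd


def chunk_bounds(total: int, chunks: int) -> List[Tuple[int, int]]:
    # Stand-in for calculate_bounders(): split range(total) into at most `chunks` slices.
    size, rest = divmod(total, chunks)
    bounds, start = [], 0
    for i in range(chunks):
        end = start + size + (1 if i < rest else 0)
        if end > start:
            bounds.append((start, end))
        start = end
    return bounds


def worker(send_pipe, paths: List[str]) -> None:
    # In the real code this would read the Parquet files; here we just echo the paths.
    send_pipe.send(pd.DataFrame({"path": paths}))
    send_pipe.close()


if __name__ == "__main__":
    paths = [f"s3://bucket/file_{i}.parquet" for i in range(10)]  # placeholder paths
    procs, pipes = [], []
    for start, end in chunk_bounds(len(paths), chunks=4):
        receive_pipe, send_pipe = mp.Pipe()
        proc = mp.Process(target=worker, args=(send_pipe, paths[start:end]))
        proc.start()
        procs.append(proc)
        pipes.append(receive_pipe)
    # Receive before joining, exactly as read_parquet does, to avoid blocking on full pipes.
    df = pd.concat([pipe.recv() for pipe in pipes], ignore_index=True)
    for proc in procs:
        proc.join()
    print(df)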

awswrangler/session.py

Lines changed: 34 additions & 20 deletions
@@ -48,7 +48,8 @@ def __init__(self,
                  athena_s3_output: Optional[str] = None,
                  athena_encryption: Optional[str] = "SSE_S3",
                  athena_kms_key: Optional[str] = None,
-                 athena_database: str = "default"):
+                 athena_database: str = "default",
+                 athena_ctas_approach: bool = False):
         """
         Most parameters inherit from Boto3 or Pyspark.
         https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
@@ -68,6 +69,7 @@ def __init__(self,
         :param procs_io_bound: number of processes that can be used in single node applications for I/O bound cases (Default: os.cpu_count() * PROCS_IO_BOUND_FACTOR)
         :param athena_workgroup: Default AWS Athena Workgroup (str)
         :param athena_database: AWS Glue/Athena database name
+        :param athena_ctas_approach: Wraps the query with a CTAS
         :param athena_s3_output: AWS S3 path
         :param athena_encryption: None|'SSE_S3'|'SSE_KMS'|'CSE_KMS'
         :param athena_kms_key: For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID.
@@ -92,6 +94,7 @@ def __init__(self,
         self._athena_encryption: Optional[str] = athena_encryption
         self._athena_kms_key: Optional[str] = athena_kms_key
         self._athena_database: str = athena_database
+        self._athena_ctas_approach: bool = athena_ctas_approach
         self._primitives = None
         self._load_new_primitives()
         if boto3_session:
@@ -131,23 +134,22 @@ def _load_new_primitives(self):
         Load or reload a new AWS Wrangler Session primitives
         :return: None
         """
-        self._primitives = SessionPrimitives(
-            profile_name=self._profile_name,
-            aws_access_key_id=self._aws_access_key_id,
-            aws_secret_access_key=self._aws_secret_access_key,
-            aws_session_token=self._aws_session_token,
-            region_name=self._region_name,
-            botocore_max_retries=self._botocore_max_retries,
-            s3_additional_kwargs=self._s3_additional_kwargs,
-            botocore_config=self._botocore_config,
-            procs_cpu_bound=self._procs_cpu_bound,
-            procs_io_bound=self._procs_io_bound,
-            athena_workgroup=self._athena_workgroup,
-            athena_s3_output=self._athena_s3_output,
-            athena_encryption=self._athena_encryption,
-            athena_kms_key=self._athena_kms_key,
-            athena_database=self._athena_database,
-        )
+        self._primitives = SessionPrimitives(profile_name=self._profile_name,
+                                             aws_access_key_id=self._aws_access_key_id,
+                                             aws_secret_access_key=self._aws_secret_access_key,
+                                             aws_session_token=self._aws_session_token,
+                                             region_name=self._region_name,
+                                             botocore_max_retries=self._botocore_max_retries,
+                                             s3_additional_kwargs=self._s3_additional_kwargs,
+                                             botocore_config=self._botocore_config,
+                                             procs_cpu_bound=self._procs_cpu_bound,
+                                             procs_io_bound=self._procs_io_bound,
+                                             athena_workgroup=self._athena_workgroup,
+                                             athena_s3_output=self._athena_s3_output,
+                                             athena_encryption=self._athena_encryption,
+                                             athena_kms_key=self._athena_kms_key,
+                                             athena_database=self._athena_database,
+                                             athena_ctas_approach=self._athena_ctas_approach)
 
     @property
     def profile_name(self):
@@ -217,6 +219,10 @@ def athena_kms_key(self) -> Optional[str]:
     def athena_database(self) -> str:
         return self._athena_database
 
+    @property
+    def athena_ctas_approach(self) -> bool:
+        return self._athena_ctas_approach
+
     @property
     def boto3_session(self):
         return self._boto3_session
@@ -297,7 +303,8 @@ def __init__(self,
                  athena_s3_output: Optional[str] = None,
                  athena_encryption: Optional[str] = None,
                  athena_kms_key: Optional[str] = None,
-                 athena_database: Optional[str] = None):
+                 athena_database: Optional[str] = None,
+                 athena_ctas_approach: bool = False):
         """
         Most parameters inherit from Boto3.
         https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
@@ -314,6 +321,7 @@ def __init__(self,
         :param procs_io_bound: number of processes that can be used in single node applications for I/O bound cases (Default: os.cpu_count() * PROCS_IO_BOUND_FACTOR)
         :param athena_workgroup: Default AWS Athena Workgroup (str)
         :param athena_database: AWS Glue/Athena database name
+        :param athena_ctas_approach: Wraps the query with a CTAS
         :param athena_s3_output: AWS S3 path
         :param athena_encryption: None|'SSE_S3'|'SSE_KMS'|'CSE_KMS'
         :param athena_kms_key: For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID.
@@ -333,6 +341,7 @@ def __init__(self,
         self._athena_encryption: Optional[str] = athena_encryption
         self._athena_kms_key: Optional[str] = athena_kms_key
         self._athena_database: Optional[str] = athena_database
+        self._athena_ctas_approach: bool = athena_ctas_approach
 
     @property
     def profile_name(self):
@@ -394,6 +403,10 @@ def athena_kms_key(self) -> Optional[str]:
     def athena_database(self) -> Optional[str]:
         return self._athena_database
 
+    @property
+    def athena_ctas_approach(self) -> bool:
+        return self._athena_ctas_approach
+
     @property
     def session(self):
         """
@@ -413,4 +426,5 @@ def session(self):
                        athena_s3_output=self._athena_s3_output,
                        athena_encryption=self._athena_encryption,
                        athena_kms_key=self._athena_kms_key,
-                       athena_database=self._athena_database)
+                       athena_database=self._athena_database,
+                       athena_ctas_approach=self._athena_ctas_approach)
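
Taken together with the pandas.py change above, the resolution order for the flag is: an explicit ctas_approach argument wins, otherwise the session-level default applies, otherwise False. A brief sketch of how that plays out (the top-level Session import and the table/database names are placeholders):

import awswrangler

session = awswrangler.Session(athena_ctas_approach=True)  # new session-level default

# Uses the session default: the query is CTAS-wrapped and read via the manifest.
df1 = session.pandas.read_sql_athena(sql="SELECT * FROM my_table", database="my_db")

# An explicit argument overrides the session default for this call only; note that
# max_result_size is documented as valid only when ctas_approach is False.
df2 = session.pandas.read_sql_athena(sql="SELECT * FROM my_table",
                                     database="my_db",
                                     ctas_approach=False,
                                     max_result_size=128_000_000)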

requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 numpy~=1.17.4
 pandas~=0.25.3
 pyarrow~=0.15.1
-botocore~=1.13.35
-boto3~=1.10.35
+botocore~=1.13.36
+boto3~=1.10.36
 s3fs~=0.4.0
 tenacity~=6.0.0
 pg8000~=1.13.2

testing/test_awswrangler/test_pandas.py

Lines changed: 24 additions & 0 deletions
@@ -1509,5 +1509,29 @@ def test_read_sql_athena_ctas(session, bucket, database):
                               procs_cpu_bound=4,
                               partition_cols=["partition"])
     df2 = session.pandas.read_sql_athena(ctas_approach=True, sql="select * from test", database=database)
+    session.s3.delete_objects(path=path)
     assert len(list(df.columns)) == len(list(df2.columns))
     assert len(df.index) == len(df2.index)
+
+
+def test_read_sql_athena_s3_output_ctas(session, bucket, database):
+    n: int = 1_000_000
+    df = pd.DataFrame({"id": list((range(n))), "partition": list(["foo" if i % 2 == 0 else "boo" for i in range(n)])})
+    path = f"s3://{bucket}/test_read_sql_athena_s3_output_ctas/"
+    session.pandas.to_parquet(dataframe=df,
+                              database=database,
+                              table="test",
+                              path=path,
+                              mode="overwrite",
+                              preserve_index=True,
+                              procs_cpu_bound=4,
+                              partition_cols=["partition"])
+    path_ctas = f"s3://{bucket}/test_read_sql_athena_s3_output_ctas_metadata/"
+    df2 = session.pandas.read_sql_athena(ctas_approach=True,
+                                         sql="select * from test",
+                                         database=database,
+                                         s3_output=path_ctas)
+    session.s3.delete_objects(path=path)
+    assert len(list(df.columns)) + 1 == len(list(df2.columns))
+    assert len(df.index) == len(df2.index)
+    print(df2)
