
Commit 728c5ec

Add create_ctas_table to Athena module (#1207)
* Add create_ctas_table to Athena module
* Fix is_parquet_format regex
* Add Athena test create_ctas_table
1 parent 1364615 commit 728c5ec
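
For orientation, here is a minimal usage sketch of the API this commit adds. The database name, table name, query, and S3 path are placeholders; the keyword arguments and the returned keys come from the `create_ctas_table` signature in the diff below.

import awswrangler as wr

# Minimal usage sketch of the new function (placeholder names throughout).
# create_ctas_table wraps the SELECT in a CTAS statement and starts the query.
ctas_query_info = wr.athena.create_ctas_table(
    sql="SELECT * FROM my_table",       # any SELECT query
    database="my_db",                   # database holding the source table
    write_compression="snappy",         # optional; format defaults to PARQUET
    s3_output="s3://my-bucket/ctas/",   # optional; an enforcing workgroup overrides it
)

# The returned dict carries the CTAS location in the catalog and the query ID.
print(ctas_query_info["ctas_database"], ctas_query_info["ctas_table"], ctas_query_info["ctas_query_id"])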

File tree

7 files changed (+221, -48 lines)


awswrangler/athena/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,7 @@
 from awswrangler.athena._read import read_sql_query, read_sql_table, unload  # noqa
 from awswrangler.athena._utils import (  # noqa
     create_athena_bucket,
+    create_ctas_table,
     describe_table,
     get_named_query_statement,
     get_query_columns_types,
@@ -25,6 +26,7 @@
     "get_named_query_statement",
     "get_work_group",
     "repair_table",
+    "create_ctas_table",
     "show_create_table",
     "start_query_execution",
     "stop_query_execution",

awswrangler/athena/_cache.py

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ def max_cache_size(self, value: int) -> None:
 def _parse_select_query_from_possible_ctas(possible_ctas: str) -> Optional[str]:
     """Check if `possible_ctas` is a valid parquet-generating CTAS and returns the full SELECT statement."""
     possible_ctas = possible_ctas.lower()
-    parquet_format_regex: str = r"format\s*=\s*\'parquet\'\s*,"
+    parquet_format_regex: str = r"format\s*=\s*\'parquet\'\s*"
     is_parquet_format: Optional[Match[str]] = re.search(pattern=parquet_format_regex, string=possible_ctas)
     if is_parquet_format is not None:
         unstripped_select_statement_regex: str = r"\s+as\s+\(*(select|with).*"
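
The regex change matters because `create_ctas_table` (added below) emits the `format` property last in the `WITH(...)` clause, with no trailing comma, while the old pattern only matched when a comma followed. A small runnable sketch of the difference:

import re

# Sketch of the fix: the old pattern demanded a trailing comma after the
# format property, so a CTAS whose WITH(...) clause ends with format = 'parquet'
# was not recognized as parquet-generating. The query string is illustrative.
old_regex = r"format\s*=\s*\'parquet\'\s*,"
new_regex = r"format\s*=\s*\'parquet\'\s*"
ctas = "create table t with( external_location = 's3://bucket/t', format = 'parquet') as select 1"

assert re.search(old_regex, ctas) is None       # old pattern: no match
assert re.search(new_regex, ctas) is not None   # fixed pattern: match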

awswrangler/athena/_read.py

Lines changed: 16 additions & 44 deletions
@@ -22,6 +22,7 @@
     _QueryMetadata,
     _start_query_execution,
     _WorkGroupConfig,
+    create_ctas_table,
 )

 from ._cache import _cache_manager, _CacheInfo, _check_for_cached_results
@@ -251,7 +252,6 @@ def _resolve_query_without_cache_ctas(
     encryption: Optional[str],
     workgroup: Optional[str],
     kms_key: Optional[str],
-    wg_config: _WorkGroupConfig,
     alt_database: Optional[str],
     name: Optional[str],
     ctas_bucketing_info: Optional[Tuple[List[str], int]],
@@ -260,52 +260,25 @@ def _resolve_query_without_cache_ctas(
     boto3_session: boto3.Session,
     pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
-    path: str = f"{s3_output}/{name}"
-    ext_location: str = "\n" if wg_config.enforced is True else f",\n    external_location = '{path}'\n"
     fully_qualified_name: str = f'"{alt_database}"."{name}"' if alt_database else f'"{database}"."{name}"'
-    bucketing_str = (
-        (f",\n" f"    bucketed_by = ARRAY{ctas_bucketing_info[0]},\n" f"    bucket_count = {ctas_bucketing_info[1]}")
-        if ctas_bucketing_info
-        else ""
-    )
-    sql = (
-        f"CREATE TABLE {fully_qualified_name}\n"
-        f"WITH(\n"
-        f"    format = 'Parquet',\n"
-        f"    parquet_compression = 'SNAPPY'"
-        f"{bucketing_str}"
-        f"{ext_location}"
-        f") AS\n"
-        f"{sql}"
+    ctas_query_info: Dict[str, str] = create_ctas_table(
+        sql=sql,
+        database=database,
+        ctas_table=name,
+        ctas_database=alt_database,
+        bucketing_info=ctas_bucketing_info,
+        data_source=data_source,
+        s3_output=s3_output,
+        workgroup=workgroup,
+        encryption=encryption,
+        kms_key=kms_key,
+        boto3_session=boto3_session,
     )
-    _logger.debug("sql: %s", sql)
-    try:
-        query_id: str = _start_query_execution(
-            sql=sql,
-            wg_config=wg_config,
-            database=database,
-            data_source=data_source,
-            s3_output=s3_output,
-            workgroup=workgroup,
-            encryption=encryption,
-            kms_key=kms_key,
-            boto3_session=boto3_session,
-        )
-    except botocore.exceptions.ClientError as ex:
-        error: Dict[str, Any] = ex.response["Error"]
-        if error["Code"] == "InvalidRequestException" and "Exception parsing query" in error["Message"]:
-            raise exceptions.InvalidCtasApproachQuery(
-                "Is not possible to wrap this query into a CTAS statement. Please use ctas_approach=False."
-            )
-        if error["Code"] == "InvalidRequestException" and "extraneous input" in error["Message"]:
-            raise exceptions.InvalidCtasApproachQuery(
-                "Is not possible to wrap this query into a CTAS statement. Please use ctas_approach=False."
-            )
-        raise ex
-    _logger.debug("query_id: %s", query_id)
+    ctas_query_id: str = ctas_query_info["ctas_query_id"]
+    _logger.debug("ctas_query_id: %s", ctas_query_id)
     try:
         query_metadata: _QueryMetadata = _get_query_metadata(
-            query_execution_id=query_id,
+            query_execution_id=ctas_query_id,
             boto3_session=boto3_session,
             categories=categories,
             metadata_cache_manager=_cache_manager,
@@ -482,7 +455,6 @@ def _resolve_query_without_cache(
         encryption=encryption,
         workgroup=workgroup,
         kms_key=kms_key,
-        wg_config=wg_config,
         alt_database=ctas_database_name,
         name=name,
         ctas_bucketing_info=ctas_bucketing_info,
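
The public entry point is unchanged by this refactor: `ctas_approach=True` now routes through `create_ctas_table` internally instead of assembling the CTAS SQL inline. A hedged reminder of the caller-facing API (names are placeholders):

import awswrangler as wr

# ctas_approach=True wraps the query in a CTAS behind the scenes; after this
# commit the wrapping is delegated to wr.athena.create_ctas_table.
df = wr.athena.read_sql_query(
    sql="SELECT * FROM my_table",  # placeholder query
    database="my_db",              # placeholder database
    ctas_approach=True,
)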

awswrangler/athena/_utils.py

Lines changed: 140 additions & 2 deletions
@@ -3,15 +3,16 @@
 import logging
 import pprint
 import time
+import uuid
 import warnings
 from decimal import Decimal
-from typing import Any, Dict, Generator, List, NamedTuple, Optional, Union, cast
+from typing import Any, Dict, Generator, List, NamedTuple, Optional, Tuple, Union, cast

 import boto3
 import botocore.exceptions
 import pandas as pd

-from awswrangler import _data_types, _utils, exceptions, s3, sts
+from awswrangler import _data_types, _utils, catalog, exceptions, s3, sts
 from awswrangler._config import apply_configs

 from ._cache import _cache_manager, _CacheInfo, _check_for_cached_results, _LocalMetadataCacheManager
@@ -640,6 +641,143 @@ def describe_table(
     return _parse_describe_table(raw_result)


+@apply_configs
+def create_ctas_table(
+    sql: str,
+    database: str,
+    ctas_table: Optional[str] = None,
+    ctas_database: Optional[str] = None,
+    s3_output: Optional[str] = None,
+    storage_format: Optional[str] = None,
+    write_compression: Optional[str] = None,
+    partitioning_info: Optional[List[str]] = None,
+    bucketing_info: Optional[Tuple[List[str], int]] = None,
+    field_delimiter: Optional[str] = None,
+    schema_only: bool = False,
+    workgroup: Optional[str] = None,
+    data_source: Optional[str] = None,
+    encryption: Optional[str] = None,
+    kms_key: Optional[str] = None,
+    boto3_session: Optional[boto3.Session] = None,
+) -> Dict[str, str]:
+    """Create a new table populated with the results of a SELECT query.
+
+    https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html
+
+    Parameters
+    ----------
+    sql : str
+        SELECT SQL query.
+    database : str
+        The name of the database where the original table is stored.
+    ctas_table : Optional[str], optional
+        The name of the CTAS table.
+        If None, a random string is used.
+    ctas_database : Optional[str], optional
+        The name of the alternative database where the CTAS table should be stored.
+        If None, `database` is used, that is the CTAS table is stored in the same database as the original table.
+    s3_output : Optional[str], optional
+        The output Amazon S3 path.
+        If None, either the Athena workgroup or client-side location setting is used.
+        If a workgroup enforces a query results location, then it overrides this argument.
+    storage_format : Optional[str], optional
+        The storage format for the CTAS query results, such as ORC, PARQUET, AVRO, JSON, or TEXTFILE.
+        PARQUET by default.
+    write_compression : Optional[str], optional
+        The compression type to use for any storage format that allows compression to be specified.
+    partitioning_info : Optional[List[str]], optional
+        A list of columns by which the CTAS table will be partitioned.
+    bucketing_info : Optional[Tuple[List[str], int]], optional
+        Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the
+        second element.
+        Only `str`, `int` and `bool` are supported as column data types for bucketing.
+    field_delimiter : Optional[str], optional
+        The single-character field delimiter for files in CSV, TSV, and text files.
+    schema_only : bool, optional
+        If True, the table is created with `WITH NO DATA`, i.e. only the schema is written, by default False.
+    workgroup : Optional[str], optional
+        Athena workgroup.
+    data_source : Optional[str], optional
+        Data Source / Catalog name. If None, 'AwsDataCatalog' is used.
+    encryption : str, optional
+        Valid values: [None, 'SSE_S3', 'SSE_KMS']. Note: 'CSE_KMS' is not supported.
+    kms_key : str, optional
+        For SSE-KMS, this is the KMS key ARN or ID.
+    boto3_session : Optional[boto3.Session], optional
+        Boto3 Session. The default boto3 session is used if boto3_session is None.
+
+    Returns
+    -------
+    Dict[str, str]
+        A dictionary with the ID of the query, and the CTAS database and table names.
+    """
+    ctas_table = catalog.sanitize_table_name(ctas_table) if ctas_table else f"temp_table_{uuid.uuid4().hex}"
+    ctas_database = ctas_database if ctas_database else database
+    fully_qualified_name = f'"{ctas_database}"."{ctas_table}"'
+
+    wg_config: _WorkGroupConfig = _get_workgroup_config(session=boto3_session, workgroup=workgroup)
+    s3_output = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=boto3_session)
+    s3_output = s3_output[:-1] if s3_output[-1] == "/" else s3_output
+    # If the workgroup enforces an external location, then it overrides the user supplied argument
+    external_location_str: str = (
+        f"    external_location = '{s3_output}/{ctas_table}',\n" if (not wg_config.enforced) and (s3_output) else ""
+    )
+
+    # At least one property must be specified within `WITH()` in the query. We default to `PARQUET` for `storage_format`
+    storage_format_str: str = f"""    format = '{storage_format.upper() if storage_format else "PARQUET"}'"""
+    write_compression_str: str = (
+        f"    write_compression = '{write_compression.upper()}',\n" if write_compression else ""
+    )
+    partitioning_str: str = f"    partitioned_by = ARRAY{partitioning_info},\n" if partitioning_info else ""
+    bucketing_str: str = (
+        f"    bucketed_by = ARRAY{bucketing_info[0]},\n    bucket_count = {bucketing_info[1]},\n"
+        if bucketing_info
+        else ""
+    )
+    field_delimiter_str: str = f"    field_delimiter = '{field_delimiter}',\n" if field_delimiter else ""
+    schema_only_str: str = "\nWITH NO DATA" if schema_only else ""
+
+    ctas_sql = (
+        f"CREATE TABLE {fully_qualified_name}\n"
+        f"WITH(\n"
+        f"{external_location_str}"
+        f"{partitioning_str}"
+        f"{bucketing_str}"
+        f"{field_delimiter_str}"
+        f"{write_compression_str}"
+        f"{storage_format_str}"
+        f")\n"
+        f"AS {sql}"
+        f"{schema_only_str}"
+    )
+    _logger.debug("ctas sql: %s", ctas_sql)
+
+    try:
+        query_id: str = _start_query_execution(
+            sql=ctas_sql,
+            wg_config=wg_config,
+            database=database,
+            data_source=data_source,
+            s3_output=s3_output,
+            workgroup=workgroup,
+            encryption=encryption,
+            kms_key=kms_key,
+            boto3_session=boto3_session,
+        )
+    except botocore.exceptions.ClientError as ex:
+        error: Dict[str, Any] = ex.response["Error"]
+        if error["Code"] == "InvalidRequestException" and "Exception parsing query" in error["Message"]:
+            raise exceptions.InvalidCtasApproachQuery(
+                f"It is not possible to wrap this query into a CTAS statement. Root error message: {error['Message']}"
+            )
+        if error["Code"] == "InvalidRequestException" and "extraneous input" in error["Message"]:
+            raise exceptions.InvalidCtasApproachQuery(
+                f"It is not possible to wrap this query into a CTAS statement. Root error message: {error['Message']}"
+            )
+        raise ex
+    return {"ctas_database": ctas_database, "ctas_table": ctas_table, "ctas_query_id": query_id}
+
+
 @apply_configs
 def show_create_table(
     table: str,
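
To make the string assembly above concrete, this standalone sketch reproduces the `WITH(...)` construction for one illustrative combination of arguments (bucketing plus an explicit output path); it makes no AWS calls and every name is a placeholder.

# Standalone sketch mirroring the WITH(...) assembly in create_ctas_table,
# for one illustrative set of arguments; all names below are placeholders.
bucketing_info = (["col0"], 2)
s3_output = "s3://my-bucket/ctas"
ctas_table = "temp_table_abc123"

external_location_str = f"    external_location = '{s3_output}/{ctas_table}',\n"
bucketing_str = f"    bucketed_by = ARRAY{bucketing_info[0]},\n    bucket_count = {bucketing_info[1]},\n"
storage_format_str = "    format = 'PARQUET'"  # the default when storage_format is None

ctas_sql = (
    f'CREATE TABLE "my_db"."{ctas_table}"\n'
    "WITH(\n"
    f"{external_location_str}"
    f"{bucketing_str}"
    f"{storage_format_str}"
    ")\n"
    "AS SELECT * FROM my_table"
)
print(ctas_sql)  # shows the exact CTAS statement Athena would receive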

docs/source/api.rst

Lines changed: 1 addition & 0 deletions
@@ -111,6 +111,7 @@ Amazon Athena
     :toctree: stubs

     create_athena_bucket
+    create_ctas_table
     get_query_columns_types
     get_query_execution
     get_named_query_statement

tests/_utils.py

Lines changed: 9 additions & 1 deletion
@@ -2,7 +2,7 @@
 import time
 from datetime import datetime
 from decimal import Decimal
-from typing import Iterator
+from typing import Dict, Iterator

 import boto3
 import botocore.exceptions
@@ -501,6 +501,14 @@ def ensure_data_types_csv(df, governed=False):
     assert str(df["par1"].dtype) == "string"


+def ensure_athena_ctas_table(ctas_query_info: Dict[str, str], boto3_session: boto3.Session) -> None:
+    query_metadata = wr.athena._utils._get_query_metadata(
+        query_execution_id=ctas_query_info["ctas_query_id"], boto3_session=boto3_session
+    )
+    assert query_metadata.raw_payload["Status"]["State"] == "SUCCEEDED"
+    wr.catalog.delete_table_if_exists(table=ctas_query_info["ctas_table"], database=ctas_query_info["ctas_database"])
+
+
 def ensure_athena_query_metadata(df, ctas_approach=True, encrypted=False):
     assert df.query_metadata is not None
     assert isinstance(df.query_metadata, dict)

tests/test_athena.py

Lines changed: 52 additions & 0 deletions
@@ -10,6 +10,7 @@
 import awswrangler as wr

 from ._utils import (
+    ensure_athena_ctas_table,
     ensure_athena_query_metadata,
     ensure_data_types,
     ensure_data_types_category,
@@ -148,6 +149,57 @@ def test_athena_read_sql_ctas_bucketing(path, path2, glue_table, glue_table2, gl
     assert df_ctas.equals(df_no_ctas)


+def test_athena_create_ctas(path, glue_table, glue_table2, glue_database, glue_ctas_database, kms_key):
+    boto3_session = boto3.DEFAULT_SESSION
+    wr.s3.to_parquet(
+        df=get_df_list(),
+        path=path,
+        index=False,
+        use_threads=True,
+        dataset=True,
+        mode="overwrite",
+        database=glue_database,
+        table=glue_table,
+        partition_cols=["par0", "par1"],
+    )
+
+    # Select *
+    ctas_query_info = wr.athena.create_ctas_table(
+        sql=f"select * from {glue_table}",
+        database=glue_database,
+        encryption="SSE_KMS",
+        kms_key=kms_key,
+    )
+    ensure_athena_ctas_table(ctas_query_info=ctas_query_info, boto3_session=boto3_session)
+
+    # Schema only (i.e. WITH NO DATA)
+    ctas_query_info = wr.athena.create_ctas_table(
+        sql=f"select * from {glue_table}",
+        database=glue_database,
+        ctas_table=glue_table2,
+        schema_only=True,
+    )
+    ensure_athena_ctas_table(ctas_query_info=ctas_query_info, boto3_session=boto3_session)
+
+    # Convert to new data storage and compression
+    ctas_query_info = wr.athena.create_ctas_table(
+        sql=f"select string, bool from {glue_table}",
+        database=glue_database,
+        storage_format="avro",
+        write_compression="snappy",
+    )
+    ensure_athena_ctas_table(ctas_query_info=ctas_query_info, boto3_session=boto3_session)
+
+    # Partition and save to CTAS database
+    ctas_query_info = wr.athena.create_ctas_table(
+        sql=f"select * from {glue_table}",
+        database=glue_database,
+        ctas_database=glue_ctas_database,
+        partitioning_info=["par0", "par1"],
+    )
+    ensure_athena_ctas_table(ctas_query_info=ctas_query_info, boto3_session=boto3_session)
+
+
 def test_athena(path, glue_database, glue_table, kms_key, workgroup0, workgroup1):
     wr.catalog.delete_table_if_exists(database=glue_database, table=glue_table)
     wr.s3.to_parquet(
