|
3 | 3 | import logging |
4 | 4 | import pprint |
5 | 5 | import time |
| 6 | +import uuid |
6 | 7 | import warnings |
7 | 8 | from decimal import Decimal |
8 | | -from typing import Any, Dict, Generator, List, NamedTuple, Optional, Union, cast |
| 9 | +from typing import Any, Dict, Generator, List, NamedTuple, Optional, Tuple, Union, cast |
9 | 10 |
|
10 | 11 | import boto3 |
11 | 12 | import botocore.exceptions |
12 | 13 | import pandas as pd |
13 | 14 |
|
14 | | -from awswrangler import _data_types, _utils, exceptions, s3, sts |
| 15 | +from awswrangler import _data_types, _utils, catalog, exceptions, s3, sts |
15 | 16 | from awswrangler._config import apply_configs |
16 | 17 |
|
17 | 18 | from ._cache import _cache_manager, _CacheInfo, _check_for_cached_results, _LocalMetadataCacheManager |
@@ -640,6 +641,143 @@ def describe_table( |
640 | 641 | return _parse_describe_table(raw_result) |
641 | 642 |
|
642 | 643 |
|
@apply_configs
def create_ctas_table(
    sql: str,
    database: str,
    ctas_table: Optional[str] = None,
    ctas_database: Optional[str] = None,
    s3_output: Optional[str] = None,
    storage_format: Optional[str] = None,
    write_compression: Optional[str] = None,
    partitioning_info: Optional[List[str]] = None,
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    field_delimiter: Optional[str] = None,
    schema_only: bool = False,
    workgroup: Optional[str] = None,
    data_source: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, str]:
    """Create a new table populated with the results of a SELECT query.

    https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html

    Parameters
    ----------
    sql : str
        SELECT SQL query.
    database : str
        The name of the database where the original table is stored.
    ctas_table : Optional[str], optional
        The name of the CTAS table.
        If None, a random string is used.
    ctas_database : Optional[str], optional
        The name of the alternative database where the CTAS table should be stored.
        If None, `database` is used, that is the CTAS table is stored in the same database as the original table.
    s3_output : Optional[str], optional
        The output Amazon S3 path.
        If None, either the Athena workgroup or client-side location setting is used.
        If a workgroup enforces a query results location, then it overrides this argument.
    storage_format : Optional[str], optional
        The storage format for the CTAS query results, such as ORC, PARQUET, AVRO, JSON, or TEXTFILE.
        PARQUET by default.
    write_compression : Optional[str], optional
        The compression type to use for any storage format that allows compression to be specified.
    partitioning_info : Optional[List[str]], optional
        A list of columns by which the CTAS table will be partitioned.
    bucketing_info : Optional[Tuple[List[str], int]], optional
        Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the
        second element.
        Only `str`, `int` and `bool` are supported as column data types for bucketing.
    field_delimiter : Optional[str], optional
        The single-character field delimiter for files in CSV, TSV, and text files.
    schema_only : bool, optional
        If True, the query is run with `WITH NO DATA`, so only the table schema is created
        and no data is written. False by default.
    workgroup : Optional[str], optional
        Athena workgroup.
    data_source : Optional[str], optional
        Data Source / Catalog name. If None, 'AwsDataCatalog' is used.
    encryption : Optional[str], optional
        Valid values: [None, 'SSE_S3', 'SSE_KMS']. Note: 'CSE_KMS' is not supported.
    kms_key : Optional[str], optional
        For SSE-KMS, this is the KMS key ARN or ID.
    boto3_session : Optional[boto3.Session], optional
        Boto3 Session. The default boto3 session is used if boto3_session is None.

    Returns
    -------
    Dict[str, str]
        A dictionary with the ID of the query, and the CTAS database and table names
    """
    # Fall back to a random, collision-safe table name when the caller does not provide one.
    ctas_table = catalog.sanitize_table_name(ctas_table) if ctas_table else f"temp_table_{uuid.uuid4().hex}"
    ctas_database = ctas_database if ctas_database else database
    fully_qualified_name = f'"{ctas_database}"."{ctas_table}"'

    wg_config: _WorkGroupConfig = _get_workgroup_config(session=boto3_session, workgroup=workgroup)
    s3_output = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=boto3_session)
    # Strip a single trailing slash so the external_location path is not doubled below.
    s3_output = s3_output[:-1] if s3_output[-1] == "/" else s3_output
    # If the workgroup enforces an external location, then it overrides the user supplied argument
    external_location_str: str = (
        f"    external_location = '{s3_output}/{ctas_table}',\n" if (not wg_config.enforced) and (s3_output) else ""
    )

    # At least one property must be specified within `WITH()` in the query. We default to `PARQUET` for `storage_format`
    storage_format_str: str = f"""    format = '{storage_format.upper() if storage_format else "PARQUET"}'"""
    write_compression_str: str = (
        f"    write_compression = '{write_compression.upper()}',\n" if write_compression else ""
    )
    partitioning_str: str = f"    partitioned_by = ARRAY{partitioning_info},\n" if partitioning_info else ""
    bucketing_str: str = (
        f"    bucketed_by = ARRAY{bucketing_info[0]},\n    bucket_count = {bucketing_info[1]},\n"
        if bucketing_info
        else ""
    )
    field_delimiter_str: str = f"    field_delimiter = '{field_delimiter}',\n" if field_delimiter else ""
    schema_only_str: str = "\nWITH NO DATA" if schema_only else ""

    # `storage_format_str` must come last: it is the only clause without a trailing comma.
    ctas_sql = (
        f"CREATE TABLE {fully_qualified_name}\n"
        f"WITH(\n"
        f"{external_location_str}"
        f"{partitioning_str}"
        f"{bucketing_str}"
        f"{field_delimiter_str}"
        f"{write_compression_str}"
        f"{storage_format_str}"
        f")\n"
        f"AS {sql}"
        f"{schema_only_str}"
    )
    _logger.debug("ctas sql: %s", ctas_sql)

    try:
        query_id: str = _start_query_execution(
            sql=ctas_sql,
            wg_config=wg_config,
            database=database,
            data_source=data_source,
            s3_output=s3_output,
            workgroup=workgroup,
            encryption=encryption,
            kms_key=kms_key,
            boto3_session=boto3_session,
        )
    except botocore.exceptions.ClientError as ex:
        error: Dict[str, Any] = ex.response["Error"]
        # Athena rejects queries that cannot be wrapped in a CTAS statement with either of
        # these two parser messages; surface both as the same, more actionable exception.
        if error["Code"] == "InvalidRequestException" and any(
            phrase in error["Message"] for phrase in ("Exception parsing query", "extraneous input")
        ):
            raise exceptions.InvalidCtasApproachQuery(
                f"It is not possible to wrap this query into a CTAS statement. Root error message: {error['Message']}"
            ) from ex
        raise ex
    return {"ctas_database": ctas_database, "ctas_table": ctas_table, "ctas_query_id": query_id}
| 779 | + |
| 780 | + |
643 | 781 | @apply_configs |
644 | 782 | def show_create_table( |
645 | 783 | table: str, |
|
0 commit comments