feat: Iceberg PARTITIONED BY and additional table properties support (#2322)

kukushking · web-flow · commit 9f4d83d67a0a · 2023-06-05T15:02:02.000+01:00
* feat: Iceberg `PARTITIONED BY` support

* Additional table properties
diff --git a/awswrangler/athena/_write_iceberg.py b/awswrangler/athena/_write_iceberg.py
@@ -2,7 +2,7 @@
 
 import logging
 import uuid
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
 
 import boto3
 import pandas as pd
@@ -25,6 +25,8 @@ def _create_iceberg_table(
     table: str,
     path: str,
     wg_config: _WorkGroupConfig,
+    partition_cols: Optional[List[str]],
+    additional_table_properties: Optional[Dict[str, Any]],
     index: bool = False,
     data_source: Optional[str] = None,
     workgroup: Optional[str] = None,
@@ -37,11 +39,18 @@ def _create_iceberg_table(
 
     columns_types, _ = catalog.extract_athena_types(df=df, index=index)
     cols_str: str = ", ".join([f"{k} {v}" for k, v in columns_types.items()])
+    partition_cols_str: str = f"PARTITIONED BY ({', '.join([col for col in partition_cols])})" if partition_cols else ""
+    table_properties_str: str = (
+        ", " + ", ".join([f"'{key}'='{value}'" for key, value in additional_table_properties.items()])
+        if additional_table_properties
+        else ""
+    )
 
     create_sql: str = (
         f"CREATE TABLE IF NOT EXISTS {table} ({cols_str}) "
+        f"{partition_cols_str} "
         f"LOCATION '{path}' "
-        f"TBLPROPERTIES ( 'table_type' ='ICEBERG', 'format'='parquet' )"
+        f"TBLPROPERTIES ('table_type' ='ICEBERG', 'format'='parquet'{table_properties_str})"
     )
 
     query_id: str = _start_query_execution(
@@ -68,13 +77,15 @@ def to_iceberg(
     temp_path: Optional[str] = None,
     index: bool = False,
     table_location: Optional[str] = None,
+    partition_cols: Optional[List[str]] = None,
     keep_files: bool = True,
     data_source: Optional[str] = None,
     workgroup: Optional[str] = None,
     encryption: Optional[str] = None,
     kms_key: Optional[str] = None,
     boto3_session: Optional[boto3.Session] = None,
     s3_additional_kwargs: Optional[Dict[str, Any]] = None,
+    additional_table_properties: Optional[Dict[str, Any]] = None,
 ) -> None:
     """
     Insert into Athena Iceberg table using INSERT INTO ... SELECT. Will create Iceberg table if it does not exist.
@@ -97,6 +108,11 @@ def to_iceberg(
         Should consider the DataFrame index as a column?.
     table_location : str, optional
         Amazon S3 location for the table. Will only be used to create a new table if it does not exist.
+    partition_cols: List[str], optional
+        List of column names that will be used to create partitions, including support for transform
+        functions (e.g. "day(ts)").
+
+        https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-creating-tables.html#querying-iceberg-partitioning
     keep_files : bool
         Whether staging files produced by Athena are retained. 'True' by default.
     data_source : str, optional
@@ -112,6 +128,11 @@ def to_iceberg(
     s3_additional_kwargs : Optional[Dict[str, Any]]
         Forwarded to botocore requests.
         e.g. s3_additional_kwargs={'RequestPayer': 'requester'}
+    additional_table_properties : Optional[Dict[str, Any]]
+        Additional table properties.
+        e.g. additional_table_properties={'write_target_data_file_size_bytes': '536870912'}
+
+        https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-creating-tables.html#querying-iceberg-table-properties
 
     Returns
     -------
@@ -163,6 +184,8 @@ def to_iceberg(
                 table=table,
                 path=table_location,  # type: ignore[arg-type]
                 wg_config=wg_config,
+                partition_cols=partition_cols,
+                additional_table_properties=additional_table_properties,
                 index=index,
                 data_source=data_source,
                 workgroup=workgroup,
diff --git a/tests/unit/test_athena.py b/tests/unit/test_athena.py
@@ -23,6 +23,7 @@
     get_df_txt,
     get_time_str_with_random_suffix,
     pandas_equals,
+    ts,
 )
 
 logging.getLogger("awswrangler").setLevel(logging.DEBUG)
@@ -1403,8 +1404,19 @@ def test_athena_date_recovery(path, glue_database, glue_table):
     assert pandas_equals(df, df2)
 
 
-def test_athena_to_iceberg(path, path2, glue_database, glue_table):
-    df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
+@pytest.mark.parametrize("partition_cols", [None, ["name"], ["name", "day(ts)"]])
+@pytest.mark.parametrize(
+    "additional_table_properties",
+    [None, {"write_target_data_file_size_bytes": 536870912, "optimize_rewrite_delete_file_threshold": 10}],
+)
+def test_athena_to_iceberg(path, path2, glue_database, glue_table, partition_cols, additional_table_properties):
+    df = pd.DataFrame(
+        {
+            "id": [1, 2, 3],
+            "name": ["a", "b", "c"],
+            "ts": [ts("2020-01-01 00:00:00.0"), ts("2020-01-02 00:00:01.0"), ts("2020-01-03 00:00:00.0")],
+        }
+    )
     df["id"] = df["id"].astype("Int64")  # Cast as nullable int64 type
     df["name"] = df["name"].astype("string")
 
@@ -1414,6 +1426,8 @@ def test_athena_to_iceberg(path, path2, glue_database, glue_table):
         table=glue_table,
         table_location=path,
         temp_path=path2,
+        partition_cols=partition_cols,
+        additional_table_properties=additional_table_properties,
     )
 
     df_out = wr.athena.read_sql_query(