Skip to content

Commit 831a72a

Browse files
feat: Add schema evolution to athena.to_iceberg (#2465)
1 parent 0d1eede commit 831a72a

File tree

2 files changed

+267
-2
lines changed

2 files changed

+267
-2
lines changed

awswrangler/athena/_write_iceberg.py

Lines changed: 136 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
"""Amazon Athena Module containing all to_* write functions."""
22

33
import logging
4+
import typing
45
import uuid
5-
from typing import Any, Dict, List, Optional
6+
from typing import Any, Dict, List, Optional, Set, TypedDict
67

78
import boto3
89
import pandas as pd
910

10-
from awswrangler import _utils, catalog, exceptions, s3
11+
from awswrangler import _data_types, _utils, catalog, exceptions, s3
1112
from awswrangler._config import apply_configs
1213
from awswrangler.athena._executions import wait_query
1314
from awswrangler.athena._utils import (
@@ -67,6 +68,111 @@ def _create_iceberg_table(
6768
wait_query(query_execution_id=query_execution_id, boto3_session=boto3_session)
6869

6970

71+
class _SchemaChanges(TypedDict):
    """Differences between an incoming DataFrame schema and the existing catalog table schema."""

    # Columns present in the DataFrame but not in the table, mapped to their Athena type.
    to_add: Dict[str, str]
    # Columns present in both, but whose Athena type differs, mapped to the new type.
    to_change: Dict[str, str]
    # Columns present in the table but missing from the DataFrame.
    to_remove: Set[str]
75+
76+
77+
def _determine_differences(
    df: pd.DataFrame,
    database: str,
    table: str,
    index: bool,
    partition_cols: Optional[List[str]],
    boto3_session: Optional[boto3.Session],
    dtype: Optional[Dict[str, str]],
    catalog_id: Optional[str],
) -> _SchemaChanges:
    """Compare the DataFrame schema against the Glue catalog table and report the differences."""
    columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned(
        df=df, index=index, partition_cols=partition_cols, dtype=dtype
    )
    # Treat data columns and partition columns as one flat schema for the comparison.
    columns_types.update(partitions_types)

    table_types = typing.cast(
        Dict[str, str],
        catalog.get_table_types(database=database, table=table, catalog_id=catalog_id, boto3_session=boto3_session),
    )

    existing = set(table_types)
    incoming = set(columns_types)

    return _SchemaChanges(
        to_add={name: columns_types[name] for name in incoming - existing},
        to_change={
            name: columns_types[name]
            for name in existing & incoming
            if columns_types[name] != table_types[name]
        },
        to_remove=existing - incoming,
    )
111+
112+
113+
def _alter_iceberg_table(
    database: str,
    table: str,
    schema_changes: _SchemaChanges,
    wg_config: _WorkGroupConfig,
    data_source: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Apply the given schema changes to an Iceberg table via ALTER TABLE statements in Athena."""
    statements: List[str] = []

    if schema_changes["to_add"]:
        statements.extend(
            _alter_iceberg_table_add_columns_sql(table=table, columns_to_add=schema_changes["to_add"])
        )

    if schema_changes["to_change"]:
        statements.extend(
            _alter_iceberg_table_change_columns_sql(table=table, columns_to_change=schema_changes["to_change"])
        )

    if schema_changes["to_remove"]:
        # No statements have been executed yet, so rejecting column removal here is safe.
        raise exceptions.InvalidArgumentCombination("Removing columns of Iceberg tables is not currently supported.")

    # Run each ALTER statement sequentially, waiting for completion before starting the next.
    for sql in statements:
        execution_id: str = _start_query_execution(
            sql=sql,
            workgroup=workgroup,
            wg_config=wg_config,
            database=database,
            data_source=data_source,
            encryption=encryption,
            kms_key=kms_key,
            boto3_session=boto3_session,
        )
        wait_query(query_execution_id=execution_id, boto3_session=boto3_session)
153+
154+
155+
def _alter_iceberg_table_add_columns_sql(
156+
table: str,
157+
columns_to_add: Dict[str, str],
158+
) -> List[str]:
159+
add_cols_str = ", ".join([f"{col_name} {columns_to_add[col_name]}" for col_name in columns_to_add])
160+
161+
return [f"ALTER TABLE {table} ADD COLUMNS ({add_cols_str})"]
162+
163+
164+
def _alter_iceberg_table_change_columns_sql(
165+
table: str,
166+
columns_to_change: Dict[str, str],
167+
) -> List[str]:
168+
sql_statements = []
169+
170+
for col_name, col_type in columns_to_change.items():
171+
sql_statements.append(f"ALTER TABLE {table} CHANGE COLUMN {col_name} {col_name} {col_type}")
172+
173+
return sql_statements
174+
175+
70176
@apply_configs
71177
@_utils.validate_distributed_kwargs(
72178
unsupported_kwargs=["boto3_session", "s3_additional_kwargs"],
@@ -89,6 +195,7 @@ def to_iceberg(
89195
additional_table_properties: Optional[Dict[str, Any]] = None,
90196
dtype: Optional[Dict[str, str]] = None,
91197
catalog_id: Optional[str] = None,
198+
schema_evolution: bool = False,
92199
) -> None:
93200
"""
94201
Insert into Athena Iceberg table using INSERT INTO ... SELECT. Will create Iceberg table if it does not exist.
@@ -143,6 +250,8 @@ def to_iceberg(
143250
catalog_id : str, optional
144251
The ID of the Data Catalog from which to retrieve Databases.
145252
If none is provided, the AWS account ID is used by default
253+
schema_evolution: bool
254+
If True allows schema evolution for new columns or changes in column types.
146255
147256
Returns
148257
-------
@@ -206,6 +315,31 @@ def to_iceberg(
206315
boto3_session=boto3_session,
207316
dtype=dtype,
208317
)
318+
else:
319+
schema_differences = _determine_differences(
320+
df=df,
321+
database=database,
322+
table=table,
323+
index=index,
324+
partition_cols=partition_cols,
325+
boto3_session=boto3_session,
326+
dtype=dtype,
327+
catalog_id=catalog_id,
328+
)
329+
if schema_evolution is False and any([schema_differences[x] for x in schema_differences]): # type: ignore[literal-required]
330+
raise exceptions.InvalidArgumentValue(f"Schema change detected: {schema_differences}")
331+
332+
_alter_iceberg_table(
333+
database=database,
334+
table=table,
335+
schema_changes=schema_differences,
336+
wg_config=wg_config,
337+
data_source=data_source,
338+
workgroup=workgroup,
339+
encryption=encryption,
340+
kms_key=kms_key,
341+
boto3_session=boto3_session,
342+
)
209343

210344
# Create temporary external table, write the results
211345
s3.to_parquet(

tests/unit/test_athena.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1525,6 +1525,137 @@ def test_athena_to_iceberg(path, path2, glue_database, glue_table, partition_col
15251525
assert df.equals(df_out)
15261526

15271527

1528+
def test_athena_to_iceberg_schema_evolution_add_columns(
    path: str, path2: str, glue_database: str, glue_table: str
) -> None:
    """New columns are appended when schema_evolution=True; schema drift raises when it is False."""
    frame = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5]})
    wr.athena.to_iceberg(
        df=frame,
        database=glue_database,
        table=glue_table,
        table_location=path,
        temp_path=path2,
        keep_files=False,
        schema_evolution=True,
    )

    # Second write carries an extra column; evolution is enabled, so it should succeed.
    frame["c2"] = [6, 7, 8]
    wr.athena.to_iceberg(
        df=frame,
        database=glue_database,
        table=glue_table,
        table_location=path,
        temp_path=path2,
        keep_files=False,
        schema_evolution=True,
    )

    column_types = wr.catalog.get_table_types(glue_database, glue_table)
    assert len(column_types) == len(frame.columns)

    result = wr.athena.read_sql_table(
        table=glue_table,
        database=glue_database,
        ctas_approach=False,
        unload_approach=False,
    )
    # Both writes appended rows (INSERT INTO semantics).
    assert len(result) == len(frame) * 2

    # With schema_evolution=False, any detected schema difference must be rejected.
    frame["c3"] = [9, 10, 11]
    with pytest.raises(wr.exceptions.InvalidArgumentValue):
        wr.athena.to_iceberg(
            df=frame,
            database=glue_database,
            table=glue_table,
            table_location=path,
            temp_path=path2,
            keep_files=False,
            schema_evolution=False,
        )
1575+
1576+
1577+
def test_athena_to_iceberg_schema_evolution_modify_columns(
    path: str, path2: str, glue_database: str, glue_table: str
) -> None:
    """Writing a frame with widened column types evolves the Iceberg table schema in place."""
    # Version 1: narrow types (float32 / int32).
    frame_v1 = pd.DataFrame({"c1": pd.Series([1.0, 2.0], dtype="float32"), "c2": pd.Series([-1, -2], dtype="int32")})

    wr.athena.to_iceberg(
        df=frame_v1,
        database=glue_database,
        table=glue_table,
        table_location=path,
        temp_path=path2,
        keep_files=False,
        schema_evolution=True,
    )

    result_v1 = wr.athena.read_sql_table(
        table=glue_table,
        database=glue_database,
        ctas_approach=False,
        unload_approach=False,
    )

    assert len(result_v1) == 2
    assert len(result_v1.columns) == 2
    assert str(result_v1["c1"].dtype).startswith("float32")
    assert str(result_v1["c2"].dtype).startswith("Int32")

    # Version 2: same columns widened to float64 / int64.
    frame_v2 = pd.DataFrame({"c1": pd.Series([3.0, 4.0], dtype="float64"), "c2": pd.Series([-3, -4], dtype="int64")})

    wr.athena.to_iceberg(
        df=frame_v2,
        database=glue_database,
        table=glue_table,
        table_location=path,
        temp_path=path2,
        keep_files=False,
        schema_evolution=True,
    )

    result_v2 = wr.athena.read_sql_table(
        table=glue_table,
        database=glue_database,
        ctas_approach=False,
        unload_approach=False,
    )

    # Rows from both writes are present and the table now reports the widened types.
    assert len(result_v2) == 4
    assert len(result_v2.columns) == 2
    assert str(result_v2["c1"].dtype).startswith("float64")
    assert str(result_v2["c2"].dtype).startswith("Int64")
1629+
1630+
1631+
def test_athena_to_iceberg_schema_evolution_remove_columns_error(
    path: str, path2: str, glue_database: str, glue_table: str
) -> None:
    """Dropping a column is unsupported and must raise even with schema_evolution=True."""
    wr.athena.to_iceberg(
        df=pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5]}),
        database=glue_database,
        table=glue_table,
        table_location=path,
        temp_path=path2,
        keep_files=False,
        schema_evolution=True,
    )

    # Second frame is missing "c1"; removal is not supported for Iceberg tables.
    narrower = pd.DataFrame({"c0": [6, 7, 8]})

    with pytest.raises(wr.exceptions.InvalidArgumentCombination):
        wr.athena.to_iceberg(
            df=narrower,
            database=glue_database,
            table=glue_table,
            table_location=path,
            temp_path=path2,
            keep_files=False,
            schema_evolution=True,
        )
1657+
1658+
15281659
def test_to_iceberg_cast(path, path2, glue_table, glue_database):
15291660
df = pd.DataFrame(
15301661
{

0 commit comments

Comments
 (0)