11"""Amazon Athena Module containing all to_* write functions."""
22
33import logging
4+ import typing
45import uuid
5- from typing import Any , Dict , List , Optional
6+ from typing import Any , Dict , List , Optional , Set , TypedDict
67
78import boto3
89import pandas as pd
910
10- from awswrangler import _utils , catalog , exceptions , s3
11+ from awswrangler import _data_types , _utils , catalog , exceptions , s3
1112from awswrangler ._config import apply_configs
1213from awswrangler .athena ._executions import wait_query
1314from awswrangler .athena ._utils import (
@@ -67,6 +68,111 @@ def _create_iceberg_table(
6768 wait_query (query_execution_id = query_execution_id , boto3_session = boto3_session )
6869
6970
71+ class _SchemaChanges (TypedDict ):
72+ to_add : Dict [str , str ]
73+ to_change : Dict [str , str ]
74+ to_remove : Set [str ]
75+
76+
def _determine_differences(
    df: pd.DataFrame,
    database: str,
    table: str,
    index: bool,
    partition_cols: Optional[List[str]],
    boto3_session: Optional[boto3.Session],
    dtype: Optional[Dict[str, str]],
    catalog_id: Optional[str],
) -> _SchemaChanges:
    """Compare the frame's Athena schema against the catalog table and report the delta."""
    # Athena types inferred from the frame; partition columns are folded back in
    # so the comparison covers the full table schema.
    frame_types, partition_types = _data_types.athena_types_from_pandas_partitioned(
        df=df, index=index, partition_cols=partition_cols, dtype=dtype
    )
    frame_types.update(partition_types)

    # cast: get_table_types may be typed as returning Optional, but a value is expected here.
    table_types = typing.cast(
        Dict[str, str],
        catalog.get_table_types(database=database, table=table, catalog_id=catalog_id, boto3_session=boto3_session),
    )

    existing = set(table_types)
    incoming = set(frame_types)

    return _SchemaChanges(
        to_add={name: frame_types[name] for name in incoming - existing},
        to_change={
            name: frame_types[name]
            for name in existing & incoming
            if frame_types[name] != table_types[name]
        },
        to_remove=existing - incoming,
    )
111+
112+
def _alter_iceberg_table(
    database: str,
    table: str,
    schema_changes: _SchemaChanges,
    wg_config: _WorkGroupConfig,
    data_source: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Apply detected schema changes to an Iceberg table by running ALTER TABLE DDL in Athena."""
    # Dropping columns has no supported path here; fail before issuing any DDL
    # so the table is never left partially altered.
    if schema_changes["to_remove"]:
        raise exceptions.InvalidArgumentCombination("Removing columns of Iceberg tables is not currently supported.")

    statements: List[str] = []

    if schema_changes["to_add"]:
        statements.extend(
            _alter_iceberg_table_add_columns_sql(table=table, columns_to_add=schema_changes["to_add"])
        )

    if schema_changes["to_change"]:
        statements.extend(
            _alter_iceberg_table_change_columns_sql(table=table, columns_to_change=schema_changes["to_change"])
        )

    # Execute each DDL statement and block until Athena reports completion.
    for sql in statements:
        execution_id: str = _start_query_execution(
            sql=sql,
            workgroup=workgroup,
            wg_config=wg_config,
            database=database,
            data_source=data_source,
            encryption=encryption,
            kms_key=kms_key,
            boto3_session=boto3_session,
        )
        wait_query(query_execution_id=execution_id, boto3_session=boto3_session)
153+
154+
155+ def _alter_iceberg_table_add_columns_sql (
156+ table : str ,
157+ columns_to_add : Dict [str , str ],
158+ ) -> List [str ]:
159+ add_cols_str = ", " .join ([f"{ col_name } { columns_to_add [col_name ]} " for col_name in columns_to_add ])
160+
161+ return [f"ALTER TABLE { table } ADD COLUMNS ({ add_cols_str } )" ]
162+
163+
164+ def _alter_iceberg_table_change_columns_sql (
165+ table : str ,
166+ columns_to_change : Dict [str , str ],
167+ ) -> List [str ]:
168+ sql_statements = []
169+
170+ for col_name , col_type in columns_to_change .items ():
171+ sql_statements .append (f"ALTER TABLE { table } CHANGE COLUMN { col_name } { col_name } { col_type } " )
172+
173+ return sql_statements
174+
175+
70176@apply_configs
71177@_utils .validate_distributed_kwargs (
72178 unsupported_kwargs = ["boto3_session" , "s3_additional_kwargs" ],
@@ -89,6 +195,7 @@ def to_iceberg(
89195 additional_table_properties : Optional [Dict [str , Any ]] = None ,
90196 dtype : Optional [Dict [str , str ]] = None ,
91197 catalog_id : Optional [str ] = None ,
198+ schema_evolution : bool = False ,
92199) -> None :
93200 """
94201 Insert into Athena Iceberg table using INSERT INTO ... SELECT. Will create Iceberg table if it does not exist.
@@ -143,6 +250,8 @@ def to_iceberg(
143250 catalog_id : str, optional
144251 The ID of the Data Catalog from which to retrieve Databases.
145252 If none is provided, the AWS account ID is used by default
253+ schema_evolution: bool
254+ If True allows schema evolution for new columns or changes in column types.
146255
147256 Returns
148257 -------
@@ -206,6 +315,31 @@ def to_iceberg(
206315 boto3_session = boto3_session ,
207316 dtype = dtype ,
208317 )
318+ else :
319+ schema_differences = _determine_differences (
320+ df = df ,
321+ database = database ,
322+ table = table ,
323+ index = index ,
324+ partition_cols = partition_cols ,
325+ boto3_session = boto3_session ,
326+ dtype = dtype ,
327+ catalog_id = catalog_id ,
328+ )
329+ if schema_evolution is False and any ([schema_differences [x ] for x in schema_differences ]): # type: ignore[literal-required]
330+ raise exceptions .InvalidArgumentValue (f"Schema change detected: { schema_differences } " )
331+
332+ _alter_iceberg_table (
333+ database = database ,
334+ table = table ,
335+ schema_changes = schema_differences ,
336+ wg_config = wg_config ,
337+ data_source = data_source ,
338+ workgroup = workgroup ,
339+ encryption = encryption ,
340+ kms_key = kms_key ,
341+ boto3_session = boto3_session ,
342+ )
209343
210344 # Create temporary external table, write the results
211345 s3 .to_parquet (
0 commit comments