@@ -27,7 +27,7 @@ def _create_iceberg_table(
2727 df : pd .DataFrame ,
2828 database : str ,
2929 table : str ,
30- path : str ,
30+ path : str | None ,
3131 wg_config : _WorkGroupConfig ,
3232 partition_cols : list [str ] | None ,
3333 additional_table_properties : dict [str , Any ] | None ,
@@ -80,9 +80,9 @@ def _create_iceberg_table(
8080
8181
8282class _SchemaChanges (TypedDict ):
83- to_add : dict [str , str ]
84- to_change : dict [str , str ]
85- to_remove : set [ str ]
83+ new_columns : dict [str , str ]
84+ modified_columns : dict [str , str ]
85+ missing_columns : dict [ str , str ]
8686
8787
8888def _determine_differences (
@@ -94,7 +94,7 @@ def _determine_differences(
9494 boto3_session : boto3 .Session | None ,
9595 dtype : dict [str , str ] | None ,
9696 catalog_id : str | None ,
97- ) -> _SchemaChanges :
97+ ) -> tuple [ _SchemaChanges , list [ str ]] :
9898 frame_columns_types , frame_partitions_types = _data_types .athena_types_from_pandas_partitioned (
9999 df = df , index = index , partition_cols = partition_cols , dtype = dtype
100100 )
@@ -105,26 +105,30 @@ def _determine_differences(
105105 catalog .get_table_types (database = database , table = table , catalog_id = catalog_id , boto3_session = boto3_session ),
106106 )
107107
108- original_columns = set (catalog_column_types )
109- new_columns = set (frame_columns_types )
108+ original_column_names = set (catalog_column_types )
109+ new_column_names = set (frame_columns_types )
110110
111- to_add = {col : frame_columns_types [col ] for col in new_columns - original_columns }
112- to_remove = original_columns - new_columns
111+ new_columns = {col : frame_columns_types [col ] for col in new_column_names - original_column_names }
112+ missing_columns = { col : catalog_column_types [ col ] for col in original_column_names - new_column_names }
113113
114114 columns_to_change = [
115115 col
116- for col in original_columns .intersection (new_columns )
116+ for col in original_column_names .intersection (new_column_names )
117117 if frame_columns_types [col ] != catalog_column_types [col ]
118118 ]
119- to_change = {col : frame_columns_types [col ] for col in columns_to_change }
119+ modified_columns = {col : frame_columns_types [col ] for col in columns_to_change }
120120
121- return _SchemaChanges (to_add = to_add , to_change = to_change , to_remove = to_remove )
121+ return (
122+ _SchemaChanges (new_columns = new_columns , modified_columns = modified_columns , missing_columns = missing_columns ),
123+ [key for key in catalog_column_types ],
124+ )
122125
123126
124127def _alter_iceberg_table (
125128 database : str ,
126129 table : str ,
127130 schema_changes : _SchemaChanges ,
131+ fill_missing_columns_in_df : bool ,
128132 wg_config : _WorkGroupConfig ,
129133 data_source : str | None = None ,
130134 workgroup : str | None = None ,
@@ -134,20 +138,23 @@ def _alter_iceberg_table(
134138) -> None :
135139 sql_statements : list [str ] = []
136140
137- if schema_changes ["to_add " ]:
141+ if schema_changes ["new_columns " ]:
138142 sql_statements += _alter_iceberg_table_add_columns_sql (
139143 table = table ,
140- columns_to_add = schema_changes ["to_add " ],
144+ columns_to_add = schema_changes ["new_columns " ],
141145 )
142146
143- if schema_changes ["to_change " ]:
147+ if schema_changes ["modified_columns " ]:
144148 sql_statements += _alter_iceberg_table_change_columns_sql (
145149 table = table ,
146- columns_to_change = schema_changes ["to_change " ],
150+ columns_to_change = schema_changes ["modified_columns " ],
147151 )
148152
149- if schema_changes ["to_remove" ]:
150- raise exceptions .InvalidArgumentCombination ("Removing columns of Iceberg tables is not currently supported." )
153+ if schema_changes ["missing_columns" ] and not fill_missing_columns_in_df :
154+ raise exceptions .InvalidArgumentCombination (
155+ f"Dropping columns of Iceberg tables is not supported: { schema_changes ['missing_columns' ]} . "
156+ "Please use `fill_missing_columns_in_df=True` to fill missing columns with N/A."
157+ )
151158
152159 for statement in sql_statements :
153160 query_execution_id : str = _start_query_execution (
@@ -208,6 +215,7 @@ def to_iceberg(
208215 dtype : dict [str , str ] | None = None ,
209216 catalog_id : str | None = None ,
210217 schema_evolution : bool = False ,
218+ fill_missing_columns_in_df : bool = True ,
211219 glue_table_settings : GlueTableSettings | None = None ,
212220) -> None :
213221 """
@@ -267,8 +275,14 @@ def to_iceberg(
267275 catalog_id : str, optional
268276 The ID of the Data Catalog from which to retrieve Databases.
269277 If none is provided, the AWS account ID is used by default
270- schema_evolution: bool
271- If True allows schema evolution for new columns or changes in column types.
278+ schema_evolution: bool, optional
279+ If ``True`` allows schema evolution for new columns or changes in column types.
280+ Columns missing from the DataFrame that are present in the Iceberg schema
281+ will throw an error unless ``fill_missing_columns_in_df`` is set to ``True``.
282+ Default is ``False``.
283+ fill_missing_columns_in_df: bool, optional
284+ If ``True``, fill columns that are missing from the DataFrame with ``NULL`` values.
285+ Default is ``True``.
272286 glue_table_settings: GlueTableSettings, optional
273287 Glue/Athena catalog: Settings for writing to the Glue table.
274288 Currently only the 'columns_comments' attribute is supported for this function.
@@ -329,7 +343,7 @@ def to_iceberg(
329343 df = df ,
330344 database = database ,
331345 table = table ,
332- path = table_location , # type: ignore[arg-type]
346+ path = table_location ,
333347 wg_config = wg_config ,
334348 partition_cols = partition_cols ,
335349 additional_table_properties = additional_table_properties ,
@@ -343,7 +357,7 @@ def to_iceberg(
343357 columns_comments = glue_table_settings .get ("columns_comments" ),
344358 )
345359 else :
346- schema_differences = _determine_differences (
360+ schema_differences , catalog_cols = _determine_differences (
347361 df = df ,
348362 database = database ,
349363 table = table ,
@@ -353,13 +367,27 @@ def to_iceberg(
353367 dtype = dtype ,
354368 catalog_id = catalog_id ,
355369 )
370+
371+ # Add missing columns to the DataFrame
372+ if fill_missing_columns_in_df and schema_differences ["missing_columns" ]:
373+ for col_name , col_type in schema_differences ["missing_columns" ].items ():
374+ df [col_name ] = None
375+ df [col_name ] = df [col_name ].astype (_data_types .athena2pandas (col_type ))
376+
377+ schema_differences ["missing_columns" ] = {}
378+
379+ # Ensure that the ordering of the DF is the same as in the catalog.
380+ # This is required for the INSERT command to work.
381+ df = df [catalog_cols ]
382+
356383 if schema_evolution is False and any ([schema_differences [x ] for x in schema_differences ]): # type: ignore[literal-required]
357384 raise exceptions .InvalidArgumentValue (f"Schema change detected: { schema_differences } " )
358385
359386 _alter_iceberg_table (
360387 database = database ,
361388 table = table ,
362389 schema_changes = schema_differences ,
390+ fill_missing_columns_in_df = fill_missing_columns_in_df ,
363391 wg_config = wg_config ,
364392 data_source = data_source ,
365393 workgroup = workgroup ,
0 commit comments