Commit 498b586

feat: to_iceberg support for filling missing columns in the DataFrame with None (#2616)
1 parent 47250a3 commit 498b586

3 files changed (+443 -338 lines)

awswrangler/athena/_write_iceberg.py

Lines changed: 50 additions & 22 deletions
@@ -27,7 +27,7 @@ def _create_iceberg_table(
     df: pd.DataFrame,
     database: str,
     table: str,
-    path: str,
+    path: str | None,
     wg_config: _WorkGroupConfig,
     partition_cols: list[str] | None,
     additional_table_properties: dict[str, Any] | None,
@@ -80,9 +80,9 @@ def _create_iceberg_table(
 
 
 class _SchemaChanges(TypedDict):
-    to_add: dict[str, str]
-    to_change: dict[str, str]
-    to_remove: set[str]
+    new_columns: dict[str, str]
+    modified_columns: dict[str, str]
+    missing_columns: dict[str, str]
 
 
 def _determine_differences(
@@ -94,7 +94,7 @@ def _determine_differences(
     boto3_session: boto3.Session | None,
     dtype: dict[str, str] | None,
     catalog_id: str | None,
-) -> _SchemaChanges:
+) -> tuple[_SchemaChanges, list[str]]:
     frame_columns_types, frame_partitions_types = _data_types.athena_types_from_pandas_partitioned(
         df=df, index=index, partition_cols=partition_cols, dtype=dtype
     )
@@ -105,26 +105,30 @@ def _determine_differences(
         catalog.get_table_types(database=database, table=table, catalog_id=catalog_id, boto3_session=boto3_session),
     )
 
-    original_columns = set(catalog_column_types)
-    new_columns = set(frame_columns_types)
+    original_column_names = set(catalog_column_types)
+    new_column_names = set(frame_columns_types)
 
-    to_add = {col: frame_columns_types[col] for col in new_columns - original_columns}
-    to_remove = original_columns - new_columns
+    new_columns = {col: frame_columns_types[col] for col in new_column_names - original_column_names}
+    missing_columns = {col: catalog_column_types[col] for col in original_column_names - new_column_names}
 
     columns_to_change = [
         col
-        for col in original_columns.intersection(new_columns)
+        for col in original_column_names.intersection(new_column_names)
        if frame_columns_types[col] != catalog_column_types[col]
     ]
-    to_change = {col: frame_columns_types[col] for col in columns_to_change}
+    modified_columns = {col: frame_columns_types[col] for col in columns_to_change}
 
-    return _SchemaChanges(to_add=to_add, to_change=to_change, to_remove=to_remove)
+    return (
+        _SchemaChanges(new_columns=new_columns, modified_columns=modified_columns, missing_columns=missing_columns),
+        [key for key in catalog_column_types],
+    )
 
 
 def _alter_iceberg_table(
     database: str,
     table: str,
     schema_changes: _SchemaChanges,
+    fill_missing_columns_in_df: bool,
     wg_config: _WorkGroupConfig,
     data_source: str | None = None,
     workgroup: str | None = None,
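
The reworked `_determine_differences` classifies columns into three buckets (new, modified, missing) and now also returns the catalog's column order so the DataFrame can be reordered later. A minimal standalone sketch of the same set logic, using made-up column/type dicts in place of the real pandas and Glue catalog lookups:

# Sketch only: hypothetical dicts stand in for
# _data_types.athena_types_from_pandas_partitioned and catalog.get_table_types.
frame_columns_types = {"id": "bigint", "name": "string", "score": "double"}
catalog_column_types = {"id": "bigint", "name": "int", "created_at": "timestamp"}

frame_names, catalog_names = set(frame_columns_types), set(catalog_column_types)

new_columns = {c: frame_columns_types[c] for c in frame_names - catalog_names}
missing_columns = {c: catalog_column_types[c] for c in catalog_names - frame_names}
modified_columns = {
    c: frame_columns_types[c]
    for c in frame_names & catalog_names
    if frame_columns_types[c] != catalog_column_types[c]
}

print(new_columns)       # {'score': 'double'}
print(missing_columns)   # {'created_at': 'timestamp'}
print(modified_columns)  # {'name': 'string'}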
@@ -134,20 +138,23 @@ def _alter_iceberg_table(
 ) -> None:
     sql_statements: list[str] = []
 
-    if schema_changes["to_add"]:
+    if schema_changes["new_columns"]:
         sql_statements += _alter_iceberg_table_add_columns_sql(
             table=table,
-            columns_to_add=schema_changes["to_add"],
+            columns_to_add=schema_changes["new_columns"],
         )
 
-    if schema_changes["to_change"]:
+    if schema_changes["modified_columns"]:
         sql_statements += _alter_iceberg_table_change_columns_sql(
             table=table,
-            columns_to_change=schema_changes["to_change"],
+            columns_to_change=schema_changes["modified_columns"],
         )
 
-    if schema_changes["to_remove"]:
-        raise exceptions.InvalidArgumentCombination("Removing columns of Iceberg tables is not currently supported.")
+    if schema_changes["missing_columns"] and not fill_missing_columns_in_df:
+        raise exceptions.InvalidArgumentCombination(
+            f"Dropping columns of Iceberg tables is not supported: {schema_changes['missing_columns']}. "
+            "Please use `fill_missing_columns_in_df=True` to fill missing columns with N/A."
+        )
 
     for statement in sql_statements:
         query_execution_id: str = _start_query_execution(
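
`_alter_iceberg_table` only queues ALTER statements and then runs them one at a time through `_start_query_execution`. The SQL builders themselves are not part of this hunk; a rough sketch of what such helpers might emit, assuming Athena's `ALTER TABLE ... ADD COLUMNS` and `CHANGE COLUMN` DDL (hypothetical stand-ins, not the library's actual implementation):

# Hypothetical stand-ins for _alter_iceberg_table_add_columns_sql /
# _alter_iceberg_table_change_columns_sql.
def add_columns_sql(table: str, columns_to_add: dict[str, str]) -> list[str]:
    cols = ", ".join(f"{name} {dtype}" for name, dtype in columns_to_add.items())
    return [f"ALTER TABLE {table} ADD COLUMNS ({cols})"]

def change_columns_sql(table: str, columns_to_change: dict[str, str]) -> list[str]:
    # CHANGE COLUMN takes the old name, new name and type; the name is kept here.
    return [
        f"ALTER TABLE {table} CHANGE COLUMN {name} {name} {dtype}"
        for name, dtype in columns_to_change.items()
    ]

sql_statements = add_columns_sql("my_table", {"score": "double"})
sql_statements += change_columns_sql("my_table", {"name": "string"})
print(sql_statements)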
@@ -208,6 +215,7 @@ def to_iceberg(
     dtype: dict[str, str] | None = None,
     catalog_id: str | None = None,
     schema_evolution: bool = False,
+    fill_missing_columns_in_df: bool = True,
     glue_table_settings: GlueTableSettings | None = None,
 ) -> None:
     """
@@ -267,8 +275,14 @@ def to_iceberg(
     catalog_id : str, optional
         The ID of the Data Catalog from which to retrieve Databases.
         If none is provided, the AWS account ID is used by default
-    schema_evolution: bool
-        If True allows schema evolution for new columns or changes in column types.
+    schema_evolution: bool, optional
+        If ``True`` allows schema evolution for new columns or changes in column types.
+        Columns missing from the DataFrame that are present in the Iceberg schema
+        will throw an error unless ``fill_missing_columns_in_df`` is set to ``True``.
+        Default is ``False``.
+    fill_missing_columns_in_df: bool, optional
+        If ``True``, fill columns that was missing in the DataFrame with ``NULL`` values.
+        Default is ``True``.
     columns_comments: GlueTableSettings, optional
         Glue/Athena catalog: Settings for writing to the Glue table.
         Currently only the 'columns_comments' attribute is supported for this function.
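
In DataFrame terms, the new flag means a column that exists in the Iceberg table but is absent from the incoming frame gets added as all-`None` and cast to the matching pandas dtype before the data is written. A small pandas-only sketch of that backfill (the `"Int64"` target dtype is an assumption standing in for what `_data_types.athena2pandas` would return):

import pandas as pd

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

# Hypothetical column that the catalog has but the DataFrame lacks.
missing = {"age": "Int64"}

for col_name, pandas_type in missing.items():
    df[col_name] = None
    df[col_name] = df[col_name].astype(pandas_type)

print(df.dtypes)  # "age" ends up as a nullable Int64 column of <NA> values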
@@ -329,7 +343,7 @@ def to_iceberg(
             df=df,
             database=database,
             table=table,
-            path=table_location,  # type: ignore[arg-type]
+            path=table_location,
             wg_config=wg_config,
             partition_cols=partition_cols,
             additional_table_properties=additional_table_properties,
@@ -343,7 +357,7 @@ def to_iceberg(
             columns_comments=glue_table_settings.get("columns_comments"),
         )
     else:
-        schema_differences = _determine_differences(
+        schema_differences, catalog_cols = _determine_differences(
             df=df,
             database=database,
             table=table,
@@ -353,13 +367,27 @@ def to_iceberg(
             dtype=dtype,
             catalog_id=catalog_id,
         )
+
+        # Add missing columns to the DataFrame
+        if fill_missing_columns_in_df and schema_differences["missing_columns"]:
+            for col_name, col_type in schema_differences["missing_columns"].items():
+                df[col_name] = None
+                df[col_name] = df[col_name].astype(_data_types.athena2pandas(col_type))
+
+            schema_differences["missing_columns"] = {}
+
+        # Ensure that the ordering of the DF is the same as in the catalog.
+        # This is required for the INSERT command to work.
+        df = df[catalog_cols]
+
         if schema_evolution is False and any([schema_differences[x] for x in schema_differences]):  # type: ignore[literal-required]
             raise exceptions.InvalidArgumentValue(f"Schema change detected: {schema_differences}")
 
         _alter_iceberg_table(
             database=database,
             table=table,
             schema_changes=schema_differences,
+            fill_missing_columns_in_df=fill_missing_columns_in_df,
             wg_config=wg_config,
             data_source=data_source,
             workgroup=workgroup,
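
Taken together, a caller writing a DataFrame that is missing some of the table's columns can now opt into the backfill instead of hitting the schema-change error. A hedged usage sketch (database, table and S3 locations are placeholders; the remaining `to_iceberg` parameters keep their defaults):

import awswrangler as wr
import pandas as pd

# Assume the Iceberg table also has a "score" column that this frame does not carry.
df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

wr.athena.to_iceberg(
    df=df,
    database="my_database",                                      # placeholder
    table="my_iceberg_table",                                    # placeholder
    table_location="s3://my-bucket/iceberg/my_iceberg_table/",   # placeholder
    temp_path="s3://my-bucket/athena-temp/",                     # placeholder
    fill_missing_columns_in_df=True,  # missing columns are written as NULL
)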
