Skip to content

Commit e1cd200

Browse files
Append incremental n to duplicate cols recursively (#1124)
* Append incremental n to duplicate cols recursively * Made naming consistent between other functions * Made naming consistent across init * Less memory intensive rename * handle_dup_cols => handle_duplicate_columns * Rearranged imports * Rearranged imports and formatted * Improved documentation. * Removed whitespace * Added Warn type * Added test for warning too * Formatted * Changed to len(set(.)) to get around mypy errors. * Formatted docstrings to flake8 * Ignored error from mypy * Minor - Fixes Co-authored-by: Abdel Jaidi <[email protected]>
1 parent 7db106c commit e1cd200

File tree

3 files changed

+86
-3
lines changed

3 files changed

+86
-3
lines changed

awswrangler/catalog/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
does_table_exist,
4444
drop_duplicated_columns,
4545
extract_athena_types,
46+
rename_duplicated_columns,
4647
sanitize_column_name,
4748
sanitize_dataframe_columns_names,
4849
sanitize_table_name,
@@ -57,6 +58,7 @@
5758
"delete_column",
5859
"drop_duplicated_columns",
5960
"extract_athena_types",
61+
"rename_duplicated_columns",
6062
"sanitize_column_name",
6163
"sanitize_dataframe_columns_names",
6264
"sanitize_table_name",

awswrangler/catalog/_utils.py

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import logging
33
import re
44
import unicodedata
5+
import warnings
56
from typing import Any, Dict, List, Optional, Tuple
67

78
import boto3
@@ -124,8 +125,52 @@ def sanitize_column_name(column: str) -> str:
124125
return _sanitize_name(name=column)
125126

126127

127-
def sanitize_dataframe_columns_names(df: pd.DataFrame) -> pd.DataFrame:
128-
"""Normalize all columns names to be compatible with Amazon Athena and the AWS Glue Catalog.
128+
def rename_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Append an incremental number to duplicated column names to conform with Amazon Athena.

    The first occurrence of a name is kept as is; the n-th repetition becomes ``<name>_<n>``.

    Note
    ----
    This transformation will run `inplace` and will make changes to the original DataFrame.

    Note
    ----
    A freshly generated name can itself collide with an existing column
    (e.g. ``a, a, a_1`` first becomes ``a, a_1, a_1``). The renaming pass is therefore
    repeated, appending an additional `_n`, until every column name is unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Original Pandas DataFrame.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object, with duplicated column names renamed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [4, 6]})
    >>> df.columns = ['a', 'a', 'a_1']
    >>> wr.catalog.rename_duplicated_columns(df=df)
       a  a_1  a_1_1
    0  1    3      4
    1  2    4      6

    """
    names: List[Any] = list(df.columns)
    changed = False
    # Each pass suffixes repeated names; repeat until no pass leaves a collision behind.
    while len(set(names)) != len(names):
        occurrences: Dict[Any, int] = {}
        renamed: List[Any] = []
        for name in names:
            seen = occurrences.get(name, 0)
            occurrences[name] = seen + 1
            # f-string formatting (rather than `name + "_n"`) also copes with non-string labels.
            renamed.append(name if seen == 0 else f"{name}_{seen}")
        names = renamed
        changed = True
    if changed:
        # Assign the labels directly instead of relying on the call order of a
        # side-effecting callable handed to ``DataFrame.rename``.
        df.columns = pd.Index(names, name=df.columns.name)
    return df
168+
169+
170+
def sanitize_dataframe_columns_names(
171+
df: pd.DataFrame, handle_duplicate_columns: Optional[str] = "warn"
172+
) -> pd.DataFrame:
173+
"""Normalize all columns names to be compatible with Amazon Athena.
129174
130175
https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
131176
@@ -142,6 +187,11 @@ def sanitize_dataframe_columns_names(df: pd.DataFrame) -> pd.DataFrame:
142187
----------
143188
df : pandas.DataFrame
144189
Original Pandas DataFrame.
190+
handle_duplicate_columns : str, optional
191+
How to handle duplicate columns. Can be "warn" or "drop" or "rename".
192+
"drop" will drop all but the first duplicated column.
193+
"rename" will rename all duplicated columns with an incremental number.
194+
Defaults to "warn".
145195
146196
Returns
147197
-------
@@ -151,11 +201,29 @@ def sanitize_dataframe_columns_names(df: pd.DataFrame) -> pd.DataFrame:
151201
Examples
152202
--------
153203
>>> import awswrangler as wr
154-
>>> df_normalized = wr.catalog.sanitize_dataframe_columns_names(df=pd.DataFrame({'A': [1, 2]}))
204+
>>> df_normalized = wr.catalog.sanitize_dataframe_columns_names(df=pd.DataFrame({"A": [1, 2]}))
205+
>>> df_normalized_drop = wr.catalog.sanitize_dataframe_columns_names(
206+
df=pd.DataFrame({"A": [1, 2], "a": [3, 4]}), handle_duplicate_columns="drop"
207+
)
208+
>>> df_normalized_rename = wr.catalog.sanitize_dataframe_columns_names(
209+
df=pd.DataFrame({"A": [1, 2], "a": [3, 4], "a_1": [4, 6]}), handle_duplicate_columns="rename"
210+
)
155211
156212
"""
157213
df.columns = [sanitize_column_name(x) for x in df.columns]
158214
df.index.names = [None if x is None else sanitize_column_name(x) for x in df.index.names]
215+
if df.columns.duplicated().any(): # type: ignore
216+
if handle_duplicate_columns == "warn":
217+
warnings.warn(
218+
"Duplicate columns were detected, consider using `handle_duplicate_columns='[drop|rename]'`",
219+
UserWarning,
220+
)
221+
elif handle_duplicate_columns == "drop":
222+
df = drop_duplicated_columns(df)
223+
elif handle_duplicate_columns == "rename":
224+
df = rename_duplicated_columns(df)
225+
else:
226+
raise ValueError("handle_duplicate_columns must be one of ['warn', 'drop', 'rename']")
159227
return df
160228

161229

tests/test_athena.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,19 @@ def test_athena_read_list(glue_database):
246246
wr.athena.read_sql_query(sql="SELECT ARRAY[1, 2, 3]", database=glue_database, ctas_approach=False)
247247

248248

249+
def test_sanitize_dataframe_column_names():
    """Exercise the "warn", "drop" and "rename" modes of ``sanitize_dataframe_columns_names``."""
    # Default mode ("warn"): the duplicated names are kept and a UserWarning is emitted.
    # The pattern is the literal start of the message; the former r"Duplicate*" was a
    # glob-style pattern that, as a regex, only required "Duplicat".
    with pytest.warns(UserWarning, match=r"Duplicate columns were detected"):
        test_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
        test_df.columns = ["a", "a"]
        assert wr.catalog.sanitize_dataframe_columns_names(df=pd.DataFrame({"A": [1, 2], "a": [3, 4]})).equals(test_df)
    # "drop": only the first of the duplicated columns survives.
    assert wr.catalog.sanitize_dataframe_columns_names(
        df=pd.DataFrame({"A": [1, 2], "a": [3, 4]}), handle_duplicate_columns="drop"
    ).equals(pd.DataFrame({"a": [1, 2]}))
    # "rename": the generated "a_1" collides with the existing "a_1", which then becomes "a_1_1".
    assert wr.catalog.sanitize_dataframe_columns_names(
        df=pd.DataFrame({"A": [1, 2], "a": [3, 4], "a_1": [5, 6]}), handle_duplicate_columns="rename"
    ).equals(pd.DataFrame({"a": [1, 2], "a_1": [3, 4], "a_1_1": [5, 6]}))
260+
261+
249262
def test_sanitize_names():
250263
assert wr.catalog.sanitize_column_name("CamelCase") == "camelcase"
251264
assert wr.catalog.sanitize_column_name("CamelCase2") == "camelcase2"

0 commit comments

Comments
 (0)