22import logging
33import re
44import unicodedata
5+ import warnings
56from typing import Any , Dict , List , Optional , Tuple
67
78import boto3
@@ -124,8 +125,52 @@ def sanitize_column_name(column: str) -> str:
124125 return _sanitize_name (name = column )
125126
126127
127- def sanitize_dataframe_columns_names (df : pd .DataFrame ) -> pd .DataFrame :
128- """Normalize all columns names to be compatible with Amazon Athena and the AWS Glue Catalog.
def rename_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Append an incremental number to duplicate column names to conform with Amazon Athena.

    The first occurrence of a label is left untouched; its n-th repeat
    (n >= 1, counted left to right) is renamed to ``<label>_<n>``.

    Note
    ----
    This transformation will run `inplace` and will make changes to the original DataFrame.

    Note
    ----
    Also handles potential new column duplicate conflicts by appending an additional `_n`:
    the renaming pass is repeated until no duplicated label is left.

    Note
    ----
    Labels that are not strings (e.g. integers) are supported; a renamed
    repeat of such a label becomes a string (``0`` -> ``"0_1"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Original Pandas DataFrame.

    Returns
    -------
    pandas.DataFrame
        DataFrame with duplicated column names renamed (the same object that was passed in).

    Examples
    --------
    >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [4, 6]})
    >>> df.columns = ['a', 'a', 'a_1']
    >>> wr.catalog.rename_duplicated_columns(df=df)
       a  a_1  a_1_1
    0  1    3      4
    1  2    4      6
    """
    # A pass can itself create a clash (e.g. ["a", "a", "a_1"] -> ["a", "a_1", "a_1"]),
    # so keep going until every label is unique.
    while df.columns.duplicated().any():
        occurrences: Dict[Any, int] = {}
        new_labels: List[Any] = []
        for label in df.columns:
            repeat = occurrences.get(label, 0)
            occurrences[label] = repeat + 1
            # f-string formatting (rather than `label + "_n"`) keeps non-string labels working.
            new_labels.append(f"{label}_{repeat}" if repeat else label)
        # Assigning the axis mutates the caller's DataFrame; keep the columns index name as-is.
        df.columns = pd.Index(new_labels, name=df.columns.name, tupleize_cols=False)
    return df
168+
169+
def sanitize_dataframe_columns_names(
    df: pd.DataFrame, handle_duplicate_columns: Optional[str] = "warn"
) -> pd.DataFrame:
    """Normalize all columns names to be compatible with Amazon Athena.

    https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html

    Every column label and every named index level is passed through
    `sanitize_column_name`; unnamed index levels are left as `None`.

    Note
    ----
    The column labels and index names are reassigned on the DataFrame that is passed in,
    so the caller's object is modified.

    Note
    ----
    Sanitizing can leave several columns with the same name.
    `handle_duplicate_columns` selects what happens in that case.

    Parameters
    ----------
    df : pandas.DataFrame
        Original Pandas DataFrame.
    handle_duplicate_columns : str, optional
        How to handle duplicate columns. Can be "warn" or "drop" or "rename".
        "warn" will emit a `UserWarning` and keep the duplicated columns.
        "drop" will drop all but the first duplicated column.
        "rename" will rename all duplicated columns with an incremental number.
        Defaults to "warn". Only consulted when duplicated columns are present.

    Returns
    -------
    pandas.DataFrame
        DataFrame with sanitized column and index names.

    Raises
    ------
    ValueError
        If duplicated columns are present and `handle_duplicate_columns`
        is not one of "warn", "drop" or "rename".

    Examples
    --------
    >>> import awswrangler as wr
    >>> df_normalized = wr.catalog.sanitize_dataframe_columns_names(df=pd.DataFrame({"A": [1, 2]}))
    >>> df_normalized_drop = wr.catalog.sanitize_dataframe_columns_names(
            df=pd.DataFrame({"A": [1, 2], "a": [3, 4]}), handle_duplicate_columns="drop"
        )
    >>> df_normalized_rename = wr.catalog.sanitize_dataframe_columns_names(
            df=pd.DataFrame({"A": [1, 2], "a": [3, 4], "a_1": [4, 6]}), handle_duplicate_columns="rename"
        )

    """
    df.columns = [sanitize_column_name(x) for x in df.columns]
    df.index.names = [None if x is None else sanitize_column_name(x) for x in df.index.names]
    if df.columns.duplicated().any():  # type: ignore
        if handle_duplicate_columns == "warn":
            warnings.warn(
                "Duplicate columns were detected, consider using `handle_duplicate_columns='[drop|rename]'`",
                UserWarning,
                # Attribute the warning to the caller's line rather than to this module.
                stacklevel=2,
            )
        elif handle_duplicate_columns == "drop":
            df = drop_duplicated_columns(df)
        elif handle_duplicate_columns == "rename":
            df = rename_duplicated_columns(df)
        else:
            raise ValueError("handle_duplicate_columns must be one of ['warn', 'drop', 'rename']")
    return df
160228
161229
0 commit comments