Skip to content

Commit f6c523a

Browse files
authored
Merge pull request #163 from awslabs/column_name_sanitize
improving the column name sanitisation
2 parents d63375e + e8f21de commit f6c523a

File tree

3 files changed

+42
-19
lines changed

3 files changed

+42
-19
lines changed

awswrangler/catalog.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -730,26 +730,21 @@ def table(
730730

731731

732732
def _sanitize_name(name: str) -> str:
733-
name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn")
734-
name = name.replace("{", "_")
735-
name = name.replace("}", "_")
736-
name = name.replace("]", "_")
737-
name = name.replace("[", "_")
738-
name = name.replace(")", "_")
739-
name = name.replace("(", "_")
740-
name = name.replace(" ", "_")
741-
name = name.replace("-", "_")
742-
name = name.replace(".", "_")
743-
name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
744-
name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)
745-
return name.lower()
733+
name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn") # strip accents
734+
name = re.sub("[^A-Za-z0-9_]+", "_", name) # Replacing each run of non-alphanumeric characters with a single underscore
735+
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() # Converting CamelCase to snake_case
746736

747737

748738
def sanitize_column_name(column: str) -> str:
749739
"""Convert the column name to be compatible with Amazon Athena.
750740
751741
https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
752742
743+
Possible transformations:
744+
- Strip accents
745+
- Replace non-alphanumeric characters with underscores
746+
- Convert CamelCase to snake_case
747+
753748
Parameters
754749
----------
755750
column : str
@@ -775,6 +770,11 @@ def sanitize_dataframe_columns_names(df: pd.DataFrame) -> pd.DataFrame:
775770
776771
https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
777772
773+
Possible transformations:
774+
- Strip accents
775+
- Replace non-alphanumeric characters with underscores
776+
- Convert CamelCase to snake_case
777+
778778
Parameters
779779
----------
780780
df : pandas.DataFrame
@@ -800,6 +800,11 @@ def sanitize_table_name(table: str) -> str:
800800
801801
https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
802802
803+
Possible transformations:
804+
- Strip accents
805+
- Remove non alphanumeric characters
806+
- Convert CamelCase to snake_case
807+
803808
Parameters
804809
----------
805810
table : str

awswrangler/s3.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,11 @@ def to_parquet( # pylint: disable=too-many-arguments
530530
The concept of Dataset goes beyond the simple idea of files and enables more
531531
complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog).
532532
533+
Note
534+
----
535+
The table name and all column names will be automatically sanitized using
536+
`wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
537+
533538
Note
534539
----
535540
In case of `use_threads=True` the number of processes that will be spawned will be obtained from os.cpu_count().

testing/test_awswrangler/test_data_lake.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -553,12 +553,25 @@ def test_athena_read_list(database):
553553
wr.athena.read_sql_query(sql=f"SELECT ARRAY[1, 2, 3]", database=database, ctas_approach=False)
554554

555555

556-
def test_normalize_column_name():
557-
assert wr.catalog.sanitize_column_name("foo()__Boo))))____BAR") == "foo_____boo________bar"
558-
assert (
559-
wr.catalog.sanitize_column_name("foo()__Boo))))_{}{}{{}{}{}{___BAR[][][][]")
560-
== "foo_____boo____________________bar________"
561-
)
556+
def test_sanitize_names():
557+
assert wr.catalog.sanitize_column_name("CamelCase") == "camel_case"
558+
assert wr.catalog.sanitize_column_name("CamelCase2") == "camel_case2"
559+
assert wr.catalog.sanitize_column_name("Camel_Case3") == "camel_case3"
560+
assert wr.catalog.sanitize_column_name("Cámël_Casë4仮") == "camel_case4_"
561+
assert wr.catalog.sanitize_column_name("Camel__Case5") == "camel__case5"
562+
assert wr.catalog.sanitize_column_name("Camel{}Case6") == "camel_case6"
563+
assert wr.catalog.sanitize_column_name("Camel.Case7") == "camel_case7"
564+
assert wr.catalog.sanitize_column_name("xyz_cd") == "xyz_cd"
565+
assert wr.catalog.sanitize_column_name("xyz_Cd") == "xyz_cd"
566+
assert wr.catalog.sanitize_table_name("CamelCase") == "camel_case"
567+
assert wr.catalog.sanitize_table_name("CamelCase2") == "camel_case2"
568+
assert wr.catalog.sanitize_table_name("Camel_Case3") == "camel_case3"
569+
assert wr.catalog.sanitize_table_name("Cámël_Casë4仮") == "camel_case4_"
570+
assert wr.catalog.sanitize_table_name("Camel__Case5") == "camel__case5"
571+
assert wr.catalog.sanitize_table_name("Camel{}Case6") == "camel_case6"
572+
assert wr.catalog.sanitize_table_name("Camel.Case7") == "camel_case7"
573+
assert wr.catalog.sanitize_table_name("xyz_cd") == "xyz_cd"
574+
assert wr.catalog.sanitize_table_name("xyz_Cd") == "xyz_cd"
562575

563576

564577
def test_athena_ctas_empty(database):

0 commit comments

Comments
 (0)