Merge pull request #163 from awslabs/column_name_sanitize

igorborgest · web-flow · commit f6c523a14810 · 2020-04-11T15:49:51.000-03:00
improving the column name sanitisation
diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py
@@ -730,26 +730,21 @@ def table(
 
 
 def _sanitize_name(name: str) -> str:
-    name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn")
-    name = name.replace("{", "_")
-    name = name.replace("}", "_")
-    name = name.replace("]", "_")
-    name = name.replace("[", "_")
-    name = name.replace(")", "_")
-    name = name.replace("(", "_")
-    name = name.replace(" ", "_")
-    name = name.replace("-", "_")
-    name = name.replace(".", "_")
-    name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
-    name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)
-    return name.lower()
+    name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn")  # strip accents
+    name = re.sub("[^A-Za-z0-9_]+", "_", name)  # Collapse each run of non-alphanumeric characters into "_"
+    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()  # Converting CamelCase to snake_case
 
 
 def sanitize_column_name(column: str) -> str:
     """Convert the column name to be compatible with Amazon Athena.
 
     https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
 
+    Possible transformations:
+    - Strip accents
+    - Replace runs of non-alphanumeric characters with an underscore
+    - Convert CamelCase to snake_case
+
     Parameters
     ----------
     column : str
@@ -775,6 +770,11 @@ def sanitize_dataframe_columns_names(df: pd.DataFrame) -> pd.DataFrame:
 
     https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
 
+    Possible transformations:
+    - Strip accents
+    - Replace runs of non-alphanumeric characters with an underscore
+    - Convert CamelCase to snake_case
+
     Parameters
     ----------
     df : pandas.DataFrame
@@ -800,6 +800,11 @@ def sanitize_table_name(table: str) -> str:
 
     https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
 
+    Possible transformations:
+    - Strip accents
+    - Replace runs of non-alphanumeric characters with an underscore
+    - Convert CamelCase to snake_case
+
     Parameters
     ----------
     table : str
diff --git a/awswrangler/s3.py b/awswrangler/s3.py
@@ -530,6 +530,11 @@ def to_parquet(  # pylint: disable=too-many-arguments
     The concept of Dataset goes beyond the simple idea of files and enable more
     complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog).
 
+    Note
+    ----
+    The table name and all column names will be automatically sanitized using
+    `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
+
     Note
     ----
     In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py
@@ -553,12 +553,25 @@ def test_athena_read_list(database):
         wr.athena.read_sql_query(sql=f"SELECT ARRAY[1, 2, 3]", database=database, ctas_approach=False)
 
 
-def test_normalize_column_name():
-    assert wr.catalog.sanitize_column_name("foo()__Boo))))____BAR") == "foo_____boo________bar"
-    assert (
-        wr.catalog.sanitize_column_name("foo()__Boo))))_{}{}{{}{}{}{___BAR[][][][]")
-        == "foo_____boo____________________bar________"
-    )
+def test_sanitize_names():
+    assert wr.catalog.sanitize_column_name("CamelCase") == "camel_case"
+    assert wr.catalog.sanitize_column_name("CamelCase2") == "camel_case2"
+    assert wr.catalog.sanitize_column_name("Camel_Case3") == "camel_case3"
+    assert wr.catalog.sanitize_column_name("Cámël_Casë4仮") == "camel_case4_"
+    assert wr.catalog.sanitize_column_name("Camel__Case5") == "camel__case5"
+    assert wr.catalog.sanitize_column_name("Camel{}Case6") == "camel_case6"
+    assert wr.catalog.sanitize_column_name("Camel.Case7") == "camel_case7"
+    assert wr.catalog.sanitize_column_name("xyz_cd") == "xyz_cd"
+    assert wr.catalog.sanitize_column_name("xyz_Cd") == "xyz_cd"
+    assert wr.catalog.sanitize_table_name("CamelCase") == "camel_case"
+    assert wr.catalog.sanitize_table_name("CamelCase2") == "camel_case2"
+    assert wr.catalog.sanitize_table_name("Camel_Case3") == "camel_case3"
+    assert wr.catalog.sanitize_table_name("Cámël_Casë4仮") == "camel_case4_"
+    assert wr.catalog.sanitize_table_name("Camel__Case5") == "camel__case5"
+    assert wr.catalog.sanitize_table_name("Camel{}Case6") == "camel_case6"
+    assert wr.catalog.sanitize_table_name("Camel.Case7") == "camel_case7"
+    assert wr.catalog.sanitize_table_name("xyz_cd") == "xyz_cd"
+    assert wr.catalog.sanitize_table_name("xyz_Cd") == "xyz_cd"
 
 
 def test_athena_ctas_empty(database):