
Commit e3a522a

Allowing table names with double underscore
1 parent 96d15bf commit e3a522a

File tree

6 files changed: 27 additions & 27 deletions

awswrangler/athena.py
awswrangler/catalog.py
awswrangler/db.py
requirements.txt
testing/test_awswrangler/test_data_lake.py
testing/test_awswrangler/test_db.py

awswrangler/athena.py

Lines changed: 3 additions & 3 deletions
@@ -235,7 +235,7 @@ def repair_table(
     >>> query_final_state = wr.athena.repair_table(table='...', database='...')

     """
-    query = f"MSCK REPAIR TABLE {table};"
+    query = f"MSCK REPAIR TABLE `{table}`;"
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
     query_id = start_query_execution(
         sql=query,
@@ -456,7 +456,7 @@ def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
             else:
                 dfs = _utils.empty_generator()
         else:
-            s3.wait_objects_exist(paths=paths, use_threads=use_threads, boto3_session=session)
+            s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
             dfs = s3.read_parquet(path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked)
         return dfs
     dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
@@ -633,7 +633,7 @@ def read_sql_table(

     """
     return read_sql_query(
-        sql=f"SELECT * FROM {table}",
+        sql=f'SELECT * FROM "{table}"',
         database=database,
         ctas_approach=ctas_approach,
         chunksize=chunksize,
awswrangler/catalog.py

Lines changed: 1 addition & 5 deletions
@@ -742,11 +742,7 @@ def _sanitize_name(name: str) -> str:
     name = name.replace(".", "_")
     name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
     name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)
-    name = name.lower()
-    name = re.sub(r"(_)\1+", "\\1", name)  # remove repeated underscores
-    name = name[1:] if name.startswith("_") else name  # remove leading underscores
-    name = name[:-1] if name.endswith("_") else name  # remove trailing underscores
-    return name
+    return name.lower()


 def sanitize_column_name(column: str) -> str:
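With the trimming steps deleted, sanitization now stops after snake-casing and lowercasing, so leading, trailing, and repeated underscores all survive. A standalone sketch of the post-commit behavior, mirroring the lines kept above:

    import re

    def _sanitize_name(name: str) -> str:
        # Snake-case and lowercase, but keep every underscore verbatim.
        name = name.replace(".", "_")
        name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
        name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)
        return name.lower()

    print(_sanitize_name("__test_athena"))  # "__test_athena"; the old code returned "test_athena"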

awswrangler/db.py

Lines changed: 3 additions & 2 deletions
@@ -522,7 +522,7 @@ def copy_to_redshift(  # pylint: disable=too-many-arguments
         boto3_session=session,
         s3_additional_kwargs=s3_additional_kwargs,
     )["paths"]
-    s3.wait_objects_exist(paths=paths, use_threads=use_threads, boto3_session=session)
+    s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
     copy_files_to_redshift(
         path=paths,
         manifest_directory=_utils.get_directory(path=path),
@@ -642,6 +642,7 @@ def copy_files_to_redshift(  # pylint: disable=too-many-locals,too-many-argument
     write_redshift_copy_manifest(
         manifest_path=manifest_path, paths=paths, use_threads=use_threads, boto3_session=session
     )
+    s3.wait_objects_exist(paths=paths + [manifest_path], use_threads=False, boto3_session=session)
     athena_types, _ = s3.read_parquet_metadata(
         path=paths, dataset=False, use_threads=use_threads, boto3_session=session
     )
@@ -953,7 +954,7 @@ def unload_redshift(
     paths: List[str] = unload_redshift_to_files(
         sql=sql, path=path, con=con, iam_role=iam_role, use_threads=use_threads, boto3_session=session
     )
-    s3.wait_objects_exist(paths=paths, use_threads=use_threads, boto3_session=session)
+    s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
     if chunked is False:
         if not paths:  # pragma: no cover
             return pd.DataFrame()
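All three call sites now hardcode use_threads=False for the waiter only; the surrounding reads, writes, and metadata calls still honor the caller's use_threads. A minimal sketch of the same write-wait-read pattern against the awswrangler 1.x API (the bucket name is a placeholder):

    import pandas as pd
    import awswrangler as wr

    df = pd.DataFrame({"id": [1, 2, 3]})

    # "my-bucket" is hypothetical; any writable bucket works.
    paths = wr.s3.to_parquet(df=df, path="s3://my-bucket/staging/", dataset=True)["paths"]

    # Wait for the objects serially, as this commit does, before reading them back.
    wr.s3.wait_objects_exist(paths=paths, use_threads=False)

    df2 = wr.s3.read_parquet(path=paths)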

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@ boto3>=1.10.34
 s3fs~=0.4.2
 psycopg2-binary~=2.8.5
 pymysql~=0.9.3
-SQLAlchemy~=1.3.15
+SQLAlchemy==1.3.15
 sqlalchemy-redshift~=0.7.7
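The switch from ~= to == tightens the constraint: a compatible-release specifier accepts any later 1.3.x patch release, while the exact pin locks 1.3.15 alone. The PEP 440 semantics can be checked with the packaging library:

    from packaging.specifiers import SpecifierSet

    assert "1.3.20" in SpecifierSet("~=1.3.15")      # compatible release: any 1.3.x >= 1.3.15
    assert "1.3.20" not in SpecifierSet("==1.3.15")  # exact pin: 1.3.15 only
    assert "1.3.15" in SpecifierSet("==1.3.15")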

testing/test_awswrangler/test_data_lake.py

Lines changed: 14 additions & 11 deletions
@@ -130,12 +130,12 @@ def test_athena(bucket, database, kms_key, workgroup_secondary):
         dataset=True,
         mode="overwrite",
         database=database,
-        table="test_athena",
+        table="__test_athena",
         partition_cols=["par0", "par1"],
     )["paths"]
     wr.s3.wait_objects_exist(paths=paths, use_threads=False)
     dfs = wr.athena.read_sql_query(
-        sql="SELECT * FROM test_athena",
+        sql="SELECT * FROM __test_athena",
         database=database,
         ctas_approach=False,
         chunksize=1,
@@ -147,12 +147,12 @@ def test_athena(bucket, database, kms_key, workgroup_secondary):
     print(df2)
     ensure_data_types(df=df2)
     df = wr.athena.read_sql_query(
-        sql="SELECT * FROM test_athena", database=database, ctas_approach=False, workgroup=workgroup_secondary
+        sql="SELECT * FROM __test_athena", database=database, ctas_approach=False, workgroup=workgroup_secondary
     )
     assert len(df.index) == 3
     ensure_data_types(df=df)
-    wr.athena.repair_table(table="test_athena", database=database)
-    wr.catalog.delete_table_if_exists(database=database, table="test_athena")
+    wr.athena.repair_table(table="__test_athena", database=database)
+    wr.catalog.delete_table_if_exists(database=database, table="__test_athena")
     wr.s3.delete_objects(path=paths)
     wr.s3.wait_objects_not_exist(paths=paths)
     wr.s3.delete_objects(path=f"s3://{bucket}/athena_workgroup_secondary/")
@@ -361,7 +361,7 @@ def test_parquet_catalog_casting(bucket, database):
         dataset=True,
         mode="overwrite",
         database=database,
-        table="test_parquet_catalog_casting",
+        table="__test_parquet_catalog_casting",
         dtype={
             "iint8": "tinyint",
             "iint16": "smallint",
@@ -385,16 +385,16 @@ def test_parquet_catalog_casting(bucket, database):
     assert len(df.index) == 3
     assert len(df.columns) == 15
     ensure_data_types(df=df, has_list=False)
-    df = wr.athena.read_sql_table(table="test_parquet_catalog_casting", database=database, ctas_approach=True)
+    df = wr.athena.read_sql_table(table="__test_parquet_catalog_casting", database=database, ctas_approach=True)
     assert len(df.index) == 3
     assert len(df.columns) == 15
     ensure_data_types(df=df, has_list=False)
-    df = wr.athena.read_sql_table(table="test_parquet_catalog_casting", database=database, ctas_approach=False)
+    df = wr.athena.read_sql_table(table="__test_parquet_catalog_casting", database=database, ctas_approach=False)
     assert len(df.index) == 3
     assert len(df.columns) == 15
     ensure_data_types(df=df, has_list=False)
     wr.s3.delete_objects(path=path)
-    assert wr.catalog.delete_table_if_exists(database=database, table="test_parquet_catalog_casting") is True
+    assert wr.catalog.delete_table_if_exists(database=database, table="__test_parquet_catalog_casting") is True


 def test_catalog(bucket, database):
@@ -552,8 +552,11 @@ def test_athena_read_list(database):


 def test_normalize_column_name():
-    assert wr.catalog.sanitize_column_name("foo()__Boo))))____BAR") == "foo_boo_bar"
-    assert wr.catalog.sanitize_column_name("foo()__Boo))))_{}{}{{}{}{}{___BAR[][][][]") == "foo_boo_bar"
+    assert wr.catalog.sanitize_column_name("foo()__Boo))))____BAR") == "foo_____boo________bar"
+    assert (
+        wr.catalog.sanitize_column_name("foo()__Boo))))_{}{}{{}{}{}{___BAR[][][][]")
+        == "foo_____boo____________________bar________"
+    )


 def test_athena_ctas_empty(database):
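The rewritten expectations spell out the new contract: each special character still maps to one underscore, but runs are no longer collapsed and edge underscores are no longer trimmed. In the first assertion, for example, the ( and ) plus the two literal underscores yield four underscores, and snake-casing Boo inserts a fifth, giving foo_____boo; the four closing parentheses plus the four literal underscores produce the eight underscores before bar.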

testing/test_awswrangler/test_db.py

Lines changed: 5 additions & 5 deletions
@@ -150,12 +150,12 @@ def test_redshift_copy_unload(bucket, parameters):
         path=path,
         con=engine,
         schema="public",
-        table="test_redshift_copy",
+        table="__test_redshift_copy",
         mode="overwrite",
         iam_role=parameters["redshift"]["role"],
     )
     df2 = wr.db.unload_redshift(
-        sql="SELECT * FROM public.test_redshift_copy",
+        sql="SELECT * FROM public.__test_redshift_copy",
         con=engine,
         iam_role=parameters["redshift"]["role"],
         path=path,
@@ -168,12 +168,12 @@ def test_redshift_copy_unload(bucket, parameters):
         path=path,
         con=engine,
         schema="public",
-        table="test_redshift_copy",
+        table="__test_redshift_copy",
         mode="append",
         iam_role=parameters["redshift"]["role"],
     )
     df2 = wr.db.unload_redshift(
-        sql="SELECT * FROM public.test_redshift_copy",
+        sql="SELECT * FROM public.__test_redshift_copy",
         con=engine,
         iam_role=parameters["redshift"]["role"],
         path=path,
@@ -182,7 +182,7 @@ def test_redshift_copy_unload(bucket, parameters):
     assert len(df2.index) == 6
     ensure_data_types(df=df2, has_list=False)
     dfs = wr.db.unload_redshift(
-        sql="SELECT * FROM public.test_redshift_copy",
+        sql="SELECT * FROM public.__test_redshift_copy",
         con=engine,
         iam_role=parameters["redshift"]["role"],
         path=path,
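No quoting change was needed on the Redshift side: as in PostgreSQL, unquoted Redshift identifiers may begin with an underscore, so public.__test_redshift_copy parses as-is.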
