
Commit ec07957

Improving exception handling for empty columns type inference.
1 parent b8335e0

5 files changed: 52 additions & 6 deletions
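For context, a minimal sketch of the failure this commit improves the messaging for (bucket, table, and database names here are hypothetical):

import pandas as pd
import awswrangler as wr

# c1 holds only nulls, so no Athena type can be inferred for it.
df = pd.DataFrame({"c0": [1, 2, 3], "c1": [None, None, None]})

try:
    wr.s3.to_parquet(df, "s3://bucket/prefix/", dataset=True, table="tbl", database="db")
except wr.exceptions.UndetectedType as ex:
    print(ex)  # now names the column and suggests the astype()/dtype= workarounds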

awswrangler/_data_types.py

Lines changed: 13 additions & 1 deletion

@@ -361,7 +361,19 @@ def athena_types_from_pandas(
         if v is None:
             athena_columns_types[k] = casts[k].replace(" ", "")
         else:
-            athena_columns_types[k] = pyarrow2athena(dtype=v)
+            try:
+                athena_columns_types[k] = pyarrow2athena(dtype=v)
+            except exceptions.UndetectedType as ex:
+                raise exceptions.UndetectedType(
+                    "Impossible to infer the equivalent Athena data type "
+                    f"for the {k} column. "
+                    "It is completely empty (only null values) "
+                    f"and its data type is too generic ({df[k].dtype}). "
+                    "Please cast this column to a more deterministic data type "
+                    f"(e.g. df['{k}'] = df['{k}'].astype('string')) or "
+                    "pass the column schema as an argument to Wrangler "
+                    f"(e.g. dtype={{'{k}': 'string'}})."
+                ) from ex
     _logger.debug("athena_columns_types: %s", athena_columns_types)
     return athena_columns_types
 
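Continuing the sketch above, the two remedies the new message suggests, both taken from the message text itself (names remain hypothetical):

# Option 1: cast the all-null column so a concrete type can be inferred.
df["c1"] = df["c1"].astype("string")

# Option 2: leave the DataFrame as-is and declare the Athena type explicitly.
wr.s3.to_parquet(df, "s3://bucket/prefix/", dataset=True, table="tbl", database="db", dtype={"c1": "string"})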

awswrangler/athena/_read.py

Lines changed: 7 additions & 5 deletions

@@ -688,11 +688,13 @@ def read_sql_query(
 
     """
     if ctas_approach and data_source not in (None, "AwsDataCatalog"):
-        raise exceptions.InvalidArgumentCombination("Queries with ctas_approach=True (default) does not support "
-                                                    "data_source values different than None and 'AwsDataCatalog'. "
-                                                    "Please check the related tutorial for more details "
-                                                    "(https://github.com/awslabs/aws-data-wrangler/blob/master/"
-                                                    "tutorials/006%20-%20Amazon%20Athena.ipynb)")
+        raise exceptions.InvalidArgumentCombination(
+            "Queries with ctas_approach=True (the default) do not support "
+            "data_source values other than None and 'AwsDataCatalog'. "
+            "Please check the related tutorial for more details "
+            "(https://github.com/awslabs/aws-data-wrangler/blob/master/"
+            "tutorials/006%20-%20Amazon%20Athena.ipynb)."
+        )
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
 
     cache_info: _CacheInfo = _check_for_cached_results(
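A sketch of the combination this guard rejects and its valid counterparts (database and catalog names are hypothetical):

import awswrangler as wr

# Fine: ctas_approach=True (the default) against the default AwsDataCatalog.
df = wr.athena.read_sql_query("SELECT 1", database="db")

# Raises InvalidArgumentCombination: a non-default data_source with the CTAS path.
# df = wr.athena.read_sql_query("SELECT 1", database="db", data_source="my_catalog")

# Fine: a non-default data_source once the CTAS path is disabled.
df = wr.athena.read_sql_query("SELECT 1", database="db", data_source="my_catalog", ctas_approach=False)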

tests/test_athena_csv.py

Lines changed: 11 additions & 0 deletions

@@ -364,3 +364,14 @@ def test_skip_header(path, glue_database, glue_table, use_threads, ctas_approach
     )
     df2 = wr.athena.read_sql_table(glue_table, glue_database, use_threads=use_threads, ctas_approach=ctas_approach)
     assert df.equals(df2)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_empty_column(path, glue_table, glue_database, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [None, None, None], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    with pytest.raises(wr.exceptions.UndetectedType):
+        wr.s3.to_csv(
+            df, path, index=False, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"]
+        )
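Since the new message embeds the offending column's name, the assertion could be tightened to match on it (a hypothetical variant, not part of this commit):

with pytest.raises(wr.exceptions.UndetectedType, match="c1"):
    wr.s3.to_csv(df, path, index=False, dataset=True, table=glue_table, database=glue_database)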

tests/test_athena_parquet.py

Lines changed: 9 additions & 0 deletions

@@ -627,3 +627,12 @@ def test_partitions_overwrite(path, glue_table, glue_database, use_threads, part
     ensure_data_types(df2, has_list=True)
     assert df2.shape == (3, 19)
     assert df.iint8.sum() == df2.iint8.sum()
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_empty_column(path, glue_table, glue_database, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [None, None, None], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    with pytest.raises(wr.exceptions.UndetectedType):
+        wr.s3.to_parquet(df, path, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"])

tests/test_s3_parquet.py

Lines changed: 12 additions & 0 deletions

@@ -424,3 +424,15 @@ def test_timezone_raw_values(path, use_threads):
     df2["par"] = df2["par"].astype("string")
     df3["par"] = df3["par"].astype("string")
     assert df2.equals(df3)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_empty_column(path, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [None, None, None], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    paths = wr.s3.to_parquet(df, path, dataset=True, partition_cols=["par"])["paths"]
+    wr.s3.wait_objects_exist(paths, use_threads=use_threads)
+    df2 = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
+    df2["par"] = df2["par"].astype("string")
+    assert df.equals(df2)
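Note the asymmetry this last test pins down: writing the same all-null column to plain S3, with no Glue table attached, is expected to round-trip cleanly, since Athena type inference (and therefore the new UndetectedType path) only runs when a catalog table is involved.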
