Commit 630b420

Improving exception handling for columns with mixed data types.
1 parent: ec07957

4 files changed: 42 additions & 1 deletion

awswrangler/_data_types.py

Lines changed: 13 additions & 1 deletion
@@ -293,6 +293,18 @@ def pyarrow_types_from_pandas(
             schema: pa.Schema = pa.Schema.from_pandas(df=df[[col]], preserve_index=False)
         except pa.ArrowInvalid as ex:
             cols_dtypes[col] = process_not_inferred_dtype(ex)
+        except TypeError as ex:
+            msg = str(ex)
+            if " is required (got type " in msg:
+                raise TypeError(
+                    f"The {col} columns has a too generic data type ({df[col].dtype}) and seems "
+                    f"to have mixed data types ({msg}). "
+                    "Please, cast this columns with a more deterministic data type "
+                    f"(e.g. df['{col}'] = df['{col}'].astype('string')) or "
+                    "pass the column schema as argument for AWS Data Wrangler "
+                    f"(e.g. dtype={{'{col}': 'string'}}"
+                ) from ex
+            raise
         else:
             cols_dtypes[col] = schema.field(col).type

@@ -371,7 +383,7 @@ def athena_types_from_pandas(
                     f"and has a too generic data type ({df[k].dtype}). "
                     "Please, cast this columns with a more deterministic data type "
                     f"(e.g. df['{k}'] = df['{k}'].astype('string')) or "
-                    "pass the column schema as argument for Wrangler "
+                    "pass the column schema as argument for AWS Data Wrangler "
                     f"(e.g. dtype={{'{k}': 'string'}}"
                 ) from ex
     _logger.debug("athena_columns_types: %s", athena_columns_types)
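
For context, here is a minimal sketch of the failure mode the new except-branch targets. The column name c1 and its values are illustrative, not taken from the commit, and the exact exception raised by pyarrow varies by version; the commit itself only guarantees that a TypeError containing " is required (got type " signals a mixed-type column.

import pandas as pd
import pyarrow as pa

# A column mixing ints and strings gets the generic pandas dtype "object",
# which pyarrow cannot map onto a single Arrow type.
df = pd.DataFrame({"c1": [1, 2, "foo"]})

try:
    pa.Schema.from_pandas(df=df[["c1"]], preserve_index=False)
except pa.ArrowInvalid as ex:
    print(f"ArrowInvalid: {ex}")  # raised by some pyarrow versions
except TypeError as ex:
    # Other versions raise a TypeError such as
    # "an integer is required (got type str)", which contains the
    # " is required (got type " substring the new branch matches on.
    print(f"TypeError: {ex}")

# The remedy the new message suggests: cast to a deterministic dtype
# before the conversion, after which schema inference succeeds.
df["c1"] = df["c1"].astype("string")
print(pa.Schema.from_pandas(df=df[["c1"]], preserve_index=False))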

tests/test_athena_csv.py

Lines changed: 11 additions & 0 deletions
@@ -375,3 +375,14 @@ def test_empty_column(path, glue_table, glue_database, use_threads):
     wr.s3.to_csv(
         df, path, index=False, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"]
     )
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_mixed_types_column(path, glue_table, glue_database, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [1, 2, "foo"], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    with pytest.raises(TypeError):
+        wr.s3.to_csv(
+            df, path, index=False, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"]
+        )

tests/test_athena_parquet.py

Lines changed: 9 additions & 0 deletions
@@ -636,3 +636,12 @@ def test_empty_column(path, glue_table, glue_database, use_threads):
     df["par"] = df["par"].astype("string")
     with pytest.raises(wr.exceptions.UndetectedType):
         wr.s3.to_parquet(df, path, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"])
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_mixed_types_column(path, glue_table, glue_database, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [1, 2, "foo"], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    with pytest.raises(TypeError):
+        wr.s3.to_parquet(df, path, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"])

tests/test_s3_parquet.py

Lines changed: 9 additions & 0 deletions
@@ -436,3 +436,12 @@ def test_empty_column(path, use_threads):
     df2 = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
     df2["par"] = df2["par"].astype("string")
     assert df.equals(df2)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_mixed_types_column(path, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [1, 2, "foo"], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    with pytest.raises(TypeError):
+        wr.s3.to_parquet(df, path, dataset=True, partition_cols=["par"])
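
All three tests pin down the same failure path from the caller's side. The remedies named in the new error message would look roughly like this in user code; this is a sketch, with a placeholder bucket path, and dtype is the existing column-schema argument the message refers to:

import pandas as pd
import awswrangler as wr

df = pd.DataFrame({"c0": [1, 2, 3], "c1": [1, 2, "foo"]})

# Option 1: cast the mixed column to a deterministic dtype up front.
df["c1"] = df["c1"].astype("string")
wr.s3.to_parquet(df, "s3://my-bucket/key.parquet")  # placeholder path

# Option 2: leave the frame as-is and pass an explicit column schema,
# mapping the column name to an Athena/Glue type.
# wr.s3.to_parquet(df, "s3://my-bucket/key.parquet", dtype={"c1": "string"})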
