
Commit a5bbac0

committed
documenting the null object columns issue
1 parent 6fcb149

6 files changed (+89 -3 lines changed)


README.md

Lines changed: 28 additions & 0 deletions
@@ -220,6 +220,34 @@ session.athena.repair_table(database="db_name", table="tbl_name")
 
 ## Diving Deep
 
+
+### Pandas with null object columns (UndetectedType exception)
+
+Pandas has an overly generic "data type" named object. An object column can hold strings, dates, and just about anything else.
+We can handle object columns just fine by inferring the type of the values inside them; Pyarrow does that like a charm. The real problem starts when an object column is completely null, because then there is nothing left to infer from.
+
+To work with null object columns you can explicitly set the expected Athena data type for the target table:
+
+```py3
+import awswrangler
+import pandas as pd
+
+dataframe = pd.DataFrame({
+    "col": [1, 2],
+    "col_string_null": [None, None],
+    "col_date_null": [None, None],
+})
+session = awswrangler.Session()
+session.pandas.to_parquet(
+    dataframe=dataframe,
+    database="DATABASE",
+    path=f"s3://...",
+    cast_columns={
+        "col_string_null": "string",
+        "col_date_null": "date"
+    })
+```
+
 ### Pandas to Redshift Flow
 
 ![Pandas to Redshift Flow](docs/source/_static/pandas-to-redshift-flow.jpg?raw=true "Pandas to Redshift Flow")
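A quick way to confirm that the casts took effect is to read the table back through Athena and inspect the resulting dtypes. The sketch below is illustrative only: it assumes the read_sql_athena helper exposed by session.pandas in this version of awswrangler, and a hypothetical table name (my_table) for the dataset written above.

```py3
# Illustrative read-back check (not part of this commit).
# "my_table" is a hypothetical table name; adjust it to whatever the write registered.
import awswrangler

session = awswrangler.Session()
dataframe2 = session.pandas.read_sql_athena(
    sql="SELECT * FROM my_table",
    database="DATABASE",
)
print(dataframe2.dtypes)  # col_string_null / col_date_null come back with the declared types
```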

awswrangler/data_types.py

Lines changed: 4 additions & 1 deletion
@@ -3,7 +3,7 @@
 
 import pyarrow
 
-from awswrangler.exceptions import UnsupportedType
+from awswrangler.exceptions import UnsupportedType, UndetectedType
 
 logger = logging.getLogger(__name__)
 
@@ -160,6 +160,9 @@ def pyarrow2athena(dtype):
         return "date"
     elif dtype_str.startswith("list"):
         return f"array<{pyarrow2athena(dtype.value_type)}>"
+    elif dtype_str == "null":
+        raise UndetectedType(
+            "We can't infer the data type from an entirely null object column")
     else:
         raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
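The new branch fires on Pyarrow's "null" type, which is exactly what an all-None object column is inferred as. A minimal sketch of that condition (illustration only, not part of this commit):

```py3
# Illustration: the dtype string the new elif branch compares against.
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"col": [1, 2], "col_null": [None, None]})
table = pa.Table.from_pandas(df, preserve_index=False)
for field in table.schema:
    print(field.name, field.type)
# col      int64
# col_null null   <- str(dtype) == "null", so pyarrow2athena raises UndetectedType
```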

awswrangler/exceptions.py

Lines changed: 4 additions & 0 deletions
@@ -2,6 +2,10 @@ class UnsupportedType(Exception):
     pass
 
 
+class UndetectedType(Exception):
+    pass
+
+
 class UnsupportedFileFormat(Exception):
     pass

awswrangler/glue.py

Lines changed: 6 additions & 1 deletion
@@ -4,7 +4,7 @@
 
 from awswrangler import data_types
 from awswrangler.athena import Athena
-from awswrangler.exceptions import UnsupportedFileFormat, InvalidSerDe, ApiError, UnsupportedType
+from awswrangler.exceptions import UnsupportedFileFormat, InvalidSerDe, ApiError, UnsupportedType, UndetectedType
 
 logger = logging.getLogger(__name__)
 
@@ -194,6 +194,11 @@ def _build_schema(dataframe,
             else:
                 try:
                     athena_type = data_types.pyarrow2athena(dtype)
+                except UndetectedType:
+                    raise UndetectedType(
+                        f"We can't infer the data type from an entirely null object column ({name}). "
+                        f"Please consider passing the type of this column explicitly using the "
+                        f"cast_columns argument")
                 except UnsupportedType:
                     raise UnsupportedType(
                         f"Unsupported Pyarrow type for column {name}: {dtype}")

docs/source/divingdeep.rst

Lines changed: 29 additions & 0 deletions
@@ -3,6 +3,35 @@
 Diving Deep
 ===========
 
+Pandas with null object columns (UndetectedType exception)
+----------------------------------------------------------
+
+Pandas has an overly generic "data type" named object. An object column can hold strings, dates, and just about anything else.
+We can handle object columns just fine by inferring the type of the values inside them; Pyarrow does that like a charm. The real problem starts when an object column is completely null, because then there is nothing left to infer from.
+
+To work with null object columns you can explicitly set the expected Athena data type for the target table:
+
+.. code-block:: python
+
+    import awswrangler
+    import pandas as pd
+
+    dataframe = pd.DataFrame({
+        "col": [1, 2],
+        "col_string_null": [None, None],
+        "col_date_null": [None, None],
+    })
+    session = awswrangler.Session()
+    session.pandas.to_parquet(
+        dataframe=dataframe,
+        database="DATABASE",
+        path=f"s3://...",
+        cast_columns={
+            "col_string_null": "string",
+            "col_date_null": "date"
+        })
+
+
 Pandas to Redshift Flow
 -----------------------

testing/test_awswrangler/test_pandas.py

Lines changed: 18 additions & 1 deletion
@@ -9,7 +9,7 @@
 import numpy as np
 
 from awswrangler import Session, Pandas
-from awswrangler.exceptions import LineTerminatorNotFound, EmptyDataframe, InvalidSerDe, UnsupportedType
+from awswrangler.exceptions import LineTerminatorNotFound, EmptyDataframe, InvalidSerDe, UnsupportedType, UndetectedType
 
 logging.basicConfig(
     level=logging.INFO,
@@ -962,3 +962,20 @@ def test_to_parquet_casting_to_string(
     assert len(dataframe.index) == len(dataframe2.index)
     assert (len(list(dataframe.columns)) + 1) == len(list(dataframe2.columns))
     print(dataframe2)
+
+
+def test_to_parquet_casting_with_null_object(
+        session,
+        bucket,
+        database,
+):
+    dataframe = pd.DataFrame({
+        "a": [1, 2, 3],
+        "b": [4, 5, 6],
+        "col_null": [None, None, None],
+    })
+    with pytest.raises(UndetectedType):
+        assert session.pandas.to_parquet(dataframe=dataframe,
+                                         database=database,
+                                         path=f"s3://{bucket}/test/",
+                                         mode="overwrite")
