
Commit a5bbac0

committed
documenting the null object columns issue
1 parent 6fcb149

6 files changed (+89 -3 lines changed)


README.md

Lines changed: 28 additions & 0 deletions
@@ -220,6 +220,34 @@ session.athena.repair_table(database="db_name", table="tbl_name")
 
 ## Diving Deep
 
+
+### Pandas with null object columns (UndetectedType exception)
+
+Pandas has an overly generic "data type" named object. An object column can hold strings, dates, and just about anything else.
+We can handle object columns just fine by inferring the type of the values inside them; Pyarrow does that like a charm. The real problem starts when an object column is completely null, because then there is nothing left to infer from.
+
+To work with null object columns you can explicitly set the expected Athena data type for the target table:
+
+```py3
+import awswrangler
+import pandas as pd
+
+dataframe = pd.DataFrame({
+    "col": [1, 2],
+    "col_string_null": [None, None],
+    "col_date_null": [None, None],
+})
+session = awswrangler.Session()
+session.pandas.to_parquet(
+    dataframe=dataframe,
+    database="DATABASE",
+    path=f"s3://...",
+    cast_columns={
+        "col_string_null": "string",
+        "col_date_null": "date"
+    })
+```
+
 ### Pandas to Redshift Flow
 
 ![Pandas to Redshift Flow](docs/source/_static/pandas-to-redshift-flow.jpg?raw=true "Pandas to Redshift Flow")
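A quick way to confirm that the casts took effect is to read the table back through Athena and inspect the resulting dtypes. The sketch below is illustrative only: it assumes the read_sql_athena helper exposed by session.pandas in this version of awswrangler, and a hypothetical table name (my_table) for the dataset written above.

```py3
# Illustrative read-back check (not part of this commit).
# "my_table" is a hypothetical table name; adjust it to whatever the write registered.
import awswrangler

session = awswrangler.Session()
dataframe2 = session.pandas.read_sql_athena(
    sql="SELECT * FROM my_table",
    database="DATABASE",
)
print(dataframe2.dtypes)  # col_string_null / col_date_null come back with the declared types
```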

awswrangler/data_types.py

Lines changed: 4 additions & 1 deletion
@@ -3,7 +3,7 @@
 
 import pyarrow
 
-from awswrangler.exceptions import UnsupportedType
+from awswrangler.exceptions import UnsupportedType, UndetectedType
 
 logger = logging.getLogger(__name__)
 
@@ -160,6 +160,9 @@ def pyarrow2athena(dtype):
         return "date"
     elif dtype_str.startswith("list"):
         return f"array<{pyarrow2athena(dtype.value_type)}>"
+    elif dtype_str == "null":
+        raise UndetectedType(
+            "We can't infer the data type from an entirely null object column")
     else:
         raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
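The new branch fires on Pyarrow's "null" type, which is exactly what an all-None object column is inferred as. A minimal sketch of that condition (illustration only, not part of this commit):

```py3
# Illustration: the dtype string the new elif branch compares against.
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"col": [1, 2], "col_null": [None, None]})
table = pa.Table.from_pandas(df, preserve_index=False)
for field in table.schema:
    print(field.name, field.type)
# col      int64
# col_null null   <- str(dtype) == "null", so pyarrow2athena raises UndetectedType
```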

awswrangler/exceptions.py

Lines changed: 4 additions & 0 deletions
@@ -2,6 +2,10 @@ class UnsupportedType(Exception):
     pass
 
 
+class UndetectedType(Exception):
+    pass
+
+
 class UnsupportedFileFormat(Exception):
     pass

awswrangler/glue.py

Lines changed: 6 additions & 1 deletion
@@ -4,7 +4,7 @@
 
 from awswrangler import data_types
 from awswrangler.athena import Athena
-from awswrangler.exceptions import UnsupportedFileFormat, InvalidSerDe, ApiError, UnsupportedType
+from awswrangler.exceptions import UnsupportedFileFormat, InvalidSerDe, ApiError, UnsupportedType, UndetectedType
 
 logger = logging.getLogger(__name__)
 
@@ -194,6 +194,11 @@ def _build_schema(dataframe,
             else:
                 try:
                     athena_type = data_types.pyarrow2athena(dtype)
+                except UndetectedType:
+                    raise UndetectedType(
+                        f"We can't infer the data type from an entirely null object column ({name}). "
+                        f"Please consider passing the type of this column explicitly using the "
+                        f"cast_columns argument")
                 except UnsupportedType:
                     raise UnsupportedType(
                         f"Unsupported Pyarrow type for column {name}: {dtype}")

docs/source/divingdeep.rst

Lines changed: 29 additions & 0 deletions
@@ -3,6 +3,35 @@
 Diving Deep
 ===========
 
+Pandas with null object columns (UndetectedType exception)
+----------------------------------------------------------
+
+Pandas has an overly generic "data type" named object. An object column can hold strings, dates, and just about anything else.
+We can handle object columns just fine by inferring the type of the values inside them; Pyarrow does that like a charm. The real problem starts when an object column is completely null, because then there is nothing left to infer from.
+
+To work with null object columns you can explicitly set the expected Athena data type for the target table:
+
+.. code-block:: python
+
+    import awswrangler
+    import pandas as pd
+
+    dataframe = pd.DataFrame({
+        "col": [1, 2],
+        "col_string_null": [None, None],
+        "col_date_null": [None, None],
+    })
+    session = awswrangler.Session()
+    session.pandas.to_parquet(
+        dataframe=dataframe,
+        database="DATABASE",
+        path=f"s3://...",
+        cast_columns={
+            "col_string_null": "string",
+            "col_date_null": "date"
+        })
+
+
 Pandas to Redshift Flow
 -----------------------

testing/test_awswrangler/test_pandas.py

Lines changed: 18 additions & 1 deletion
@@ -9,7 +9,7 @@
 import numpy as np
 
 from awswrangler import Session, Pandas
-from awswrangler.exceptions import LineTerminatorNotFound, EmptyDataframe, InvalidSerDe, UnsupportedType
+from awswrangler.exceptions import LineTerminatorNotFound, EmptyDataframe, InvalidSerDe, UnsupportedType, UndetectedType
 
 logging.basicConfig(
     level=logging.INFO,
@@ -962,3 +962,20 @@ def test_to_parquet_casting_to_string(
     assert len(dataframe.index) == len(dataframe2.index)
     assert (len(list(dataframe.columns)) + 1) == len(list(dataframe2.columns))
     print(dataframe2)
+
+
+def test_to_parquet_casting_with_null_object(
+        session,
+        bucket,
+        database,
+):
+    dataframe = pd.DataFrame({
+        "a": [1, 2, 3],
+        "b": [4, 5, 6],
+        "col_null": [None, None, None],
+    })
+    with pytest.raises(UndetectedType):
+        assert session.pandas.to_parquet(dataframe=dataframe,
+                                         database=database,
+                                         path=f"s3://{bucket}/test/",
+                                         mode="overwrite")
