
Commit ec07957

Improving exception handling for empty columns type inference.
1 parent b8335e0

5 files changed: 52 additions & 6 deletions
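For context, a minimal sketch of the failure this commit improves the messaging for (bucket, table, and database names here are hypothetical):

import pandas as pd
import awswrangler as wr

# c1 holds only nulls, so no Athena type can be inferred for it.
df = pd.DataFrame({"c0": [1, 2, 3], "c1": [None, None, None]})

try:
    wr.s3.to_parquet(df, "s3://bucket/prefix/", dataset=True, table="tbl", database="db")
except wr.exceptions.UndetectedType as ex:
    print(ex)  # now names the column and suggests the astype()/dtype= workarounds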

awswrangler/_data_types.py

Lines changed: 13 additions & 1 deletion

@@ -361,7 +361,19 @@ def athena_types_from_pandas(
         if v is None:
             athena_columns_types[k] = casts[k].replace(" ", "")
         else:
-            athena_columns_types[k] = pyarrow2athena(dtype=v)
+            try:
+                athena_columns_types[k] = pyarrow2athena(dtype=v)
+            except exceptions.UndetectedType as ex:
+                raise exceptions.UndetectedType(
+                    "Impossible to infer the equivalent Athena data type "
+                    f"for the {k} column. "
+                    "It is completely empty (only null values) "
+                    f"and its data type is too generic ({df[k].dtype}). "
+                    "Please cast this column to a more deterministic data type "
+                    f"(e.g. df['{k}'] = df['{k}'].astype('string')) or "
+                    "pass the column schema as an argument to Wrangler "
+                    f"(e.g. dtype={{'{k}': 'string'}})."
+                ) from ex
     _logger.debug("athena_columns_types: %s", athena_columns_types)
     return athena_columns_types
 
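Continuing the sketch above, the two remedies the new message suggests, both taken from the message text itself (names remain hypothetical):

# Option 1: cast the all-null column so a concrete type can be inferred.
df["c1"] = df["c1"].astype("string")

# Option 2: leave the DataFrame as-is and declare the Athena type explicitly.
wr.s3.to_parquet(df, "s3://bucket/prefix/", dataset=True, table="tbl", database="db", dtype={"c1": "string"})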

awswrangler/athena/_read.py

Lines changed: 7 additions & 5 deletions

@@ -688,11 +688,13 @@ def read_sql_query(
 
     """
     if ctas_approach and data_source not in (None, "AwsDataCatalog"):
-        raise exceptions.InvalidArgumentCombination("Queries with ctas_approach=True (default) does not support "
-                                                    "data_source values different than None and 'AwsDataCatalog'. "
-                                                    "Please check the related tutorial for more details "
-                                                    "(https://github.com/awslabs/aws-data-wrangler/blob/master/"
-                                                    "tutorials/006%20-%20Amazon%20Athena.ipynb)")
+        raise exceptions.InvalidArgumentCombination(
+            "Queries with ctas_approach=True (the default) do not support "
+            "data_source values other than None and 'AwsDataCatalog'. "
+            "Please check the related tutorial for more details "
+            "(https://github.com/awslabs/aws-data-wrangler/blob/master/"
+            "tutorials/006%20-%20Amazon%20Athena.ipynb)."
+        )
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
 
     cache_info: _CacheInfo = _check_for_cached_results(
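A sketch of the combination this guard rejects and its valid counterparts (database and catalog names are hypothetical):

import awswrangler as wr

# Fine: ctas_approach=True (the default) against the default AwsDataCatalog.
df = wr.athena.read_sql_query("SELECT 1", database="db")

# Raises InvalidArgumentCombination: a non-default data_source with the CTAS path.
# df = wr.athena.read_sql_query("SELECT 1", database="db", data_source="my_catalog")

# Fine: a non-default data_source once the CTAS path is disabled.
df = wr.athena.read_sql_query("SELECT 1", database="db", data_source="my_catalog", ctas_approach=False)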

tests/test_athena_csv.py

Lines changed: 11 additions & 0 deletions

@@ -364,3 +364,14 @@ def test_skip_header(path, glue_database, glue_table, use_threads, ctas_approach
     )
     df2 = wr.athena.read_sql_table(glue_table, glue_database, use_threads=use_threads, ctas_approach=ctas_approach)
     assert df.equals(df2)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_empty_column(path, glue_table, glue_database, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [None, None, None], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    with pytest.raises(wr.exceptions.UndetectedType):
+        wr.s3.to_csv(
+            df, path, index=False, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"]
+        )
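Since the new message embeds the offending column's name, the assertion could be tightened to match on it (a hypothetical variant, not part of this commit):

with pytest.raises(wr.exceptions.UndetectedType, match="c1"):
    wr.s3.to_csv(df, path, index=False, dataset=True, table=glue_table, database=glue_database)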

tests/test_athena_parquet.py

Lines changed: 9 additions & 0 deletions

@@ -627,3 +627,12 @@ def test_partitions_overwrite(path, glue_table, glue_database, use_threads, part
     ensure_data_types(df2, has_list=True)
     assert df2.shape == (3, 19)
     assert df.iint8.sum() == df2.iint8.sum()
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_empty_column(path, glue_table, glue_database, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [None, None, None], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    with pytest.raises(wr.exceptions.UndetectedType):
+        wr.s3.to_parquet(df, path, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"])

tests/test_s3_parquet.py

Lines changed: 12 additions & 0 deletions

@@ -424,3 +424,15 @@ def test_timezone_raw_values(path, use_threads):
     df2["par"] = df2["par"].astype("string")
     df3["par"] = df3["par"].astype("string")
     assert df2.equals(df3)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_empty_column(path, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [None, None, None], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    paths = wr.s3.to_parquet(df, path, dataset=True, partition_cols=["par"])["paths"]
+    wr.s3.wait_objects_exist(paths, use_threads=use_threads)
+    df2 = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
+    df2["par"] = df2["par"].astype("string")
+    assert df.equals(df2)
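Note the asymmetry this last test pins down: writing the same all-null column to plain S3, with no Glue table attached, is expected to round-trip cleanly, since Athena type inference (and therefore the new UndetectedType path) only runs when a catalog table is involved.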
