Commit 630b420

Improving exception handling for columns with mixed data types.
1 parent: ec07957

4 files changed: 42 additions & 1 deletion

awswrangler/_data_types.py

Lines changed: 13 additions & 1 deletion
@@ -293,6 +293,18 @@ def pyarrow_types_from_pandas(
             schema: pa.Schema = pa.Schema.from_pandas(df=df[[col]], preserve_index=False)
         except pa.ArrowInvalid as ex:
             cols_dtypes[col] = process_not_inferred_dtype(ex)
+        except TypeError as ex:
+            msg = str(ex)
+            if " is required (got type " in msg:
+                raise TypeError(
+                    f"The {col} columns has a too generic data type ({df[col].dtype}) and seems "
+                    f"to have mixed data types ({msg}). "
+                    "Please, cast this columns with a more deterministic data type "
+                    f"(e.g. df['{col}'] = df['{col}'].astype('string')) or "
+                    "pass the column schema as argument for AWS Data Wrangler "
+                    f"(e.g. dtype={{'{col}': 'string'}}"
+                ) from ex
+            raise
         else:
             cols_dtypes[col] = schema.field(col).type

@@ -371,7 +383,7 @@ def athena_types_from_pandas(
                     f"and has a too generic data type ({df[k].dtype}). "
                     "Please, cast this columns with a more deterministic data type "
                     f"(e.g. df['{k}'] = df['{k}'].astype('string')) or "
-                    "pass the column schema as argument for Wrangler "
+                    "pass the column schema as argument for AWS Data Wrangler "
                     f"(e.g. dtype={{'{k}': 'string'}}"
                 ) from ex
     _logger.debug("athena_columns_types: %s", athena_columns_types)
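
For context, here is a minimal sketch of the failure mode the new except-branch targets. The column name c1 and its values are illustrative, not taken from the commit, and the exact exception raised by pyarrow varies by version; the commit itself only guarantees that a TypeError containing " is required (got type " signals a mixed-type column.

import pandas as pd
import pyarrow as pa

# A column mixing ints and strings gets the generic pandas dtype "object",
# which pyarrow cannot map onto a single Arrow type.
df = pd.DataFrame({"c1": [1, 2, "foo"]})

try:
    pa.Schema.from_pandas(df=df[["c1"]], preserve_index=False)
except pa.ArrowInvalid as ex:
    print(f"ArrowInvalid: {ex}")  # raised by some pyarrow versions
except TypeError as ex:
    # Other versions raise a TypeError such as
    # "an integer is required (got type str)", which contains the
    # " is required (got type " substring the new branch matches on.
    print(f"TypeError: {ex}")

# The remedy the new message suggests: cast to a deterministic dtype
# before the conversion, after which schema inference succeeds.
df["c1"] = df["c1"].astype("string")
print(pa.Schema.from_pandas(df=df[["c1"]], preserve_index=False))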

tests/test_athena_csv.py

Lines changed: 11 additions & 0 deletions
@@ -375,3 +375,14 @@ def test_empty_column(path, glue_table, glue_database, use_threads):
     wr.s3.to_csv(
         df, path, index=False, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"]
     )
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_mixed_types_column(path, glue_table, glue_database, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [1, 2, "foo"], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    with pytest.raises(TypeError):
+        wr.s3.to_csv(
+            df, path, index=False, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"]
+        )

tests/test_athena_parquet.py

Lines changed: 9 additions & 0 deletions
@@ -636,3 +636,12 @@ def test_empty_column(path, glue_table, glue_database, use_threads):
     df["par"] = df["par"].astype("string")
     with pytest.raises(wr.exceptions.UndetectedType):
         wr.s3.to_parquet(df, path, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"])
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_mixed_types_column(path, glue_table, glue_database, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [1, 2, "foo"], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    with pytest.raises(TypeError):
+        wr.s3.to_parquet(df, path, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"])

tests/test_s3_parquet.py

Lines changed: 9 additions & 0 deletions
@@ -436,3 +436,12 @@ def test_empty_column(path, use_threads):
     df2 = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
     df2["par"] = df2["par"].astype("string")
     assert df.equals(df2)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_mixed_types_column(path, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3], "c1": [1, 2, "foo"], "par": ["a", "b", "c"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["par"] = df["par"].astype("string")
+    with pytest.raises(TypeError):
+        wr.s3.to_parquet(df, path, dataset=True, partition_cols=["par"])
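
All three tests pin down the same failure path from the caller's side. The remedies named in the new error message would look roughly like this in user code; this is a sketch, with a placeholder bucket path, and dtype is the existing column-schema argument the message refers to:

import pandas as pd
import awswrangler as wr

df = pd.DataFrame({"c0": [1, 2, 3], "c1": [1, 2, "foo"]})

# Option 1: cast the mixed column to a deterministic dtype up front.
df["c1"] = df["c1"].astype("string")
wr.s3.to_parquet(df, "s3://my-bucket/key.parquet")  # placeholder path

# Option 2: leave the frame as-is and pass an explicit column schema,
# mapping the column name to an Athena/Glue type.
# wr.s3.to_parquet(df, "s3://my-bucket/key.parquet", dtype={"c1": "string"})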
