Commit 678262b

Cleaning up written files if catalog write failed.

1 parent 630b420 · commit 678262b
4 files changed: +97 −59 lines changed

awswrangler/s3/_write_parquet.py

Lines changed: 33 additions & 27 deletions
@@ -14,6 +14,7 @@
 from awswrangler import _data_types, _utils, catalog, exceptions
 from awswrangler._config import apply_configs
+from awswrangler.s3._delete import delete_objects
 from awswrangler.s3._fs import open_s3_object
 from awswrangler.s3._read_parquet import _read_parquet_metadata
 from awswrangler.s3._write import _COMPRESSION_2_EXT, _apply_dtype, _sanitize, _validate_args
@@ -531,39 +532,44 @@ def to_parquet(  # pylint: disable=too-many-arguments,too-many-locals
             max_rows_by_file=max_rows_by_file,
         )
         if (database is not None) and (table is not None):
-            catalog._create_parquet_table(  # pylint: disable=protected-access
-                database=database,
-                table=table,
-                path=path,
-                columns_types=columns_types,
-                partitions_types=partitions_types,
-                compression=compression,
-                description=description,
-                parameters=parameters,
-                columns_comments=columns_comments,
-                boto3_session=session,
-                mode=mode,
-                catalog_versioning=catalog_versioning,
-                projection_enabled=projection_enabled,
-                projection_types=projection_types,
-                projection_ranges=projection_ranges,
-                projection_values=projection_values,
-                projection_intervals=projection_intervals,
-                projection_digits=projection_digits,
-                catalog_id=catalog_id,
-                catalog_table_input=catalog_table_input,
-            )
-            if partitions_values and (regular_partitions is True):
-                _logger.debug("partitions_values:\n%s", partitions_values)
-                catalog.add_parquet_partitions(
-                    database=database,
-                    table=table,
-                    partitions_values=partitions_values,
-                    compression=compression,
-                    boto3_session=session,
-                    catalog_id=catalog_id,
-                    columns_types=columns_types,
-                )
+            try:
+                catalog._create_parquet_table(  # pylint: disable=protected-access
+                    database=database,
+                    table=table,
+                    path=path,
+                    columns_types=columns_types,
+                    partitions_types=partitions_types,
+                    compression=compression,
+                    description=description,
+                    parameters=parameters,
+                    columns_comments=columns_comments,
+                    boto3_session=session,
+                    mode=mode,
+                    catalog_versioning=catalog_versioning,
+                    projection_enabled=projection_enabled,
+                    projection_types=projection_types,
+                    projection_ranges=projection_ranges,
+                    projection_values=projection_values,
+                    projection_intervals=projection_intervals,
+                    projection_digits=projection_digits,
+                    catalog_id=catalog_id,
+                    catalog_table_input=catalog_table_input,
+                )
+                if partitions_values and (regular_partitions is True):
+                    _logger.debug("partitions_values:\n%s", partitions_values)
+                    catalog.add_parquet_partitions(
+                        database=database,
+                        table=table,
+                        partitions_values=partitions_values,
+                        compression=compression,
+                        boto3_session=session,
+                        catalog_id=catalog_id,
+                        columns_types=columns_types,
+                    )
+            except Exception:
+                _logger.debug("Catalog write failed, cleaning up S3 (paths: %s).", paths)
+                delete_objects(path=paths, use_threads=use_threads, boto3_session=session)
+                raise
     return {"paths": paths, "partitions_values": partitions_values}

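Both writers now follow the same compensation pattern: the data files are written to S3 first, the Glue catalog is updated second, and if the catalog step raises, the freshly written objects are deleted before the exception is re-raised. Below is a minimal runnable sketch of the idea; write_files, register_in_catalog, and delete_objects are illustrative stubs, not awswrangler's internals.

import logging

_logger = logging.getLogger(__name__)


def write_files(df, path):
    # Stand-in for the S3 write step: pretend three objects were uploaded.
    return [f"{path}file_{i}.parquet" for i in range(3)]


def register_in_catalog(database, table, paths):
    # Stand-in for catalog._create_parquet_table / catalog.add_parquet_partitions.
    if database == "foo":
        raise RuntimeError("EntityNotFoundException: Database foo not found")


def delete_objects(paths):
    # Stand-in for awswrangler.s3.delete_objects.
    print(f"deleting {len(paths)} orphaned objects")


def write_dataset(df, path, database, table):
    paths = write_files(df, path)  # objects exist on S3 before the catalog is touched
    try:
        register_in_catalog(database, table, paths)
    except Exception:
        # Compensating action: remove the orphaned files, then re-raise so the
        # caller still sees the original catalog error.
        _logger.debug("Catalog write failed, cleaning up S3 (paths: %s).", paths)
        delete_objects(paths)
        raise
    return {"paths": paths}
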
awswrangler/s3/_write_text.py

Lines changed: 38 additions & 32 deletions
@@ -10,6 +10,7 @@
 from awswrangler import _data_types, _utils, catalog, exceptions
 from awswrangler._config import apply_configs
+from awswrangler.s3._delete import delete_objects
 from awswrangler.s3._fs import open_s3_object
 from awswrangler.s3._write import _apply_dtype, _sanitize, _validate_args
 from awswrangler.s3._write_dataset import _to_dataset
@@ -406,44 +407,49 @@ def to_csv(  # pylint: disable=too-many-arguments,too-many-locals
             date_format="%Y-%m-%d %H:%M:%S.%f",
         )
         if (database is not None) and (table is not None):
-            columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned(
-                df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True
-            )
-            catalog._create_csv_table(  # pylint: disable=protected-access
-                database=database,
-                table=table,
-                path=path,
-                columns_types=columns_types,
-                partitions_types=partitions_types,
-                description=description,
-                parameters=parameters,
-                columns_comments=columns_comments,
-                boto3_session=session,
-                mode=mode,
-                catalog_versioning=catalog_versioning,
-                sep=sep,
-                projection_enabled=projection_enabled,
-                projection_types=projection_types,
-                projection_ranges=projection_ranges,
-                projection_values=projection_values,
-                projection_intervals=projection_intervals,
-                projection_digits=projection_digits,
-                catalog_table_input=catalog_table_input,
-                catalog_id=catalog_id,
-                compression=None,
-                skip_header_line_count=None,
-            )
-            if partitions_values and (regular_partitions is True):
-                _logger.debug("partitions_values:\n%s", partitions_values)
-                catalog.add_csv_partitions(
-                    database=database,
-                    table=table,
-                    partitions_values=partitions_values,
-                    boto3_session=session,
-                    sep=sep,
-                    catalog_id=catalog_id,
-                    columns_types=columns_types,
-                )
+            try:
+                columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned(
+                    df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True
+                )
+                catalog._create_csv_table(  # pylint: disable=protected-access
+                    database=database,
+                    table=table,
+                    path=path,
+                    columns_types=columns_types,
+                    partitions_types=partitions_types,
+                    description=description,
+                    parameters=parameters,
+                    columns_comments=columns_comments,
+                    boto3_session=session,
+                    mode=mode,
+                    catalog_versioning=catalog_versioning,
+                    sep=sep,
+                    projection_enabled=projection_enabled,
+                    projection_types=projection_types,
+                    projection_ranges=projection_ranges,
+                    projection_values=projection_values,
+                    projection_intervals=projection_intervals,
+                    projection_digits=projection_digits,
+                    catalog_table_input=catalog_table_input,
+                    catalog_id=catalog_id,
+                    compression=None,
+                    skip_header_line_count=None,
+                )
+                if partitions_values and (regular_partitions is True):
+                    _logger.debug("partitions_values:\n%s", partitions_values)
+                    catalog.add_csv_partitions(
+                        database=database,
+                        table=table,
+                        partitions_values=partitions_values,
+                        boto3_session=session,
+                        sep=sep,
+                        catalog_id=catalog_id,
+                        columns_types=columns_types,
+                    )
+            except Exception:
+                _logger.debug("Catalog write failed, cleaning up S3 (paths: %s).", paths)
+                delete_objects(path=paths, use_threads=use_threads, boto3_session=session)
+                raise
     return {"paths": paths, "partitions_values": partitions_values}

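Since the except block re-raises, callers of wr.s3.to_csv and wr.s3.to_parquet still see the original catalog error; the only behavioral change is that no orphaned objects are left under the target prefix. A hedged usage sketch follows (the bucket name and DataFrame are made up; Glue's EntityNotFoundException derives from botocore's ClientError, so catching ClientError covers it):

import awswrangler as wr
import pandas as pd
from botocore.exceptions import ClientError

df = pd.DataFrame({"c0": [1, 2, 3]})

try:
    # The database does not exist: the CSV files are uploaded first, the
    # catalog call then fails, and this commit removes the uploads before
    # the exception propagates.
    wr.s3.to_csv(df, "s3://my-bucket/prefix/", dataset=True, table="t", database="missing_db")
except ClientError as ex:
    # After this commit, s3://my-bucket/prefix/ should hold no orphaned files.
    print("catalog registration failed:", ex)
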
tests/test_athena_csv.py

Lines changed: 13 additions & 0 deletions
@@ -1,5 +1,7 @@
 import logging
+import time
 
+import boto3
 import pandas as pd
 import pytest
 
@@ -386,3 +388,14 @@ def test_mixed_types_column(path, glue_table, glue_database, use_threads):
     wr.s3.to_csv(
         df, path, index=False, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"]
     )
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_failing_catalog(path, glue_table, glue_database, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3]})
+    try:
+        wr.s3.to_csv(df, path, dataset=True, table=glue_table, database="foo")
+    except boto3.client("glue").exceptions.EntityNotFoundException:
+        pass
+    time.sleep(3)
+    assert len(wr.s3.list_objects(path)) == 0

tests/test_athena_parquet.py

Lines changed: 13 additions & 0 deletions
@@ -1,7 +1,9 @@
 import datetime
 import logging
 import math
+import time
 
+import boto3
 import numpy as np
 import pandas as pd
 import pytest
@@ -645,3 +647,14 @@ def test_mixed_types_column(path, glue_table, glue_database, use_threads):
     df["par"] = df["par"].astype("string")
     with pytest.raises(TypeError):
         wr.s3.to_parquet(df, path, dataset=True, table=glue_table, database=glue_database, partition_cols=["par"])
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_failing_catalog(path, glue_table, glue_database, use_threads):
+    df = pd.DataFrame({"c0": [1, 2, 3]})
+    try:
+        wr.s3.to_parquet(df, path, max_rows_by_file=1, dataset=True, table=glue_table, database="foo")
+    except boto3.client("glue").exceptions.EntityNotFoundException:
+        pass
+    time.sleep(3)
+    assert len(wr.s3.list_objects(path)) == 0
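Both tests pause with time.sleep(3) before listing, presumably to give the S3 listing a moment to reflect the deletions. If flakiness showed up, a polling wait would be a sturdier drop-in; this is only a suggested sketch, not part of the commit:

import time

import awswrangler as wr


def wait_until_empty(path, timeout=30.0, interval=1.0):
    # Poll wr.s3.list_objects until the prefix is empty or the timeout expires.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if len(wr.s3.list_objects(path)) == 0:
            return True
        time.sleep(interval)
    return False


# In the tests, the final sleep-and-assert could become:
#     assert wait_until_empty(path)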
