
Commit c703f45

Fix keep_files behaviour for failed redshift COPY. #505
1 parent: cc5618c

File tree

5 files changed: 61 additions, 50 deletions

awswrangler/redshift.py
Lines changed: 39 additions & 37 deletions

@@ -952,7 +952,7 @@ def unload(
         List of columns names that should be returned as pandas.Categorical.
         Recommended for memory restricted environments.
     keep_files : bool
-        Should keep the stage files?
+        Should keep stage files?
     chunked : Union[int, bool]
         If passed will split the data in a Iterable of DataFrames (Memory friendly).
         If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize.
@@ -1290,7 +1290,7 @@ def copy( # pylint: disable=too-many-arguments
     varchar_lengths : Dict[str, int], optional
         Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
     keep_files : bool
-        Should keep the stage files?
+        Should keep stage files?
     use_threads : bool
         True to enable concurrent requests, False to disable multiple threads.
         If enabled os.cpu_count() will be used as the max number of threads.
@@ -1334,38 +1334,40 @@ def copy( # pylint: disable=too-many-arguments
             f"The received S3 path ({path}) is not empty. "
             "Please, provide a different path or use wr.s3.delete_objects() to clean up the current one."
         )
-    s3.to_parquet(
-        df=df,
-        path=path,
-        index=index,
-        dataset=True,
-        mode="append",
-        dtype=dtype,
-        use_threads=use_threads,
-        boto3_session=session,
-        s3_additional_kwargs=s3_additional_kwargs,
-        max_rows_by_file=max_rows_by_file,
-    )
-    copy_from_files(
-        path=path,
-        con=con,
-        table=table,
-        schema=schema,
-        iam_role=iam_role,
-        aws_access_key_id=aws_access_key_id,
-        aws_secret_access_key=aws_secret_access_key,
-        aws_session_token=aws_session_token,
-        mode=mode,
-        diststyle=diststyle,
-        distkey=distkey,
-        sortstyle=sortstyle,
-        sortkey=sortkey,
-        primary_keys=primary_keys,
-        varchar_lengths_default=varchar_lengths_default,
-        varchar_lengths=varchar_lengths,
-        use_threads=use_threads,
-        boto3_session=session,
-        s3_additional_kwargs=s3_additional_kwargs,
-    )
-    if keep_files is False:
-        s3.delete_objects(path=path, use_threads=use_threads, boto3_session=session)
+    try:
+        s3.to_parquet(
+            df=df,
+            path=path,
+            index=index,
+            dataset=True,
+            mode="append",
+            dtype=dtype,
+            use_threads=use_threads,
+            boto3_session=session,
+            s3_additional_kwargs=s3_additional_kwargs,
+            max_rows_by_file=max_rows_by_file,
+        )
+        copy_from_files(
+            path=path,
+            con=con,
+            table=table,
+            schema=schema,
+            iam_role=iam_role,
+            aws_access_key_id=aws_access_key_id,
+            aws_secret_access_key=aws_secret_access_key,
+            aws_session_token=aws_session_token,
+            mode=mode,
+            diststyle=diststyle,
+            distkey=distkey,
+            sortstyle=sortstyle,
+            sortkey=sortkey,
+            primary_keys=primary_keys,
+            varchar_lengths_default=varchar_lengths_default,
+            varchar_lengths=varchar_lengths,
+            use_threads=use_threads,
+            boto3_session=session,
+            s3_additional_kwargs=s3_additional_kwargs,
+        )
+    finally:
+        if keep_files is False:
+            s3.delete_objects(path=path, use_threads=use_threads, boto3_session=session)
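The substance of the change above is that s3.to_parquet() and copy_from_files() now run inside a try block whose finally clause performs the keep_files cleanup, so a COPY that fails part-way no longer strands stage files in S3 when keep_files is False. Below is a minimal, self-contained sketch of that try/finally pattern; the in-memory "stage" set and the raised error are stand-ins for the S3 staging prefix and the Redshift COPY, not awswrangler code.

# Minimal sketch of the try/finally cleanup introduced by this commit.
# An in-memory set stands in for the S3 staging prefix; the raised error
# stands in for copy_from_files() failing (e.g. a VARCHAR length overflow).
stage: set = set()


def copy_sketch(keep_files: bool = False, fail_copy: bool = False) -> None:
    try:
        stage.add("stage/0.parquet")  # upload step, analogous to s3.to_parquet()
        if fail_copy:
            raise RuntimeError("COPY failed")  # analogous to copy_from_files() raising
    finally:
        # Runs on success and on failure alike, mirroring the new behaviour.
        if keep_files is False:
            stage.clear()  # analogous to s3.delete_objects()


try:
    copy_sketch(keep_files=False, fail_copy=True)
except RuntimeError:
    pass
assert len(stage) == 0  # cleanup happened even though the copy step failed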

tests/test_athena.py
Lines changed: 0 additions & 4 deletions

@@ -669,7 +669,6 @@ def test_glue_database():

     # Round 1 - Create Database
     glue_database_name = f"database_{get_time_str_with_random_suffix()}"
-    print(f"Database Name: {glue_database_name}")
     wr.catalog.create_database(name=glue_database_name, description="Database Description")
     databases = wr.catalog.get_databases()
     test_database_name = ""
@@ -684,7 +683,6 @@ def test_glue_database():
     assert test_database_description == "Database Description"

     # Round 2 - Delete Database
-    print(f"Glue Database Name: {glue_database_name}")
     wr.catalog.delete_database(name=glue_database_name)
     databases = wr.catalog.get_databases()
     test_database_name = ""
@@ -786,8 +784,6 @@ def test_describe_table(path, glue_database, glue_table):
 def test_athena_nan_inf(glue_database, ctas_approach, data_source):
     sql = "SELECT nan() AS nan, infinity() as inf, -infinity() as inf_n, 1.2 as regular"
     df = wr.athena.read_sql_query(sql, glue_database, ctas_approach, data_source=data_source)
-    print(df)
-    print(df.dtypes)
     assert df.shape == (1, 4)
     assert df.dtypes.to_list() == ["float64", "float64", "float64", "float64"]
     assert np.isnan(df.nan.iloc[0])

tests/test_athena_csv.py
Lines changed: 5 additions & 6 deletions

@@ -330,12 +330,11 @@ def test_athena_csv_types(path, glue_database, glue_table):
     wr.athena.repair_table(glue_table, glue_database)
     assert len(wr.catalog.get_csv_partitions(glue_database, glue_table)) == 3
     df2 = wr.athena.read_sql_table(glue_table, glue_database)
-    print(df2)
-    # assert len(df2.index) == 3
-    # assert len(df2.columns) == 10
-    # assert df2["id"].sum() == 6
-    # ensure_data_types_csv(df2)
-    # assert wr.catalog.delete_table_if_exists(database=glue_database, table=glue_table) is True
+    assert len(df2.index) == 3
+    assert len(df2.columns) == 10
+    assert df2["id"].sum() == 6
+    ensure_data_types_csv(df2)
+    assert wr.catalog.delete_table_if_exists(database=glue_database, table=glue_table) is True


 @pytest.mark.parametrize("use_threads", [True, False])

tests/test_athena_projection.py
Lines changed: 0 additions & 1 deletion

@@ -74,7 +74,6 @@ def test_to_parquet_projection_date(glue_database, glue_table, path):
         projection_ranges={"c1": "2020-01-01,2020-01-03", "c2": "2020-01-01 01:01:00,2020-01-01 01:01:03"},
     )
     df2 = wr.athena.read_sql_table(glue_table, glue_database)
-    print(df2)
     assert df.shape == df2.shape
     assert df.c0.sum() == df2.c0.sum()

tests/test_redshift.py
Lines changed: 17 additions & 2 deletions

@@ -8,6 +8,7 @@
 import pyarrow as pa
 import pytest
 import redshift_connector
+from redshift_connector.error import ProgrammingError

 import awswrangler as wr
 from awswrangler import _utils
@@ -888,6 +889,20 @@ def test_column_length(path, redshift_table, databases_parameters):
     )
     df2 = wr.redshift.read_sql_query(sql=f"SELECT * FROM public.{redshift_table}", con=con)
     con.close()
-    print(df.dtypes)
-    print(df2.dtypes)
     assert df2.equals(df)
+
+
+def test_failed_keep_files(path, redshift_table, databases_parameters):
+    df = pd.DataFrame({"c0": [1], "c1": ["foo"]}, dtype="string")
+    con = wr.redshift.connect("aws-data-wrangler-redshift")
+    with pytest.raises(ProgrammingError):
+        wr.redshift.copy(
+            df=df,
+            path=path,
+            con=con,
+            table=redshift_table,
+            schema="public",
+            iam_role=databases_parameters["redshift"]["role"],
+            varchar_lengths={"c1": 2},
+        )
+    assert len(wr.s3.list_objects(path)) == 0
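The new test triggers a failing COPY (varchar_lengths={"c1": 2} presumably overflows on the three-character "foo") and then asserts the staging path is empty, pinning down the fixed cleanup behaviour. For orientation only, here is a rough usage sketch of the keep_files parameter; the Glue connection name, S3 prefix, table, and IAM role are placeholders, not values from this repository.

# Rough usage sketch of the keep_files semantics the test above exercises.
# Connection name, S3 prefix, table, and IAM role are placeholders.
import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"c0": [1], "c1": ["foo"]}, dtype="string")
con = wr.redshift.connect("my-redshift-connection")  # placeholder connection name
try:
    wr.redshift.copy(
        df=df,
        path="s3://my-bucket/stage/",  # placeholder staging prefix
        con=con,
        table="my_table",              # placeholder table
        schema="public",
        iam_role="arn:aws:iam::123456789012:role/my-redshift-role",  # placeholder
        keep_files=False,  # with this fix, stage files are removed even if the COPY raises
    )
finally:
    con.close()

# After a failed COPY with keep_files=False, the staging prefix is expected to be empty:
# len(wr.s3.list_objects("s3://my-bucket/stage/")) == 0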
