Commit 8352c29

Adding S3 path validation for Redshift COPY.
1 parent 510af85 commit 8352c29

File tree: 3 files changed, +35 −6 lines


awswrangler/redshift.py

Lines changed: 10 additions & 4 deletions
@@ -1103,12 +1103,12 @@ def copy( # pylint: disable=too-many-arguments
 ) -> None:
     """Load Pandas DataFrame as a Table on Amazon Redshift using parquet files on S3 as stage.

-    This is a **HIGH** latency and **HIGH** throughput alternative to `wr.db.to_sql()` to load large
+    This is a **HIGH** latency and **HIGH** throughput alternative to `wr.redshift.to_sql()` to load large
     DataFrames into Amazon Redshift through the ** SQL COPY command**.

     This strategy has more overhead and requires more IAM privileges
-    than the regular `wr.db.to_sql()` function, so it is only recommended
-    to inserting +1MM rows at once.
+    than the regular `wr.redshift.to_sql()` function, so it is only recommended
+    to inserting +1K rows at once.

     https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

@@ -1129,7 +1129,8 @@ def copy( # pylint: disable=too-many-arguments
     df: pandas.DataFrame
         Pandas DataFrame.
     path : str
-        S3 path to write stage files (e.g. s3://bucket_name/any_name/)
+        S3 path to write stage files (e.g. s3://bucket_name/any_name/).
+        Note: This path must be empty.
     con : redshift_connector.Connection
         Use redshift_connector.connect() to use "
         "credentials directly or wr.redshift.connect() to fetch it from the Glue Catalog.
@@ -1204,6 +1205,11 @@ def copy( # pylint: disable=too-many-arguments
     path = path[:-1] if path.endswith("*") else path
     path = path if path.endswith("/") else f"{path}/"
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
+    if s3.list_objects(path=path):
+        raise exceptions.InvalidArgument(
+            f"The received S3 path ({path}) is not empty. "
+            "Please, provide a different path or use wr.s3.delete_objects() to clean up the current one."
+        )
     s3.to_parquet(
         df=df,
         path=path,

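For context, a minimal caller-side sketch of how the new check is satisfied: clear the stage path with wr.s3.delete_objects() (as the error message suggests) before calling wr.redshift.copy(). The bucket, Glue connection name, table, and IAM role below are hypothetical and not part of this commit.

import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3]})
stage_path = "s3://my-bucket/redshift-stage/"  # hypothetical bucket/prefix

# The stage path must now be empty; clean up any leftover files first.
wr.s3.delete_objects(path=stage_path)

con = wr.redshift.connect("my-glue-connection")  # hypothetical Glue Catalog connection
try:
    wr.redshift.copy(
        df=df,
        path=stage_path,
        con=con,
        table="my_table",
        schema="public",
        iam_role="arn:aws:iam::123456789012:role/my-redshift-role",  # hypothetical
    )
finally:
    con.close()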
tests/test_redshift.py

Lines changed: 21 additions & 0 deletions
@@ -747,3 +747,24 @@ def test_copy_from_files_empty(path, redshift_table, databases_parameters):
     df2 = wr.redshift.read_sql_query(sql=f"SELECT count(*) AS counter FROM public.{redshift_table}", con=con)
     con.close()
     assert df2["counter"].iloc[0] == 3
+
+
+def test_copy_dirty_path(path, redshift_table, databases_parameters):
+    df = pd.DataFrame({"col0": [0, 1, 2]})
+
+    # previous file at same path
+    wr.s3.to_parquet(df, f"{path}test.parquet")
+
+    con = wr.redshift.connect("aws-data-wrangler-redshift")
+    with pytest.raises(wr.exceptions.InvalidArgument):
+        try:
+            wr.redshift.copy(  # Trying to copy using a dirty path
+                df=df,
+                path=path,
+                con=con,
+                table=redshift_table,
+                schema="public",
+                iam_role=databases_parameters["redshift"]["role"],
+            )
+        finally:
+            con.close()

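The test above exercises the failure path. Below is a sketch of the corresponding pre-flight check a caller can run, using the same wr.s3.list_objects() call the new validation relies on; the bucket and prefix are hypothetical.

import awswrangler as wr

stage_path = "s3://my-bucket/redshift-stage/"  # hypothetical

# wr.s3.list_objects() returns the object paths under the prefix; a non-empty
# result is exactly what now triggers wr.exceptions.InvalidArgument in copy().
leftovers = wr.s3.list_objects(path=stage_path)
if leftovers:
    print(f"Stage path is dirty ({len(leftovers)} objects); cleaning it up.")
    wr.s3.delete_objects(path=stage_path)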
tutorials/008 - Redshift - Copy & Unload.ipynb

Lines changed: 4 additions & 2 deletions
@@ -287,7 +287,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Load and Unload with COPY and UNLOAD commands"
+    "## Load and Unload with COPY and UNLOAD commands\n",
+    "\n",
+    "> Note: Please use a empty S3 path for the COPY command."
    ]
   },
   {
@@ -554,4 +556,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}

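As the tutorial note says, the COPY stage path must be empty. One way to guarantee that, shown as a sketch (not taken from the tutorial, and with a hypothetical bucket), is to generate a fresh prefix per run:

import uuid

import awswrangler as wr

# A freshly generated prefix has no objects under it, so it satisfies the new check.
stage_path = f"s3://my-bucket/redshift-stage/{uuid.uuid4()}/"  # hypothetical bucket
assert not wr.s3.list_objects(path=stage_path)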