Commit 8352c29

Adding S3 path validation for Redshift COPY.
1 parent 510af85 commit 8352c29

File tree: 3 files changed, +35 −6 lines


awswrangler/redshift.py

Lines changed: 10 additions & 4 deletions
@@ -1103,12 +1103,12 @@ def copy( # pylint: disable=too-many-arguments
 ) -> None:
     """Load Pandas DataFrame as a Table on Amazon Redshift using parquet files on S3 as stage.

-    This is a **HIGH** latency and **HIGH** throughput alternative to `wr.db.to_sql()` to load large
+    This is a **HIGH** latency and **HIGH** throughput alternative to `wr.redshift.to_sql()` to load large
     DataFrames into Amazon Redshift through the ** SQL COPY command**.

     This strategy has more overhead and requires more IAM privileges
-    than the regular `wr.db.to_sql()` function, so it is only recommended
-    to inserting +1MM rows at once.
+    than the regular `wr.redshift.to_sql()` function, so it is only recommended
+    to inserting +1K rows at once.

     https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

@@ -1129,7 +1129,8 @@ def copy( # pylint: disable=too-many-arguments
     df: pandas.DataFrame
         Pandas DataFrame.
     path : str
-        S3 path to write stage files (e.g. s3://bucket_name/any_name/)
+        S3 path to write stage files (e.g. s3://bucket_name/any_name/).
+        Note: This path must be empty.
     con : redshift_connector.Connection
         Use redshift_connector.connect() to use "
         "credentials directly or wr.redshift.connect() to fetch it from the Glue Catalog.
@@ -1204,6 +1205,11 @@ def copy( # pylint: disable=too-many-arguments
     path = path[:-1] if path.endswith("*") else path
     path = path if path.endswith("/") else f"{path}/"
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
+    if s3.list_objects(path=path):
+        raise exceptions.InvalidArgument(
+            f"The received S3 path ({path}) is not empty. "
+            "Please, provide a different path or use wr.s3.delete_objects() to clean up the current one."
+        )
     s3.to_parquet(
         df=df,
         path=path,

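For context, a minimal caller-side sketch of how the new check is satisfied: clear the stage path with wr.s3.delete_objects() (as the error message suggests) before calling wr.redshift.copy(). The bucket, Glue connection name, table, and IAM role below are hypothetical and not part of this commit.

import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3]})
stage_path = "s3://my-bucket/redshift-stage/"  # hypothetical bucket/prefix

# The stage path must now be empty; clean up any leftover files first.
wr.s3.delete_objects(path=stage_path)

con = wr.redshift.connect("my-glue-connection")  # hypothetical Glue Catalog connection
try:
    wr.redshift.copy(
        df=df,
        path=stage_path,
        con=con,
        table="my_table",
        schema="public",
        iam_role="arn:aws:iam::123456789012:role/my-redshift-role",  # hypothetical
    )
finally:
    con.close()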
tests/test_redshift.py

Lines changed: 21 additions & 0 deletions
@@ -747,3 +747,24 @@ def test_copy_from_files_empty(path, redshift_table, databases_parameters):
     df2 = wr.redshift.read_sql_query(sql=f"SELECT count(*) AS counter FROM public.{redshift_table}", con=con)
     con.close()
     assert df2["counter"].iloc[0] == 3
+
+
+def test_copy_dirty_path(path, redshift_table, databases_parameters):
+    df = pd.DataFrame({"col0": [0, 1, 2]})
+
+    # previous file at same path
+    wr.s3.to_parquet(df, f"{path}test.parquet")
+
+    con = wr.redshift.connect("aws-data-wrangler-redshift")
+    with pytest.raises(wr.exceptions.InvalidArgument):
+        try:
+            wr.redshift.copy(  # Trying to copy using a dirty path
+                df=df,
+                path=path,
+                con=con,
+                table=redshift_table,
+                schema="public",
+                iam_role=databases_parameters["redshift"]["role"],
+            )
+        finally:
+            con.close()

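The test above exercises the failure path. Below is a sketch of the corresponding pre-flight check a caller can run, using the same wr.s3.list_objects() call the new validation relies on; the bucket and prefix are hypothetical.

import awswrangler as wr

stage_path = "s3://my-bucket/redshift-stage/"  # hypothetical

# wr.s3.list_objects() returns the object paths under the prefix; a non-empty
# result is exactly what now triggers wr.exceptions.InvalidArgument in copy().
leftovers = wr.s3.list_objects(path=stage_path)
if leftovers:
    print(f"Stage path is dirty ({len(leftovers)} objects); cleaning it up.")
    wr.s3.delete_objects(path=stage_path)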
tutorials/008 - Redshift - Copy & Unload.ipynb

Lines changed: 4 additions & 2 deletions
@@ -287,7 +287,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Load and Unload with COPY and UNLOAD commands"
+    "## Load and Unload with COPY and UNLOAD commands\n",
+    "\n",
+    "> Note: Please use a empty S3 path for the COPY command."
    ]
   },
   {
@@ -554,4 +556,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}

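As the tutorial note says, the COPY stage path must be empty. One way to guarantee that, shown as a sketch (not taken from the tutorial, and with a hypothetical bucket), is to generate a fresh prefix per run:

import uuid

import awswrangler as wr

# A freshly generated prefix has no objects under it, so it satisfies the new check.
stage_path = f"s3://my-bucket/redshift-stage/{uuid.uuid4()}/"  # hypothetical bucket
assert not wr.s3.list_objects(path=stage_path)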