Commit 1e22878

Enable prefix in output S3 files for UNLOAD (#729)
1 parent 0eeac80 commit 1e22878

File tree

awswrangler/redshift.py
tests/test_redshift.py

2 files changed: +35 -5 lines changed

awswrangler/redshift.py

Lines changed: 5 additions & 5 deletions
@@ -925,7 +925,6 @@ def unload_to_files(


     """
-    path = path if path.endswith("/") else f"{path}/"
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
     s3.delete_objects(path=path, use_threads=use_threads, boto3_session=session)
     with con.cursor() as cursor:
@@ -995,16 +994,16 @@ def unload(
     ----
     ``Batching`` (`chunked` argument) (Memory Friendly):

-    Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame.
+    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

     There are two batching strategies on Wrangler:

     - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.

-    - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER.
+    - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows (equal to the received INTEGER).

-    `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise
-    in number of rows for each Dataframe.
+    `P.S.` `chunked=True` is faster and uses less memory while `chunked=INTEGER` is more precise
+    in the number of rows for each Dataframe.


     Note
@@ -1077,6 +1076,7 @@ def unload(
     >>> con.close()

     """
+    path = path if path.endswith("/") else f"{path}/"
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
     unload_to_files(
         sql=sql,
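
A minimal usage sketch (not part of the commit) of what this change enables, assuming a reachable cluster registered as a Glue Catalog connection; the connection name, bucket, table, and IAM role below are placeholders:

import awswrangler as wr

con = wr.redshift.connect("my-glue-connection")  # placeholder connection name

# unload_to_files() no longer forces a trailing "/", so "daily_export" is passed
# through to Redshift UNLOAD and becomes the prefix of the unloaded objects
# instead of turning into a "daily_export/" folder.
wr.redshift.unload_to_files(
    sql="SELECT * FROM public.my_table",
    path="s3://my-bucket/extracts/daily_export",
    con=con,
    iam_role="arn:aws:iam::111111111111:role/my-redshift-role",
)

# unload() still normalizes the path itself (the line added at 1079 above), so it
# keeps treating the path as a folder and loads the result back into a DataFrame.
df = wr.redshift.unload(
    sql="SELECT * FROM public.my_table",
    path="s3://my-bucket/extracts/",
    con=con,
    iam_role="arn:aws:iam::111111111111:role/my-redshift-role",
)
con.close()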

tests/test_redshift.py

Lines changed: 30 additions & 0 deletions
@@ -352,6 +352,36 @@ def test_unload_extras(bucket, path, redshift_table, redshift_con, databases_par
     assert len(df.columns) == 2


+def test_unload_with_prefix(bucket, path, redshift_table, redshift_con, databases_parameters, kms_key_id):
+    test_prefix = "my_prefix"
+    table = redshift_table
+    schema = databases_parameters["redshift"]["schema"]
+    df = pd.DataFrame({"id": [1, 2], "name": ["foo", "boo"]})
+    wr.redshift.to_sql(df=df, con=redshift_con, table=table, schema=schema, mode="overwrite", index=False)
+
+    args = {
+        "sql": f"SELECT * FROM {schema}.{table}",
+        "path": f"{path}{test_prefix}",
+        "con": redshift_con,
+        "iam_role": databases_parameters["redshift"]["role"],
+        "region": wr.s3.get_bucket_region(bucket),
+        "max_file_size": 5.0,
+        "kms_key_id": kms_key_id,
+    }
+    # Adding a prefix to S3 output files
+    wr.redshift.unload_to_files(**args)
+    filename = wr.s3.list_objects(path=path)[0].split("/")[-1]
+    assert filename.startswith(test_prefix)
+
+    # Prefix becomes part of path with partitioning
+    wr.redshift.unload_to_files(
+        **args,
+        partition_cols=["name"],
+    )
+    object_prefix = wr.s3.list_objects(path=path)[0].split("/")[-3]
+    assert object_prefix == test_prefix
+
+
 def test_to_sql_cast(redshift_table, redshift_con):
     table = redshift_table
     df = pd.DataFrame(
