Commit 1e22878

Enable prefix in output S3 files for UNLOAD (#729)
1 parent 0eeac80 commit 1e22878

File tree

awswrangler/redshift.py
tests/test_redshift.py

2 files changed: +35 -5 lines changed

awswrangler/redshift.py

Lines changed: 5 additions & 5 deletions
@@ -925,7 +925,6 @@ def unload_to_files(


     """
-    path = path if path.endswith("/") else f"{path}/"
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
     s3.delete_objects(path=path, use_threads=use_threads, boto3_session=session)
     with con.cursor() as cursor:
@@ -995,16 +994,16 @@ def unload(
     ----
     ``Batching`` (`chunked` argument) (Memory Friendly):

-    Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame.
+    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

     There are two batching strategies on Wrangler:

     - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.

-    - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER.
+    - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows (equal to the received INTEGER).

-    `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise
-    in number of rows for each Dataframe.
+    `P.S.` `chunked=True` is faster and uses less memory while `chunked=INTEGER` is more precise
+    in the number of rows for each Dataframe.


     Note
@@ -1077,6 +1076,7 @@ def unload(
     >>> con.close()

     """
+    path = path if path.endswith("/") else f"{path}/"
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
     unload_to_files(
         sql=sql,
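
A minimal usage sketch (not part of the commit) of what this change enables, assuming a reachable cluster registered as a Glue Catalog connection; the connection name, bucket, table, and IAM role below are placeholders:

import awswrangler as wr

con = wr.redshift.connect("my-glue-connection")  # placeholder connection name

# unload_to_files() no longer forces a trailing "/", so "daily_export" is passed
# through to Redshift UNLOAD and becomes the prefix of the unloaded objects
# instead of turning into a "daily_export/" folder.
wr.redshift.unload_to_files(
    sql="SELECT * FROM public.my_table",
    path="s3://my-bucket/extracts/daily_export",
    con=con,
    iam_role="arn:aws:iam::111111111111:role/my-redshift-role",
)

# unload() still normalizes the path itself (the line added at 1079 above), so it
# keeps treating the path as a folder and loads the result back into a DataFrame.
df = wr.redshift.unload(
    sql="SELECT * FROM public.my_table",
    path="s3://my-bucket/extracts/",
    con=con,
    iam_role="arn:aws:iam::111111111111:role/my-redshift-role",
)
con.close()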

tests/test_redshift.py

Lines changed: 30 additions & 0 deletions
@@ -352,6 +352,36 @@ def test_unload_extras(bucket, path, redshift_table, redshift_con, databases_par
     assert len(df.columns) == 2


+def test_unload_with_prefix(bucket, path, redshift_table, redshift_con, databases_parameters, kms_key_id):
+    test_prefix = "my_prefix"
+    table = redshift_table
+    schema = databases_parameters["redshift"]["schema"]
+    df = pd.DataFrame({"id": [1, 2], "name": ["foo", "boo"]})
+    wr.redshift.to_sql(df=df, con=redshift_con, table=table, schema=schema, mode="overwrite", index=False)
+
+    args = {
+        "sql": f"SELECT * FROM {schema}.{table}",
+        "path": f"{path}{test_prefix}",
+        "con": redshift_con,
+        "iam_role": databases_parameters["redshift"]["role"],
+        "region": wr.s3.get_bucket_region(bucket),
+        "max_file_size": 5.0,
+        "kms_key_id": kms_key_id,
+    }
+    # Adding a prefix to S3 output files
+    wr.redshift.unload_to_files(**args)
+    filename = wr.s3.list_objects(path=path)[0].split("/")[-1]
+    assert filename.startswith(test_prefix)
+
+    # Prefix becomes part of path with partitioning
+    wr.redshift.unload_to_files(
+        **args,
+        partition_cols=["name"],
+    )
+    object_prefix = wr.s3.list_objects(path=path)[0].split("/")[-3]
+    assert object_prefix == test_prefix
+
+
 def test_to_sql_cast(redshift_table, redshift_con):
     table = redshift_table
     df = pd.DataFrame(
