@@ -851,6 +851,7 @@ def unload_to_files(
     aws_secret_access_key: Optional[str] = None,
     aws_session_token: Optional[str] = None,
     region: Optional[str] = None,
+    unload_format: Optional[str] = None,
     max_file_size: Optional[float] = None,
     kms_key_id: Optional[str] = None,
     manifest: bool = False,
@@ -890,6 +891,9 @@ def unload_to_files(
         same AWS Region as the Amazon Redshift cluster. By default, UNLOAD
         assumes that the target Amazon S3 bucket is located in the same AWS
         Region as the Amazon Redshift cluster.
+    unload_format: str, optional
+        Format of the unloaded S3 objects from the query.
+        Valid values: "CSV", "PARQUET". Case sensitive. Defaults to PARQUET.
     max_file_size : float, optional
         Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3.
         Specify a decimal value between 5.0 MB and 6200.0 MB. If None, the default
@@ -925,9 +929,12 @@ def unload_to_files(


     """
+    if unload_format not in [None, "CSV", "PARQUET"]:
+        raise exceptions.InvalidArgumentValue("<unload_format> argument must be 'CSV' or 'PARQUET'")
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
     s3.delete_objects(path=path, use_threads=use_threads, boto3_session=session)
     with con.cursor() as cursor:
+        format_str: str = unload_format or "PARQUET"
         partition_str: str = f"\nPARTITION BY ({','.join(partition_cols)})" if partition_cols else ""
         manifest_str: str = "\nmanifest" if manifest is True else ""
         region_str: str = f"\nREGION AS '{region}'" if region is not None else ""
@@ -948,7 +955,7 @@ def unload_to_files(
             f"{auth_str}"
             "ALLOWOVERWRITE\n"
             "PARALLEL ON\n"
-            "FORMAT PARQUET\n"
+            f"FORMAT {format_str}\n"
             "ENCRYPTED"
             f"{kms_key_id_str}"
             f"{partition_str}"
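For reference, a minimal usage sketch of the new unload_format argument. The Glue connection name, bucket path, and IAM role ARN below are placeholders for illustration, not part of this commit:

import awswrangler as wr

# Placeholder: assumes a Glue Catalog connection named "my-glue-connection".
con = wr.redshift.connect("my-glue-connection")
try:
    # unload_format="CSV" makes UNLOAD write CSV objects to S3; omitting the
    # argument (or passing None) keeps the previous default, PARQUET.
    wr.redshift.unload_to_files(
        sql="SELECT * FROM public.my_table",
        path="s3://my-bucket/unload/",
        con=con,
        iam_role="arn:aws:iam::123456789012:role/MyRedshiftUnloadRole",
        unload_format="CSV",
    )
finally:
    con.close()

Any other value raises exceptions.InvalidArgumentValue, per the validation added at the top of the function.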