Skip to content

Commit 9903c9e

Browse files
committed
Add Redshift.to_parquet()
1 parent ed87ba6 commit 9903c9e

File tree

3 files changed

+67
-1
lines changed

3 files changed

+67
-1
lines changed

awswrangler/redshift.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Dict, List, Union, Optional
1+
from typing import Dict, List, Union, Optional, Any
22
import json
33
import logging
44

@@ -346,3 +346,49 @@ def _get_redshift_schema(dataframe, dataframe_type, preserve_index=False, cast_c
346346
else:
347347
raise InvalidDataframeType(dataframe_type)
348348
return schema_built
349+
350+
@staticmethod
def to_parquet(sql: str,
               path: str,
               iam_role: str,
               redshift_conn: Any,
               partition_cols: Optional[List] = None) -> List[str]:
    """
    Write a query result as Parquet files on S3 using a Redshift UNLOAD.

    :param sql: SQL Query whose result will be unloaded
    :param path: AWS S3 path to write the data (e.g. s3://...)
    :param iam_role: AWS IAM role with the related permissions
    :param redshift_conn: A PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
    :param partition_cols: Specifies the partition keys for the unload operation.
    :return: List of the S3 object paths written by the UNLOAD
    """
    # The query is embedded in UNLOAD's single-quoted string literal, so single
    # quotes must be escaped by doubling them (SQL standard). The previous
    # "\'" replacement was a no-op ("\'" == "'"). Semicolons are stripped
    # because UNLOAD rejects them inside the quoted query.
    sql = sql.replace("'", "''").replace(";", "")
    # UNLOAD expects an S3 prefix ending in "/". endswith() also avoids an
    # IndexError on an empty path.
    path = path if path.endswith("/") else path + "/"
    partition_str: str = ""
    if partition_cols is not None:
        partition_str = f"PARTITION BY ({','.join(partition_cols)})\n"
    cursor: Any = redshift_conn.cursor()
    try:
        query: str = f"-- AWS DATA WRANGLER\n" \
                     f"UNLOAD ('{sql}')\n" \
                     f"TO '{path}'\n" \
                     f"IAM_ROLE '{iam_role}'\n" \
                     f"ALLOWOVERWRITE\n" \
                     f"PARALLEL ON\n" \
                     f"ENCRYPTED \n" \
                     f"{partition_str}" \
                     f"FORMAT PARQUET;"
        logger.debug(f"query:\n{query}")
        cursor.execute(query)
        # Ask Redshift for the id of the UNLOAD statement we just ran...
        query = "-- AWS DATA WRANGLER\nSELECT pg_last_query_id() AS query_id"
        logger.debug(f"query:\n{query}")
        cursor.execute(query)
        query_id = cursor.fetchall()[0][0]
        # ...then look up the S3 objects it produced in the unload log.
        query = f"-- AWS DATA WRANGLER\n" \
                f"SELECT path FROM STL_UNLOAD_LOG WHERE query={query_id};"
        logger.debug(f"query:\n{query}")
        cursor.execute(query)
        # STL_UNLOAD_LOG.path is a fixed-width column padded with blanks,
        # so spaces are removed from each returned path.
        paths: List[str] = [row[0].replace(" ", "") for row in cursor.fetchall()]
        logger.debug(f"paths: {paths}")
        redshift_conn.commit()
    finally:
        # Close the cursor even if an execute fails, so it is never leaked.
        cursor.close()
    return paths

testing/test_awswrangler/test_pandas.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1474,6 +1474,7 @@ def test_read_table2(session, bucket, database):
14741474
preserve_index=False,
14751475
procs_cpu_bound=4,
14761476
partition_cols=["partition"])
1477+
sleep(5)
14771478
df2 = session.pandas.read_table(database=database, table="test")
14781479
assert len(list(df.columns)) == len(list(df2.columns))
14791480
assert len(df.index) == len(df2.index)

testing/test_awswrangler/test_redshift.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,3 +508,22 @@ def test_to_redshift_spark_decimal(session, bucket, redshift_parameters):
508508
elif row[2] == 3:
509509
assert row[1] == Decimal((0, (1, 9, 0), -2))
510510
assert row[2] == Decimal((0, (1, 9, 0, 0, 0, 0), -5))
511+
512+
513+
def test_to_parquet(bucket, redshift_parameters):
    """Unload a test table to S3 as Parquet and check the produced path count."""
    connection = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    s3_prefix = f"s3://{bucket}/test_to_parquet/"
    written_paths = Redshift.to_parquet(sql="SELECT * FROM public.test",
                                        path=s3_prefix,
                                        iam_role=redshift_parameters.get("RedshiftRole"),
                                        redshift_conn=connection,
                                        partition_cols=["name"])
    assert len(written_paths) == 20

0 commit comments

Comments
 (0)