
Commit 19df7c1

Add test for Redshift Spectrum
1 parent f1eac08 commit 19df7c1

File tree

2 files changed: +47 -4 lines changed


awswrangler/db.py

Lines changed: 4 additions & 4 deletions
@@ -421,9 +421,9 @@ def copy_to_redshift(  # pylint: disable=too-many-arguments
     """Load Pandas DataFrame as a Table on Amazon Redshift using parquet files on S3 as stage.

     This is a **HIGH** latency and **HIGH** throughput alternative to `wr.db.to_sql()` to load large
-    DataFrame into Amazon Redshift through the **COPY command**.
+    DataFrames into Amazon Redshift through the **SQL COPY command**.

-    This function/strategy has more overhead and requires more privileges (`iam_role` argument)
+    This strategy has more overhead and requires more IAM privileges
     than the regular `wr.db.to_sql()` function, so it is only recommended
     to inserting +1MM rows at once.

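The copy_to_redshift docstring above describes the Parquet-stage-plus-COPY load path. A minimal usage sketch under assumed names follows (the Glue connection name, S3 staging path, schema, table, and IAM role ARN are illustrative placeholders, not values taken from this commit):

import pandas as pd
import awswrangler as wr

df = pd.DataFrame({"id": [1, 2, 3], "name": ["foo", "boo", "bar"]})

# Resolve a SQLAlchemy engine from a Glue Catalog connection (hypothetical connection name).
engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")

# Stage the DataFrame as Parquet files on S3, then load them into Redshift via COPY.
# Bucket, schema, table, and IAM role below are placeholders.
wr.db.copy_to_redshift(
    df=df,
    path="s3://my-bucket/stage/",
    con=engine,
    schema="public",
    table="my_table",
    mode="overwrite",
    iam_role="arn:aws:iam::123456789012:role/my-redshift-role",
)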
@@ -897,9 +897,9 @@ def unload_redshift(

     This is a **HIGH** latency and **HIGH** throughput alternative to
     `wr.db.read_sql_query()`/`wr.db.read_sql_table()` to extract large
-    Amazon Redshift data into a Pandas DataFrame through the **UNLOAD command**.
+    Amazon Redshift data into a Pandas DataFrames through the **UNLOAD command**.

-    This function/strategy has more overhead and requires more privileges (`iam_role` argument)
+    This strategy has more overhead and requires more IAM privileges
     than the regular `wr.db.read_sql_query()`/`wr.db.read_sql_table()` function,
     so it is only recommended to fetch +1MM rows at once.

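Likewise, a sketch of the UNLOAD-based read path described in the unload_redshift docstring (the query, staging path, and IAM role ARN are assumptions for illustration):

import awswrangler as wr

engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")

# UNLOAD the query result to Parquet files on S3, then read them back into a DataFrame.
# Query, staging path, and IAM role ARN are placeholders.
df = wr.db.unload_redshift(
    sql="SELECT * FROM public.my_table",
    path="s3://my-bucket/unload/",
    con=engine,
    iam_role="arn:aws:iam::123456789012:role/my-redshift-role",
)
print(df.shape)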

testing/test_awswrangler/test_db.py

Lines changed: 43 additions & 0 deletions
@@ -56,6 +56,26 @@ def parameters(cloudformation_outputs):
     yield parameters


+@pytest.fixture(scope="module")
+def glue_database(cloudformation_outputs):
+    yield cloudformation_outputs["GlueDatabaseName"]
+
+
+@pytest.fixture(scope="module")
+def external_schema(cloudformation_outputs, parameters, glue_database):
+    region = cloudformation_outputs.get("Region")
+    sql = f"""
+    CREATE EXTERNAL SCHEMA IF NOT EXISTS aws_data_wrangler_external FROM data catalog
+    DATABASE '{glue_database}'
+    IAM_ROLE '{parameters["redshift"]["role"]}'
+    REGION '{region}';
+    """
+    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-redshift")
+    with engine.connect() as con:
+        con.execute(sql)
+    yield "aws_data_wrangler_external"
+
+
 @pytest.mark.parametrize("db_type", ["mysql", "redshift", "postgresql"])
 def test_sql(parameters, db_type):
     df = get_df()
@@ -305,3 +325,26 @@ def test_redshift_exceptions(bucket, parameters, diststyle, distkey, sortstyle,
         index=False,
     )
     wr.s3.delete_objects(path=path)
+
+
+def test_redshift_spectrum(bucket, glue_database, external_schema):
+    df = pd.DataFrame({"id": [1, 2, 3, 4, 5], "col_str": ["foo", None, "bar", None, "xoo"], "par_int": [0, 1, 0, 1, 1]})
+    path = f"s3://{bucket}/test_redshift_spectrum/"
+    paths = wr.s3.to_parquet(
+        df=df,
+        path=path,
+        database=glue_database,
+        table="test_redshift_spectrum",
+        mode="overwrite",
+        index=False,
+        dataset=True,
+        partition_cols=["par_int"],
+    )["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
+    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-redshift")
+    with engine.connect() as con:
+        cursor = con.execute(f"SELECT * FROM {external_schema}.test_redshift_spectrum")
+        rows = cursor.fetchall()
+        assert len(rows) == len(df.index)
+        for row in rows:
+            assert len(row) == len(df.columns)
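The new test checks Spectrum by running a raw SELECT through the engine connection. An equivalent check could pull the external table straight into a DataFrame with wr.db.read_sql_query; a sketch under the same fixtures, not part of the commit (the schema name is the value yielded by the external_schema fixture):

import awswrangler as wr

# Read the Spectrum external table back into a Pandas DataFrame
# (engine and schema as in the test above; an illustrative sketch only).
engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
df2 = wr.db.read_sql_query(
    sql="SELECT * FROM aws_data_wrangler_external.test_redshift_spectrum",
    con=engine,
)
assert len(df2.index) == 5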

0 commit comments
