Skip to content

Commit 65904ce

Browse files
committed
Fix Redshift data type conversions
1 parent 48c4dfc commit 65904ce

File tree

4 files changed

+66
-22
lines changed

4 files changed

+66
-22
lines changed

awswrangler/glue.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def type_pandas2athena(dtype):
5656
return "double"
5757
elif dtype == "bool":
5858
return "boolean"
59-
elif dtype == "object" and isinstance(dtype, str):
59+
elif dtype == "object":
6060
return "string"
6161
elif dtype[:10] == "datetime64":
6262
return "timestamp"

awswrangler/redshift.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -341,11 +341,9 @@ def _get_redshift_schema(dataframe, dataframe_type, preserve_index=False):
341341
dtype = str(dataframe.index.dtype)
342342
redshift_type = Redshift._type_pandas2redshift(dtype)
343343
schema_built.append((name, redshift_type))
344-
for col in dataframe.columns:
345-
name = str(col)
346-
dtype = str(dataframe[name].dtype)
344+
for col, dtype in dataframe.dtypes:
347345
redshift_type = Redshift._type_pandas2redshift(dtype)
348-
schema_built.append((name, redshift_type))
346+
schema_built.append((col, redshift_type))
349347
elif dataframe_type == "spark":
350348
for name, dtype in dataframe.dtypes:
351349
redshift_type = Redshift._type_spark2redshift(dtype)
@@ -377,17 +375,17 @@ def _type_pandas2redshift(dtype):
377375
@staticmethod
378376
def _type_spark2redshift(dtype):
379377
dtype = dtype.lower()
380-
if dtype == "int":
381-
return "INTEGER"
382-
elif dtype == "long":
378+
if dtype in ["smallint", "int", "bigint"]:
383379
return "BIGINT"
384380
elif dtype == "float":
381+
return "FLOAT4"
382+
elif dtype == "double":
385383
return "FLOAT8"
386384
elif dtype == "bool":
387385
return "BOOLEAN"
386+
elif dtype == "timestamp":
387+
return "TIMESTAMP"
388388
elif dtype == "string":
389389
return "VARCHAR(256)"
390-
elif dtype[:10] == "datetime.datetime":
391-
return "TIMESTAMP"
392390
else:
393391
raise UnsupportedType("Unsupported Spark type: " + dtype)

awswrangler/spark.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from pyspark.sql.functions import pandas_udf, PandasUDFType
66
from pyspark.sql.functions import floor, rand
7+
from pyspark.sql.types import TimestampType
78

89
from awswrangler.exceptions import MissingBatchDetected
910

@@ -16,9 +17,29 @@ class Spark:
1617
def __init__(self, session):
1718
self._session = session
1819

19-
def read_csv(self, path):
20+
def read_csv(self, **args):
2021
spark = self._session.spark_session
21-
return spark.read.csv(path=path, header=True)
22+
return spark.read.csv(**args)
23+
24+
@staticmethod
25+
def _extract_casts(dtypes):
26+
casts = {}
27+
for col, dtype in dtypes:
28+
if dtype in ["smallint", "int", "bigint"]:
29+
casts[col] = "Int64"
30+
elif dtype == "object":
31+
casts[col] = "str"
32+
logger.debug(f"casts: {casts}")
33+
return casts
34+
35+
@staticmethod
36+
def date2timestamp(dataframe):
37+
for col, dtype in dataframe.dtypes:
38+
if dtype == "date":
39+
dataframe = dataframe.withColumn(
40+
col, dataframe[col].cast(TimestampType()))
41+
logger.warning(f"Casting column {col} from date to timestamp!")
42+
return dataframe
2243

2344
def to_redshift(
2445
self,
@@ -57,6 +78,7 @@ def to_redshift(
5778
path += "/"
5879
self._session.s3.delete_objects(path=path)
5980
spark = self._session.spark_session
81+
dataframe = Spark.date2timestamp(dataframe)
6082
dataframe.cache()
6183
num_rows = dataframe.count()
6284
logger.info(f"Number of rows: {num_rows}")
@@ -72,6 +94,7 @@ def to_redshift(
7294
logger.debug(f"Number of partitions calculated: {num_partitions}")
7395
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
7496
session_primitives = self._session.primitives
97+
casts = Spark._extract_casts(dataframe.dtypes)
7598

7699
@pandas_udf(returnType="objects_paths string",
77100
functionType=PandasUDFType.GROUPED_MAP)
@@ -83,7 +106,7 @@ def write(pandas_dataframe):
83106
preserve_index=False,
84107
mode="append",
85108
procs_cpu_bound=1,
86-
)
109+
cast_columns=casts)
87110
return pandas.DataFrame.from_dict({"objects_paths": paths})
88111

89112
df_objects_paths = (dataframe.withColumn(

testing/test_awswrangler/test_redshift.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,22 @@ def redshift_parameters(cloudformation_outputs):
8686
def test_to_redshift_pandas(session, bucket, redshift_parameters, sample_name,
8787
mode, factor, diststyle, distkey, sortstyle,
8888
sortkey):
89+
if sample_name == "micro":
90+
dates = ["date"]
91+
if sample_name == "small":
92+
dates = ["date"]
93+
if sample_name == "nano":
94+
dates = ["date", "time"]
95+
dataframe = pandas.read_csv(f"data_samples/{sample_name}.csv",
96+
parse_dates=dates,
97+
infer_datetime_format=True)
8998
con = Redshift.generate_connection(
9099
database="test",
91100
host=redshift_parameters.get("RedshiftAddress"),
92101
port=redshift_parameters.get("RedshiftPort"),
93102
user="test",
94103
password=redshift_parameters.get("RedshiftPassword"),
95104
)
96-
dataframe = pandas.read_csv(f"data_samples/{sample_name}.csv")
97105
path = f"s3://{bucket}/redshift-load/"
98106
session.pandas.to_redshift(
99107
dataframe=dataframe,
@@ -110,11 +118,12 @@ def test_to_redshift_pandas(session, bucket, redshift_parameters, sample_name,
110118
preserve_index=False,
111119
)
112120
cursor = con.cursor()
113-
cursor.execute("SELECT COUNT(*) as counter from public.test")
114-
counter = cursor.fetchall()[0][0]
121+
cursor.execute("SELECT * from public.test")
122+
rows = cursor.fetchall()
115123
cursor.close()
116124
con.close()
117-
assert len(dataframe.index) * factor == counter
125+
assert len(dataframe.index) * factor == len(rows)
126+
assert len(list(dataframe.columns)) == len(list(rows[0]))
118127

119128

120129
@pytest.mark.parametrize(
@@ -135,14 +144,14 @@ def test_to_redshift_pandas(session, bucket, redshift_parameters, sample_name,
135144
def test_to_redshift_pandas_exceptions(session, bucket, redshift_parameters,
136145
sample_name, mode, factor, diststyle,
137146
distkey, sortstyle, sortkey, exc):
147+
dataframe = pandas.read_csv(f"data_samples/{sample_name}.csv")
138148
con = Redshift.generate_connection(
139149
database="test",
140150
host=redshift_parameters.get("RedshiftAddress"),
141151
port=redshift_parameters.get("RedshiftPort"),
142152
user="test",
143153
password=redshift_parameters.get("RedshiftPassword"),
144154
)
145-
dataframe = pandas.read_csv(f"data_samples/{sample_name}.csv")
146155
path = f"s3://{bucket}/redshift-load/"
147156
with pytest.raises(exc):
148157
assert session.pandas.to_redshift(
@@ -180,7 +189,20 @@ def test_to_redshift_spark(session, bucket, redshift_parameters, sample_name,
180189
mode, factor, diststyle, distkey, sortstyle,
181190
sortkey):
182191
path = f"data_samples/{sample_name}.csv"
183-
dataframe = session.spark.read_csv(path=path)
192+
if sample_name == "micro":
193+
schema = "id SMALLINT, name STRING, value FLOAT, date TIMESTAMP"
194+
timestamp_format = "yyyy-MM-dd"
195+
elif sample_name == "small":
196+
schema = "id BIGINT, name STRING, date DATE"
197+
timestamp_format = "dd-MM-yy"
198+
elif sample_name == "nano":
199+
schema = "id INTEGER, name STRING, value DOUBLE, date TIMESTAMP, time TIMESTAMP"
200+
timestamp_format = "yyyy-MM-dd"
201+
dataframe = session.spark.read_csv(path=path,
202+
schema=schema,
203+
timestampFormat=timestamp_format,
204+
dateFormat=timestamp_format,
205+
header=True)
184206
con = Redshift.generate_connection(
185207
database="test",
186208
host=redshift_parameters.get("RedshiftAddress"),
@@ -203,11 +225,12 @@ def test_to_redshift_spark(session, bucket, redshift_parameters, sample_name,
203225
min_num_partitions=2,
204226
)
205227
cursor = con.cursor()
206-
cursor.execute("SELECT COUNT(*) as counter from public.test")
207-
counter = cursor.fetchall()[0][0]
228+
cursor.execute("SELECT * from public.test")
229+
rows = cursor.fetchall()
208230
cursor.close()
209231
con.close()
210-
assert dataframe.count() * factor == counter
232+
assert (dataframe.count() * factor) == len(rows)
233+
assert len(list(dataframe.columns)) == len(list(rows[0]))
211234

212235

213236
@pytest.mark.parametrize(

0 commit comments

Comments (0)