Commit 7574474

Merge pull request #33 from awslabs/pandas-list-support

Adding support for lists to Pandas

2 parents 561f139 + 31ab849

File tree: 5 files changed (+198, -40 lines)

awswrangler/athena.py

Lines changed: 7 additions & 1 deletion
@@ -1,5 +1,6 @@
 from time import sleep
 import logging
+import ast
 
 from awswrangler.exceptions import UnsupportedType, QueryFailed, QueryCancelled
 
@@ -35,6 +36,8 @@ def _type_athena2pandas(dtype):
             return "datetime64"
         elif dtype == "date":
             return "date"
+        elif dtype == "array":
+            return "literal_eval"
         else:
             raise UnsupportedType(f"Unsupported Athena type: {dtype}")

@@ -44,18 +47,21 @@ def get_query_dtype(self, query_execution_id):
         dtype = {}
         parse_timestamps = []
         parse_dates = []
+        converters = {}
         for col_name, col_type in cols_metadata.items():
             ptype = Athena._type_athena2pandas(dtype=col_type)
             if ptype in ["datetime64", "date"]:
                 parse_timestamps.append(col_name)
                 if ptype == "date":
                     parse_dates.append(col_name)
+            elif ptype == "literal_eval":
+                converters[col_name] = ast.literal_eval
             else:
                 dtype[col_name] = ptype
         logger.debug(f"dtype: {dtype}")
         logger.debug(f"parse_timestamps: {parse_timestamps}")
         logger.debug(f"parse_dates: {parse_dates}")
-        return dtype, parse_timestamps, parse_dates
+        return dtype, parse_timestamps, parse_dates, converters
 
     def create_athena_bucket(self):
         """

awswrangler/glue.py

Lines changed: 50 additions & 24 deletions
@@ -45,25 +45,53 @@ def get_table_python_types(self, database, table):
         dtypes = self.get_table_athena_types(database=database, table=table)
         return {k: Glue.type_athena2python(v) for k, v in dtypes.items()}
 
+    @staticmethod
+    def type_athena2pyarrow(dtype):
+        dtype = dtype.lower()
+        if dtype == "tinyint":
+            return "int8"
+        if dtype == "smallint":
+            return "int16"
+        elif dtype in ["int", "integer"]:
+            return "int32"
+        elif dtype == "bigint":
+            return "int64"
+        elif dtype == "float":
+            return "float32"
+        elif dtype == "double":
+            return "float64"
+        elif dtype in ["boolean", "bool"]:
+            return "bool"
+        elif dtype in ["string", "char", "varchar", "array", "row", "map"]:
+            return "string"
+        elif dtype == "timestamp":
+            return "timestamp[ns]"
+        elif dtype == "date":
+            return "date32"
+        else:
+            raise UnsupportedType(f"Unsupported Athena type: {dtype}")
+
     @staticmethod
     def type_pyarrow2athena(dtype):
-        dtype = str(dtype).lower()
-        if dtype == "int32":
+        dtype_str = str(dtype).lower()
+        if dtype_str == "int32":
             return "int"
-        elif dtype == "int64":
+        elif dtype_str == "int64":
             return "bigint"
-        elif dtype == "float":
+        elif dtype_str == "float":
             return "float"
-        elif dtype == "double":
+        elif dtype_str == "double":
             return "double"
-        elif dtype == "bool":
+        elif dtype_str == "bool":
             return "boolean"
-        elif dtype == "string":
+        elif dtype_str == "string":
             return "string"
-        elif dtype.startswith("timestamp"):
+        elif dtype_str.startswith("timestamp"):
             return "timestamp"
-        elif dtype.startswith("date"):
+        elif dtype_str.startswith("date"):
             return "date"
+        elif dtype_str.startswith("list"):
+            return f"array<{Glue.type_pyarrow2athena(dtype.value_type)}>"
         else:
             raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
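The new list branch is why type_pyarrow2athena now keeps dtype as a pyarrow type object instead of converting it to a string up front: it needs dtype.value_type to recurse on the element type, mapping nested pyarrow lists onto nested Athena array<...> types. A quick sketch of the mapping, assuming the class is imported as shown:

import pyarrow
from awswrangler.glue import Glue

print(Glue.type_pyarrow2athena(pyarrow.list_(pyarrow.int64())))
# -> array<bigint>
print(Glue.type_pyarrow2athena(pyarrow.list_(pyarrow.list_(pyarrow.string()))))
# -> array<array<string>> (recursion on value_type)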

@@ -239,28 +267,22 @@ def get_connection_details(self, name):
             Name=name, HidePassword=False)["Connection"]
 
     @staticmethod
-    def _extract_pyarrow_schema(dataframe, preserve_index, cast_columns=None):
+    def _extract_pyarrow_schema(dataframe, preserve_index):
         cols = []
         cols_dtypes = {}
         schema = []
 
-        casted = []
-        if cast_columns is not None:
-            casted = cast_columns.keys()
-
         for name, dtype in dataframe.dtypes.to_dict().items():
             dtype = str(dtype)
             if dtype == "Int64":
                 cols_dtypes[name] = "int64"
-            elif name in casted:
-                cols_dtypes[name] = cast_columns[name]
             else:
                 cols.append(name)
 
         for field in pyarrow.Schema.from_pandas(df=dataframe[cols],
                                                 preserve_index=preserve_index):
             name = str(field.name)
-            dtype = str(field.type)
+            dtype = field.type
             cols_dtypes[name] = dtype
             if name not in dataframe.columns:
                 schema.append((name, dtype))
@@ -279,18 +301,22 @@ def _build_schema(dataframe,
             partition_cols = []
 
         pyarrow_schema = Glue._extract_pyarrow_schema(
-            dataframe=dataframe,
-            preserve_index=preserve_index,
-            cast_columns=cast_columns)
+            dataframe=dataframe, preserve_index=preserve_index)
 
         schema_built = []
         partition_cols_types = {}
         for name, dtype in pyarrow_schema:
-            athena_type = Glue.type_pyarrow2athena(dtype)
-            if name in partition_cols:
-                partition_cols_types[name] = athena_type
+            if (cast_columns is not None) and (name in cast_columns.keys()):
+                if name in partition_cols:
+                    partition_cols_types[name] = cast_columns[name]
+                else:
+                    schema_built.append((name, cast_columns[name]))
             else:
-                schema_built.append((name, athena_type))
+                athena_type = Glue.type_pyarrow2athena(dtype)
+                if name in partition_cols:
+                    partition_cols_types[name] = athena_type
+                else:
+                    schema_built.append((name, athena_type))
 
         partition_cols_schema_built = [(name, partition_cols_types[name])
                                        for name in partition_cols]
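After this restructuring, an entry in cast_columns always takes precedence over the pyarrow-inferred type when the Glue schema is built, and type_pyarrow2athena is only consulted for uncast columns. A standalone illustration of that precedence rule (column names and types below are hypothetical):

import pyarrow
from awswrangler.glue import Glue

cast_columns = {"value": "bigint"}
pyarrow_schema = [("value", pyarrow.float64()),
                  ("tags", pyarrow.list_(pyarrow.string()))]

schema_built = []
for name, dtype in pyarrow_schema:
    if (cast_columns is not None) and (name in cast_columns.keys()):
        schema_built.append((name, cast_columns[name]))  # explicit cast wins
    else:
        schema_built.append((name, Glue.type_pyarrow2athena(dtype)))

# schema_built == [("value", "bigint"), ("tags", "array<string>")]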

awswrangler/pandas.py

Lines changed: 24 additions & 10 deletions
@@ -14,7 +14,7 @@
     AthenaQueryError, EmptyS3Object, LineTerminatorNotFound, EmptyDataframe, \
     InvalidSerDe, InvalidCompression
 from awswrangler.utils import calculate_bounders
-from awswrangler import s3
+from awswrangler import s3, glue
 
 logger = logging.getLogger(__name__)
 
@@ -56,6 +56,7 @@ def read_csv(
             parse_dates=False,
             infer_datetime_format=False,
             encoding="utf-8",
+            converters=None,
     ):
         """
         Read CSV file from AWS S3 using optimized strategies.
@@ -76,6 +77,7 @@ def read_csv(
         :param parse_dates: Same as pandas.read_csv()
         :param infer_datetime_format: Same as pandas.read_csv()
         :param encoding: Same as pandas.read_csv()
+        :param converters: Same as pandas.read_csv()
         :return: Pandas Dataframe or Iterator of Pandas Dataframes if max_result_size != None
         """
         bucket_name, key_path = self._parse_path(path)
@@ -99,7 +101,8 @@ def read_csv(
                 escapechar=escapechar,
                 parse_dates=parse_dates,
                 infer_datetime_format=infer_datetime_format,
-                encoding=encoding)
+                encoding=encoding,
+                converters=converters)
         else:
             ret = Pandas._read_csv_once(
                 client_s3=client_s3,
@@ -115,7 +118,8 @@ def read_csv(
                 escapechar=escapechar,
                 parse_dates=parse_dates,
                 infer_datetime_format=infer_datetime_format,
-                encoding=encoding)
+                encoding=encoding,
+                converters=converters)
         return ret
 
     @staticmethod
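The new converters argument is forwarded untouched to pandas.read_csv(). A hypothetical call (the bucket, key, and column name are made up):

import ast
import awswrangler

session = awswrangler.Session()  # assumed session setup
df = session.pandas.read_csv(
    path="s3://my-bucket/results.csv",      # made-up path
    converters={"tags": ast.literal_eval})  # rebuild list-typed cells as Python lists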
@@ -135,6 +139,7 @@ def _read_csv_iterator(
             parse_dates=False,
             infer_datetime_format=False,
             encoding="utf-8",
+            converters=None,
     ):
         """
         Read CSV file from AWS S3 using optimized strategies.
@@ -156,6 +161,7 @@ def _read_csv_iterator(
         :param parse_dates: Same as pandas.read_csv()
         :param infer_datetime_format: Same as pandas.read_csv()
         :param encoding: Same as pandas.read_csv()
+        :param converters: Same as pandas.read_csv()
         :return: Pandas Dataframe
         """
         metadata = s3.S3.head_object_with_retry(client=client_s3,
@@ -181,7 +187,8 @@ def _read_csv_iterator(
                 escapechar=escapechar,
                 parse_dates=parse_dates,
                 infer_datetime_format=infer_datetime_format,
-                encoding=encoding)
+                encoding=encoding,
+                converters=converters)
         else:
             bounders = calculate_bounders(num_items=total_size,
                                           max_size=max_result_size)
@@ -234,7 +241,7 @@ def _read_csv_iterator(
                     lineterminator=lineterminator,
                     dtype=dtype,
                     encoding=encoding,
-                )
+                    converters=converters)
                 yield df
                 if count == 1:  # first chunk
                     names = df.columns
@@ -352,6 +359,7 @@ def _read_csv_once(
             parse_dates=False,
             infer_datetime_format=False,
             encoding=None,
+            converters=None,
     ):
         """
         Read CSV file from AWS S3 using optimized strategies.
@@ -372,6 +380,7 @@ def _read_csv_once(
         :param parse_dates: Same as pandas.read_csv()
         :param infer_datetime_format: Same as pandas.read_csv()
         :param encoding: Same as pandas.read_csv()
+        :param converters: Same as pandas.read_csv()
         :return: Pandas Dataframe
         """
         buff = BytesIO()
@@ -392,6 +401,7 @@ def _read_csv_once(
             lineterminator=lineterminator,
             dtype=dtype,
             encoding=encoding,
+            converters=converters,
         )
         buff.close()
         return dataframe
@@ -425,12 +435,13 @@ def read_sql_athena(self,
             message_error = f"Query error: {reason}"
             raise AthenaQueryError(message_error)
         else:
-            dtype, parse_timestamps, parse_dates = self._session.athena.get_query_dtype(
+            dtype, parse_timestamps, parse_dates, converters = self._session.athena.get_query_dtype(
                 query_execution_id=query_execution_id)
             path = f"{s3_output}{query_execution_id}.csv"
             ret = self.read_csv(path=path,
                                 dtype=dtype,
                                 parse_dates=parse_timestamps,
+                                converters=converters,
                                 quoting=csv.QUOTE_ALL,
                                 max_result_size=max_result_size)
             if max_result_size is None:
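End to end, array columns in an Athena query result now come back as Python lists rather than raw strings. A hypothetical usage, assuming a table with an array<int> column named tags (the table, query, and database are made up):

import awswrangler

session = awswrangler.Session()  # assumed session setup
df = session.pandas.read_sql_athena(
    sql="SELECT id, tags FROM my_table",  # made-up query
    database="my_database")               # made-up database
# df["tags"] cells are Python lists (e.g. [1, 2, 3]) via ast.literal_eval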
@@ -848,18 +859,21 @@ def write_parquet_dataframe(dataframe,
             if str(dtype) == "Int64":
                 dataframe[name] = dataframe[name].astype("float64")
                 casted_in_pandas.append(name)
-                cast_columns[name] = "int64"
+                cast_columns[name] = "bigint"
                 logger.debug(f"Casting column {name} Int64 to float64")
         table = pyarrow.Table.from_pandas(df=dataframe,
                                           preserve_index=preserve_index,
                                           safe=False)
         if cast_columns:
             for col_name, dtype in cast_columns.items():
                 col_index = table.column_names.index(col_name)
-                table = table.set_column(col_index,
-                                         table.column(col_name).cast(dtype))
+                pyarrow_dtype = glue.Glue.type_athena2pyarrow(dtype)
+                table = table.set_column(
+                    col_index,
+                    table.column(col_name).cast(pyarrow_dtype))
                 logger.debug(
-                    f"Casting column {col_name} ({col_index}) to {dtype}")
+                    f"Casting column {col_name} ({col_index}) to {dtype} ({pyarrow_dtype})"
+                )
         with fs.open(path, "wb") as f:
             parquet.write_table(table,
                                 f,
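Combined with the Glue changes above, writing a dataframe that holds Python lists now yields an Athena array<...> column: pyarrow infers list<item: string> for the tags column below, and type_pyarrow2athena registers it as array<string> in the Glue catalog. A hypothetical sketch (the database and S3 path are made up):

import pandas as pd
import awswrangler

df = pd.DataFrame({"id": [1, 2], "tags": [["a", "b"], ["c"]]})

session = awswrangler.Session()  # assumed session setup
session.pandas.to_parquet(
    dataframe=df,
    database="my_database",           # made-up Glue database
    path="s3://my-bucket/my-table/")  # made-up S3 path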

testing/test_awswrangler/test_cloudwatchlogs.py

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ def logstream(cloudformation_outputs, loggroup):
     if token:
         args["sequenceToken"] = token
     client.put_log_events(**args)
-    sleep(180)
+    sleep(300)
     yield logstream
 
 
