
Commit 1a38261

Improving SQL on Athena iterator.
1 parent 1b50043 commit 1a38261

File tree

9 files changed (+462 -74 lines)


awswrangler/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 __title__ = "awswrangler"
 __description__ = "Utility belt to handle data on AWS."
-__version__ = "0.0b20"
+__version__ = "0.0b27"
 __license__ = "Apache License 2.0"

awswrangler/athena.py

Lines changed: 39 additions & 0 deletions
@@ -1,6 +1,8 @@
 from time import sleep
 import logging

+from awswrangler.exceptions import UnsupportedType
+
 logger = logging.getLogger(__name__)

 QUERY_WAIT_POLLING_DELAY = 0.2  # MILLISECONDS
@@ -12,6 +14,43 @@ def __init__(self, session):
         self._client_athena = session.boto3_session.client(
             service_name="athena", config=session.botocore_config)

+    def get_query_columns_metadata(self, query_execution_id):
+        response = self._client_athena.get_query_results(
+            QueryExecutionId=query_execution_id, MaxResults=1)
+        col_info = response["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]
+        return {x["Name"]: x["Type"] for x in col_info}
+
+    @staticmethod
+    def _type_athena2pandas(dtype):
+        dtype = dtype.lower()
+        if dtype in ["int", "integer", "bigint", "smallint", "tinyint"]:
+            return "Int64"
+        elif dtype in ["float", "double", "real"]:
+            return "float64"
+        elif dtype == "boolean":
+            return "bool"
+        elif dtype in ["string", "char", "varchar", "array", "row", "map"]:
+            return "object"
+        elif dtype in ["timestamp", "date"]:
+            return "datetime64"
+        else:
+            raise UnsupportedType(f"Unsupported Athena type: {dtype}")
+
+    def get_query_dtype(self, query_execution_id):
+        cols_metadata = self.get_query_columns_metadata(
+            query_execution_id=query_execution_id)
+        dtype = {}
+        parse_dates = []
+        for col_name, col_type in cols_metadata.items():
+            ptype = Athena._type_athena2pandas(dtype=col_type)
+            if ptype == "datetime64":
+                parse_dates.append(col_name)
+            else:
+                dtype[col_name] = ptype
+        logger.debug(f"dtype: {dtype}")
+        logger.debug(f"parse_dates: {parse_dates}")
+        return dtype, parse_dates
+
     def run_query(self, query, database, s3_output):
         response = self._client_athena.start_query_execution(
             QueryString=query,
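
Together, these helpers let the result CSV that Athena drops on S3 be read back with faithful dtypes. A minimal sketch of the flow (the session wiring, table, bucket, and local CSV path are illustrative; it assumes run_query returns the QueryExecutionId and that the query has already reached SUCCEEDED):

    import pandas

    import awswrangler

    session = awswrangler.Session()
    query_execution_id = session.athena.run_query(
        query="SELECT * FROM my_table",  # hypothetical table
        database="my_database",
        s3_output="s3://my-bucket/athena-results/")
    # ... wait for the query to finish ...
    dtype, parse_dates = session.athena.get_query_dtype(
        query_execution_id=query_execution_id)
    # Integer columns map to pandas' nullable Int64; timestamp/date
    # columns go into parse_dates so read_csv parses them as datetime64.
    df = pandas.read_csv("local_copy_of_results.csv",
                         dtype=dtype,
                         parse_dates=parse_dates)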

awswrangler/exceptions.py

Lines changed: 4 additions & 0 deletions
@@ -30,5 +30,9 @@ class EmptyS3Object(Exception):
     pass


+class LineTerminatorNotFound(Exception):
+    pass
+
+
 class MissingBatchDetected(Exception):
     pass

awswrangler/glue.py

Lines changed: 31 additions & 29 deletions
@@ -1,6 +1,7 @@
 from math import ceil
 import re
 import logging
+from datetime import datetime, date

 from awswrangler.exceptions import UnsupportedType, UnsupportedFileFormat

@@ -13,7 +14,7 @@ def __init__(self, session):
         self._client_glue = session.boto3_session.client(
             service_name="glue", config=session.botocore_config)

-    def get_table_dtypes(self, database, table):
+    def get_table_athena_types(self, database, table):
         """
         Get all columns names and the related data types
         :param database: Glue database's name
@@ -37,24 +38,44 @@ def get_table_python_types(self, database, table):
         :param table: Glue table's name
         :return: A dictionary as {"col name": "col python type"}
         """
-        dtypes = self.get_table_dtypes(database=database, table=table)
+        dtypes = self.get_table_athena_types(database=database, table=table)
         return {k: Glue._type_athena2python(v) for k, v in dtypes.items()}

+    @staticmethod
+    def _type_pandas2athena(dtype):
+        dtype = dtype.lower()
+        if dtype == "int32":
+            return "int"
+        elif dtype in ["int64", "Int64"]:
+            return "bigint"
+        elif dtype == "float32":
+            return "float"
+        elif dtype == "float64":
+            return "double"
+        elif dtype == "bool":
+            return "boolean"
+        elif dtype == "object" and isinstance(dtype, str):
+            return "string"
+        elif dtype[:10] == "datetime64":
+            return "timestamp"
+        else:
+            raise UnsupportedType(f"Unsupported Pandas type: {dtype}")
+
     @staticmethod
     def _type_athena2python(dtype):
         dtype = dtype.lower()
-        if dtype == "int":
-            return int
-        elif dtype == "bigint":
+        if dtype in ["int", "integer", "bigint", "smallint", "tinyint"]:
             return int
-        elif dtype == "float":
-            return float
-        elif dtype == "double":
+        elif dtype in ["float", "double", "real"]:
             return float
         elif dtype == "boolean":
             return bool
-        elif dtype == "string":
+        elif dtype in ["string", "char", "varchar", "array", "row", "map"]:
             return str
+        elif dtype == "timestamp":
+            return datetime
+        elif dtype == "date":
+            return date
         else:
             raise UnsupportedType(f"Unsupported Athena type: {dtype}")

@@ -157,6 +178,7 @@ def _build_schema(dataframe,
                       partition_cols,
                       preserve_index,
                       cast_columns=None):
+        print(f"dataframe.dtypes:\n{dataframe.dtypes}")
         if not partition_cols:
             partition_cols = []
         schema_built = []
@@ -180,26 +202,6 @@ def _build_schema(dataframe,
         logger.debug(f"schema_built:\n{schema_built}")
         return schema_built

-    @staticmethod
-    def _type_pandas2athena(dtype):
-        dtype = dtype.lower()
-        if dtype == "int32":
-            return "int"
-        elif dtype == "int64":
-            return "bigint"
-        elif dtype == "float32":
-            return "float"
-        elif dtype == "float64":
-            return "double"
-        elif dtype == "bool":
-            return "boolean"
-        elif dtype == "object" and isinstance(dtype, str):
-            return "string"
-        elif dtype[:10] == "datetime64":
-            return "timestamp"
-        else:
-            raise UnsupportedType(f"Unsupported Pandas type: {dtype}")
-
     @staticmethod
     def _parse_table_name(path):
         if path[-1] == "/":
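
For reference, a sketch of what the widened pandas-to-Athena mapping produces (the DataFrame is invented, and the import path is assumed; note that _type_pandas2athena lower-cases its input first, so pandas' nullable "Int64" arrives as "int64" and maps to bigint):

    import pandas

    from awswrangler.glue import Glue  # assumed import path

    df = pandas.DataFrame({
        "id": pandas.Series([1, 2, None], dtype="Int64"),  # -> bigint
        "price": [9.5, 3.0, 1.2],                          # float64 -> double
        "name": ["a", "b", "c"],                           # object -> string
        "ts": pandas.to_datetime(["2019-01-01"] * 3),      # datetime64[ns] -> timestamp
    })
    for name, dtype in df.dtypes.items():
        print(name, "->", Glue._type_pandas2athena(str(dtype)))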

awswrangler/pandas.py

Lines changed: 106 additions & 32 deletions
@@ -2,12 +2,14 @@
 import multiprocessing as mp
 import logging
 from math import floor
+import copy
+import csv

 import pandas
 import pyarrow
 from pyarrow import parquet

-from awswrangler.exceptions import UnsupportedWriteMode, UnsupportedFileFormat, AthenaQueryError, EmptyS3Object
+from awswrangler.exceptions import UnsupportedWriteMode, UnsupportedFileFormat, AthenaQueryError, EmptyS3Object, LineTerminatorNotFound
 from awswrangler.utils import calculate_bounders
 from awswrangler import s3
@@ -41,7 +43,7 @@ def read_csv(
             sep=",",
             lineterminator="\n",
             quotechar='"',
-            quoting=0,
+            quoting=csv.QUOTE_MINIMAL,
             escapechar=None,
             parse_dates=False,
             infer_datetime_format=False,
@@ -119,7 +121,7 @@ def _read_csv_iterator(
             sep=",",
             lineterminator="\n",
             quotechar='"',
-            quoting=0,
+            quoting=csv.QUOTE_MINIMAL,
             escapechar=None,
             parse_dates=False,
             infer_datetime_format=False,
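
One detail worth calling out: the default itself does not change here, since csv.QUOTE_MINIMAL is the integer 0 that was hard-coded before; the named constant is just self-documenting. A quick check in any Python shell:

    import csv

    assert csv.QUOTE_MINIMAL == 0
    assert csv.QUOTE_ALL == 1  # what read_sql_athena passes below
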
@@ -177,38 +179,38 @@ def _read_csv_iterator(
         bounders_len = len(bounders)
         count = 0
         forgotten_bytes = 0
-        cols_names = None
         for ini, end in bounders:
             count += 1
+
             ini -= forgotten_bytes
             end -= 1  # Range is inclusive, contrary to Python's List
             bytes_range = "bytes={}-{}".format(ini, end)
             logger.debug(f"bytes_range: {bytes_range}")
             body = client_s3.get_object(Bucket=bucket_name, Key=key_path, Range=bytes_range)["Body"]\
                 .read()\
-                .decode(encoding, errors="ignore")
+                .decode("utf-8")
             chunk_size = len(body)
             logger.debug(f"chunk_size: {chunk_size}")
-            if body[0] == lineterminator:
-                first_char = 1
-            else:
-                first_char = 0
-            if (count == 1) and (count == bounders_len):
-                last_break_line_idx = chunk_size
-            elif count == 1:  # first chunk
-                last_break_line_idx = body.rindex(lineterminator)
-                forgotten_bytes = chunk_size - last_break_line_idx
+
+            if count == 1:  # first chunk
+                last_char = Pandas._find_terminator(
+                    body=body,
+                    quoting=quoting,
+                    quotechar=quotechar,
+                    lineterminator=lineterminator)
+                forgotten_bytes = len(body[last_char:].encode("utf-8"))
             elif count == bounders_len:  # Last chunk
-                header = None
-                names = cols_names
-                last_break_line_idx = chunk_size
+                last_char = chunk_size
             else:
-                header = None
-                names = cols_names
-                last_break_line_idx = body.rindex(lineterminator)
-                forgotten_bytes = chunk_size - last_break_line_idx
+                last_char = Pandas._find_terminator(
+                    body=body,
+                    quoting=quoting,
+                    quotechar=quotechar,
+                    lineterminator=lineterminator)
+                forgotten_bytes = len(body[last_char:].encode("utf-8"))
+
             df = pandas.read_csv(
-                StringIO(body[first_char:last_break_line_idx]),
+                StringIO(body[:last_char]),
                 header=header,
                 names=names,
                 sep=sep,
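
The bookkeeping above is byte-oriented even though the terminator scan works on decoded text, hence the re-encode when computing forgotten_bytes. A tiny worked sketch of the simple (non-QUOTE_ALL) case, with invented values:

    # A chunk that was cut mid-record:
    body = "col_a,col_b\n1,2\n3,"
    last_char = body.rindex("\n")  # 15: the last complete record ends here
    forgotten_bytes = len(body[last_char:].encode("utf-8"))  # 3 ("\n3,")
    # The next GetObject Range starts forgotten_bytes earlier, so the
    # truncated tail "3," is re-read as the head of the following chunk.
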
@@ -223,7 +225,64 @@ def _read_csv_iterator(
             )
             yield df
             if count == 1:  # first chunk
-                cols_names = df.columns
+                names = df.columns
+                header = None
+
+    @staticmethod
+    def _find_terminator(body, quoting, quotechar, lineterminator):
+        """
+        Scan the body from end to start for a plausible line terminator
+        :param body: String
+        :param quoting: Same as pandas.read_csv()
+        :param quotechar: Same as pandas.read_csv()
+        :param lineterminator: Same as pandas.read_csv()
+        :return: The index of the suspected line terminator
+        """
+        try:
+            if quoting == csv.QUOTE_ALL:
+                index = body.rindex(lineterminator)
+                while True:
+                    i = 0
+                    while True:
+                        i += 1
+                        if index + i <= len(body) - 1:
+                            c = body[index + i]
+                            if c == ",":
+                                pass
+                            elif c == quotechar:
+                                right = True
+                                break
+                            else:
+                                right = False
+                                break
+                        else:
+                            right = True
+                            break
+                    i = 0
+                    while True:
+                        i += 1
+                        if index - i >= 0:
+                            c = body[index - i]
+                            if c == ",":
+                                pass
+                            elif c == quotechar:
+                                left = True
+                                break
+                            else:
+                                left = False
+                                break
+                        else:
+                            left = True
+                            break
+
+                    if right and left:
+                        break
+                    index = body[:index].rindex(lineterminator)
+            else:
+                index = body.rindex(lineterminator)
+        except ValueError:
+            raise LineTerminatorNotFound()
+        return index

     @staticmethod
     def _read_csv_once(
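
The two inner loops implement a neighbour test: under QUOTE_ALL, a lineterminator only counts as a record boundary if, skipping commas, the nearest characters on both sides are quote characters; otherwise the scan falls back to the previous candidate. A hedged illustration (the import path is assumed, and the call exercises a private helper):

    import csv

    from awswrangler.pandas import Pandas  # assumed import path

    # Chunk cut mid-record; the "\n" at index 14 sits inside a quoted field.
    body = '"a","b"\n"c","x\ny'
    index = Pandas._find_terminator(body=body,
                                    quoting=csv.QUOTE_ALL,
                                    quotechar='"',
                                    lineterminator="\n")
    assert index == 7  # falls back to the real boundary after '"b"'
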
@@ -293,7 +352,7 @@ def read_sql_athena(self,
         Executes any SQL query on AWS Athena and returns a DataFrame with the result.
         P.S. If max_result_size is passed, then an iterator of DataFrames is returned.
         :param sql: SQL Query
-        :param database: Glue/Athena Databease
+        :param database: Glue/Athena Database
         :param s3_output: AWS S3 path
         :param max_result_size: Max number of bytes on each request to S3
         :return: Pandas DataFrame or Iterator of Pandas DataFrames if max_result_size != None
@@ -318,8 +377,14 @@
             message_error = f"Query error: {reason}"
             raise AthenaQueryError(message_error)
         else:
+            dtype, parse_dates = self._session.athena.get_query_dtype(
+                query_execution_id=query_execution_id)
             path = f"{s3_output}{query_execution_id}.csv"
-            ret = self.read_csv(path=path, max_result_size=max_result_size)
+            ret = self.read_csv(path=path,
+                                dtype=dtype,
+                                parse_dates=parse_dates,
+                                quoting=csv.QUOTE_ALL,
+                                max_result_size=max_result_size)
         return ret

     def to_csv(
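
End to end, this is what the commit title refers to: Athena results are now read back with QUOTE_ALL quoting and the dtypes discovered from the query metadata, either in one shot or chunk by chunk. A usage sketch (names are placeholders, and it assumes the Session exposes this module as session.pandas, mirroring the self._session.athena access above):

    import awswrangler

    session = awswrangler.Session()

    # Single DataFrame:
    df = session.pandas.read_sql_athena(
        sql="SELECT * FROM my_table",
        database="my_database",
        s3_output="s3://my-bucket/athena-results/")

    # Or an iterator of DataFrames, roughly 128 MB of CSV per request:
    for df in session.pandas.read_sql_athena(
            sql="SELECT * FROM my_table",
            database="my_database",
            s3_output="s3://my-bucket/athena-results/",
            max_result_size=128 * 1024 * 1024):
        print(len(df.index))
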
@@ -623,11 +688,18 @@ def write_csv_dataframe(dataframe, path, preserve_index, fs):
             f.write(csv_buffer)

     @staticmethod
-    def write_parquet_dataframe(dataframe,
-                                path,
-                                preserve_index,
-                                fs,
-                                cast_columns=None):
+    def write_parquet_dataframe(dataframe, path, preserve_index, fs,
+                                cast_columns):
+        if not cast_columns:
+            cast_columns = {}
+        casted_in_pandas = []
+        dtypes = copy.deepcopy(dataframe.dtypes.to_dict())
+        for name, dtype in dtypes.items():
+            if str(dtype) == "Int64":
+                dataframe[name] = dataframe[name].astype("float64")
+                casted_in_pandas.append(name)
+                cast_columns[name] = "int64"
+                logger.debug(f"Casting column {name} Int64 to float64")
         table = pyarrow.Table.from_pandas(df=dataframe,
                                           preserve_index=preserve_index,
                                           safe=False)
@@ -636,13 +708,15 @@ def write_parquet_dataframe(dataframe,
             col_index = table.column_names.index(col_name)
             table = table.set_column(col_index,
                                      table.column(col_name).cast(dtype))
-            logger.debug(f"{col_name} - {col_index} - {dtype}")
-        logger.debug(f"table.schema:\n{table.schema}")
+            logger.debug(
+                f"Casting column {col_name} ({col_index}) to {dtype}")
         with fs.open(path, "wb") as f:
             parquet.write_table(table,
                                 f,
                                 coerce_timestamps="ms",
                                 flavor="spark")
+        for col in casted_in_pandas:
+            dataframe[col] = dataframe[col].astype("Int64")

     def to_redshift(
             self,
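
The Int64 dance at the top of write_parquet_dataframe deserves a note: pandas' nullable Int64 extension type is first cast to float64 so pyarrow can ingest it (NaN standing in for nulls), cast_columns then restores int64 at the Arrow level, and finally the original DataFrame columns are set back to Int64 so the caller's frame is left untouched. A sketch of the round trip in isolation (illustrative values):

    import pandas

    s = pandas.Series([1, None, 3], dtype="Int64")
    as_float = s.astype("float64")       # what pyarrow actually ingests
    restored = as_float.astype("Int64")  # what the caller sees afterwards
    assert restored.equals(s)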
