Commit e0356bb

Add first load tests

1 parent 32b27e6 commit e0356bb

File tree

8 files changed, +515 −72 lines

awswrangler/aurora.py

Lines changed: 166 additions & 2 deletions
@@ -1,17 +1,21 @@
-from typing import Union
+from typing import Union, List, Dict, Tuple, Any
 import logging
+import json
 
 import pg8000  # type: ignore
 import pymysql  # type: ignore
+import pandas as pd  # type: ignore
 
-from awswrangler.exceptions import InvalidEngine
+from awswrangler import data_types
+from awswrangler.exceptions import InvalidEngine, InvalidDataframeType, AuroraLoadError
 
 logger = logging.getLogger(__name__)
 
 
 class Aurora:
     def __init__(self, session):
         self._session = session
+        self._client_s3 = session.boto3_session.client(service_name="s3", use_ssl=True, config=session.botocore_config)
 
     @staticmethod
     def _validate_connection(database: str,
@@ -101,3 +105,163 @@ def generate_connection(database: str,
         else:
             raise InvalidEngine(f"{engine} is not a valid engine. Please use 'mysql' or 'postgres'!")
         return conn
+
+    def write_load_manifest(self, manifest_path: str,
+                            objects_paths: List[str]) -> Dict[str, List[Dict[str, Union[str, bool]]]]:
+        manifest: Dict[str, List[Dict[str, Union[str, bool]]]] = {"entries": []}
+        path: str
+        for path in objects_paths:
+            entry: Dict[str, Union[str, bool]] = {"url": path, "mandatory": True}
+            manifest["entries"].append(entry)
+        payload: str = json.dumps(manifest)
+        bucket: str
+        bucket, path = manifest_path.replace("s3://", "").split("/", 1)
+        logger.info(f"payload: {payload}")
+        self._client_s3.put_object(Body=payload, Bucket=bucket, Key=path)
+        return manifest
+
+    @staticmethod
+    def load_table(dataframe: pd.DataFrame,
+                   dataframe_type: str,
+                   load_paths: List[str],
+                   schema_name: str,
+                   table_name: str,
+                   connection: Any,
+                   num_files,
+                   mode: str = "append",
+                   preserve_index: bool = False,
+                   engine: str = "mysql",
+                   region: str = "us-east-1"):
+        """
+        Load text/CSV files into an Aurora table using a manifest file.
+        Creates the table if necessary.
+
+        :param dataframe: Pandas or Spark Dataframe
+        :param dataframe_type: "pandas" or "spark"
+        :param load_paths: S3 paths to be loaded (e.g. s3://...)
+        :param schema_name: Aurora schema
+        :param table_name: Aurora table name
+        :param connection: A PEP 249 compatible connection (Can be generated with Aurora.generate_connection())
+        :param num_files: Number of files to be loaded
+        :param mode: append or overwrite
+        :param preserve_index: Should we preserve the Dataframe index? (ONLY for Pandas Dataframe)
+        :param engine: "mysql" or "postgres"
+        :param region: AWS S3 bucket region (Required only for postgres engine)
+        :return: None
+        """
+        with connection.cursor() as cursor:
+            if mode == "overwrite":
+                Aurora._create_table(cursor=cursor,
+                                     dataframe=dataframe,
+                                     dataframe_type=dataframe_type,
+                                     schema_name=schema_name,
+                                     table_name=table_name,
+                                     preserve_index=preserve_index,
+                                     engine=engine)
+
+            for path in load_paths:
+                sql = Aurora._get_load_sql(path=path,
+                                           schema_name=schema_name,
+                                           table_name=table_name,
+                                           engine=engine,
+                                           region=region)
+                logger.debug(sql)
+                cursor.execute(sql)
+
+            if "mysql" in engine.lower():
+                sql = ("-- AWS DATA WRANGLER\n"
+                       f"SELECT COUNT(*) as num_files_loaded FROM mysql.aurora_s3_load_history "
+                       f"WHERE load_prefix = '{path}'")
+                logger.debug(sql)
+                cursor.execute(sql)
+                num_files_loaded = cursor.fetchall()[0][0]
+                if num_files_loaded != (num_files + 1):
+                    connection.rollback()
+                    raise AuroraLoadError(
+                        f"Aurora load rolled back. {num_files_loaded} files counted. {num_files} expected.")
+
+        connection.commit()
+        logger.debug("Load committed.")
+
+    @staticmethod
+    def _parse_path(path):
+        path2 = path.replace("s3://", "")
+        parts = path2.partition("/")
+        return parts[0], parts[2]
+
+    @staticmethod
+    def _get_load_sql(path: str, schema_name: str, table_name: str, engine: str, region: str = "us-east-1") -> str:
+        if "postgres" in engine.lower():
+            bucket, key = Aurora._parse_path(path=path)
+            sql: str = ("-- AWS DATA WRANGLER\n"
+                        "SELECT aws_s3.table_import_from_s3(\n"
+                        f"'{schema_name}.{table_name}',\n"
+                        "'',\n"
+                        "'(FORMAT CSV, DELIMITER '','', QUOTE ''\"'', ESCAPE ''\\'')',\n"
+                        f"'({bucket},{key},{region})')")
+        elif "mysql" in engine.lower():
+            sql = ("-- AWS DATA WRANGLER\n"
+                   f"LOAD DATA FROM S3 MANIFEST '{path}'\n"
+                   "REPLACE\n"
+                   f"INTO TABLE {schema_name}.{table_name}\n"
+                   "FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ESCAPED BY '\\\\'\n"
+                   "LINES TERMINATED BY '\\n'")
+        else:
+            raise InvalidEngine(f"{engine} is not a valid engine. Please use 'mysql' or 'postgres'!")
+        return sql
+
+    @staticmethod
+    def _create_table(cursor,
+                      dataframe,
+                      dataframe_type,
+                      schema_name,
+                      table_name,
+                      preserve_index=False,
+                      engine: str = "mysql"):
+        """
+        Creates an Aurora table.
+
+        :param cursor: A PEP 249 compatible cursor
+        :param dataframe: Pandas or Spark Dataframe
+        :param dataframe_type: "pandas" or "spark"
+        :param schema_name: Aurora schema
+        :param table_name: Aurora table name
+        :param preserve_index: Should we preserve the Dataframe index? (ONLY for Pandas Dataframe)
+        :param engine: "mysql" or "postgres"
+        :return: None
+        """
+        sql: str = f"-- AWS DATA WRANGLER\n" \
+                   f"DROP TABLE IF EXISTS {schema_name}.{table_name}"
+        logger.debug(f"Drop table query:\n{sql}")
+        cursor.execute(sql)
+        schema = Aurora._get_schema(dataframe=dataframe,
+                                    dataframe_type=dataframe_type,
+                                    preserve_index=preserve_index,
+                                    engine=engine)
+        cols_str: str = "".join([f"{col[0]} {col[1]},\n" for col in schema])[:-2]
+        sql = (f"-- AWS DATA WRANGLER\n" f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} (\n" f"{cols_str})")
+        logger.debug(f"Create table query:\n{sql}")
+        cursor.execute(sql)
+
+    @staticmethod
+    def _get_schema(dataframe,
+                    dataframe_type: str,
+                    preserve_index: bool,
+                    engine: str = "mysql") -> List[Tuple[str, str]]:
+        schema_built: List[Tuple[str, str]] = []
+        if "postgres" in engine.lower():
+            convert_func = data_types.pyarrow2postgres
+        elif "mysql" in engine.lower():
+            convert_func = data_types.pyarrow2mysql
+        else:
+            raise InvalidEngine(f"{engine} is not a valid engine. Please use 'mysql' or 'postgres'!")
+        if dataframe_type.lower() == "pandas":
+            pyarrow_schema: List[Tuple[str, str]] = data_types.extract_pyarrow_schema_from_pandas(
+                dataframe=dataframe, preserve_index=preserve_index, indexes_position="right")
+            for name, dtype in pyarrow_schema:
+                aurora_type: str = convert_func(dtype)
+                schema_built.append((name, aurora_type))
+        else:
+            raise InvalidDataframeType(f"{dataframe_type} is not a valid DataFrame type. Please use 'pandas'!")
+        return schema_built
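
For reference, the manifest that write_load_manifest uploads is the same layout Aurora MySQL's LOAD DATA FROM S3 MANIFEST statement consumes. A minimal sketch of the payload, using hypothetical object paths (any s3:// URLs written by to_csv would do):

import json

# Hypothetical object paths for illustration only.
objects_paths = [
    "s3://my-bucket/temp_aurora_abc/0.csv",
    "s3://my-bucket/temp_aurora_abc/1.csv",
]

# Same structure Aurora.write_load_manifest builds: one entry per object,
# each marked mandatory so a missing file aborts the load.
manifest = {"entries": [{"url": p, "mandatory": True} for p in objects_paths]}
print(json.dumps(manifest))
# {"entries": [{"url": "s3://my-bucket/temp_aurora_abc/0.csv", "mandatory": true}, ...]}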
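
The two SQL shapes produced by _get_load_sql can be previewed directly; a sketch with hypothetical schema, table, and bucket names (_get_load_sql is a private static method, so calling it this way is illustration only). Note that on the MySQL path, load_table afterwards counts rows in mysql.aurora_s3_load_history for the manifest's prefix and expects num_files + 1, apparently counting the manifest object alongside the data files.

from awswrangler.aurora import Aurora

# Hypothetical names; demonstrates the MySQL manifest-based statement.
print(Aurora._get_load_sql(path="s3://my-bucket/temp/manifest.json",
                           schema_name="test",
                           table_name="load",
                           engine="mysql"))
# -- AWS DATA WRANGLER
# LOAD DATA FROM S3 MANIFEST 's3://my-bucket/temp/manifest.json'
# REPLACE
# INTO TABLE test.load
# ...

# And the PostgreSQL aws_s3 extension call, one object at a time.
print(Aurora._get_load_sql(path="s3://my-bucket/temp/0.csv",
                           schema_name="test",
                           table_name="load",
                           engine="postgres",
                           region="us-east-1"))
# -- AWS DATA WRANGLER
# SELECT aws_s3.table_import_from_s3(
# 'test.load',
# ...
# '(my-bucket,temp/0.csv,us-east-1)')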

awswrangler/data_types.py

Lines changed: 52 additions & 0 deletions
@@ -203,6 +203,58 @@ def pyarrow2redshift(dtype: pa.types) -> str:
         raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
 
 
+def pyarrow2postgres(dtype: pa.types) -> str:
+    dtype_str = str(dtype).lower()
+    if dtype_str == "int16":
+        return "SMALLINT"
+    elif dtype_str == "int32":
+        return "INT"
+    elif dtype_str == "int64":
+        return "BIGINT"
+    elif dtype_str == "float":
+        return "FLOAT4"
+    elif dtype_str == "double":
+        return "FLOAT8"
+    elif dtype_str == "bool":
+        return "BOOLEAN"
+    elif dtype_str == "string":
+        return "VARCHAR(256)"
+    elif dtype_str.startswith("timestamp"):
+        return "TIMESTAMP"
+    elif dtype_str.startswith("date"):
+        return "DATE"
+    elif dtype_str.startswith("decimal"):
+        return dtype_str.replace(" ", "").upper()
+    else:
+        raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
+
+
+def pyarrow2mysql(dtype: pa.types) -> str:
+    dtype_str = str(dtype).lower()
+    if dtype_str == "int16":
+        return "SMALLINT"
+    elif dtype_str == "int32":
+        return "INT"
+    elif dtype_str == "int64":
+        return "BIGINT"
+    elif dtype_str == "float":
+        return "FLOAT"
+    elif dtype_str == "double":
+        return "DOUBLE"
+    elif dtype_str == "bool":
+        return "BOOLEAN"
+    elif dtype_str == "string":
+        return "VARCHAR(256)"
+    elif dtype_str.startswith("timestamp"):
+        return "TIMESTAMP"
+    elif dtype_str.startswith("date"):
+        return "DATE"
+    elif dtype_str.startswith("decimal"):
+        return dtype_str.replace(" ", "").upper()
+    else:
+        raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
+
+
 def python2athena(python_type: type) -> str:
     python_type_str: str = str(python_type)
     if python_type_str == "<class 'int'>":
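
Since the module already depends on pyarrow (imported as pa), the new mappings can be sanity-checked in a few lines. A sketch; the decimal case is omitted because its string form varies across pyarrow releases:

import pyarrow as pa

from awswrangler.data_types import pyarrow2mysql, pyarrow2postgres

# Both helpers dispatch on the lowercased string form of the pyarrow type.
for dtype in [pa.int32(), pa.int64(), pa.float64(), pa.bool_(), pa.string(), pa.timestamp("ns")]:
    print(f"{dtype}: mysql={pyarrow2mysql(dtype)} postgres={pyarrow2postgres(dtype)}")
# e.g. double: mysql=DOUBLE postgres=FLOAT8
#      string: mysql=VARCHAR(256) postgres=VARCHAR(256)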

awswrangler/exceptions.py

Lines changed: 4 additions & 0 deletions
@@ -26,6 +26,10 @@ class RedshiftLoadError(Exception):
     pass
 
 
+class AuroraLoadError(Exception):
+    pass
+
+
 class AthenaQueryError(Exception):
     pass
 

awswrangler/pandas.py

Lines changed: 79 additions & 3 deletions
@@ -19,10 +19,11 @@
 from awswrangler import data_types
 from awswrangler.exceptions import (UnsupportedWriteMode, UnsupportedFileFormat, AthenaQueryError, EmptyS3Object,
                                     LineTerminatorNotFound, EmptyDataframe, InvalidSerDe, InvalidCompression,
-                                    InvalidParameters)
+                                    InvalidParameters, InvalidEngine)
 from awswrangler.utils import calculate_bounders
 from awswrangler import s3
 from awswrangler.athena import Athena
+from awswrangler.aurora import Aurora
 
 logger = logging.getLogger(__name__)
 
@@ -834,9 +835,9 @@ def data_to_s3(self,
                    procs_io_bound=None,
                    cast_columns=None,
                    extra_args=None):
-        if not procs_cpu_bound:
+        if procs_cpu_bound is None:
             procs_cpu_bound = self._session.procs_cpu_bound
-        if not procs_io_bound:
+        if procs_io_bound is None:
             procs_io_bound = self._session.procs_io_bound
         logger.debug(f"procs_cpu_bound: {procs_cpu_bound}")
         logger.debug(f"procs_io_bound: {procs_io_bound}")
@@ -1473,3 +1474,78 @@ def read_sql_aurora(self,
         else:
             self._session.s3.delete_objects(path=temp_s3_path)
             raise e
+
+    def to_aurora(self,
+                  dataframe: pd.DataFrame,
+                  connection: Any,
+                  schema: str,
+                  table: str,
+                  engine: str = "mysql",
+                  temp_s3_path: Optional[str] = None,
+                  preserve_index: bool = False,
+                  mode: str = "append",
+                  procs_cpu_bound: Optional[int] = None,
+                  procs_io_bound: Optional[int] = None,
+                  inplace=True) -> None:
+        """
+        Load a Pandas Dataframe as a table on Aurora.
+
+        :param dataframe: Pandas Dataframe
+        :param connection: A PEP 249 compatible connection (Can be generated with Aurora.generate_connection())
+        :param schema: The Aurora schema for the table
+        :param table: The name of the desired Aurora table
+        :param engine: "mysql" or "postgres"
+        :param temp_s3_path: S3 path to write temporary files (e.g. s3://BUCKET_NAME/ANY_NAME/)
+        :param preserve_index: Should we preserve the Dataframe index?
+        :param mode: append or overwrite
+        :param procs_cpu_bound: Number of cores used for CPU bound tasks
+        :param procs_io_bound: Number of cores used for I/O bound tasks
+        :param inplace: True is cheapest (CPU and Memory) but False leaves your DataFrame intact
+        :return: None
+        """
+        if temp_s3_path is None:
+            if self._session.aurora_temp_s3_path is not None:
+                temp_s3_path = self._session.aurora_temp_s3_path
+            else:
+                guid: str = pa.compat.guid()
+                temp_directory = f"temp_aurora_{guid}"
+                temp_s3_path = self._session.athena.create_athena_bucket() + temp_directory + "/"
+        temp_s3_path = temp_s3_path if temp_s3_path[-1] == "/" else temp_s3_path + "/"
+        logger.debug(f"temp_s3_path: {temp_s3_path}")
+
+        paths: List[str] = self.to_csv(dataframe=dataframe,
+                                       path=temp_s3_path,
+                                       sep=",",
+                                       preserve_index=preserve_index,
+                                       mode="overwrite",
+                                       procs_cpu_bound=procs_cpu_bound,
+                                       procs_io_bound=procs_io_bound,
+                                       inplace=inplace)
+
+        load_paths: List[str]
+        region: str = "us-east-1"
+        if "postgres" in engine.lower():
+            load_paths = paths.copy()
+            bucket, _ = Pandas._parse_path(path=load_paths[0])
+            region = self._session.s3.get_bucket_region(bucket=bucket)
+        elif "mysql" in engine.lower():
+            manifest_path: str = f"{temp_s3_path}manifest_{pa.compat.guid()}.json"
+            self._session.aurora.write_load_manifest(manifest_path=manifest_path, objects_paths=paths)
+            load_paths = [manifest_path]
+        else:
+            raise InvalidEngine(f"{engine} is not a valid engine. Please use 'mysql' or 'postgres'!")
+        logger.debug(f"load_paths: {load_paths}")
+
+        Aurora.load_table(dataframe=dataframe,
+                          dataframe_type="pandas",
+                          load_paths=load_paths,
+                          schema_name=schema,
+                          table_name=table,
+                          connection=connection,
+                          num_files=len(paths),
+                          mode=mode,
+                          preserve_index=preserve_index,
+                          engine=engine,
+                          region=region)
+
+        self._session.s3.delete_objects(path=temp_s3_path, procs_io_bound=procs_io_bound)
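
End to end, the new API is meant to be driven from a Session. A minimal sketch, assuming an Aurora MySQL cluster is reachable from the caller; generate_connection's host/port/user/password parameters are inferred for illustration, since only the database and engine arguments appear in this diff:

import pandas as pd

import awswrangler
from awswrangler.aurora import Aurora

# Connection parameters beyond database/engine are assumptions for this sketch.
conn = Aurora.generate_connection(database="test",
                                  host="my-cluster.cluster-xxxx.us-east-1.rds.amazonaws.com",
                                  port=3306,
                                  user="admin",
                                  password="my-password",
                                  engine="mysql")

session = awswrangler.Session()
df = pd.DataFrame({"id": [1, 2], "name": ["foo", "boo"]})
session.pandas.to_aurora(dataframe=df,
                         connection=conn,
                         schema="test",
                         table="load",
                         engine="mysql",
                         mode="overwrite")  # stages CSVs on S3, then loads via manifest
conn.close()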
