
Commit 80b4653: Update docs

1 parent 85a1fce

File tree: 8 files changed (+165, -70 lines)


README.md

Lines changed: 48 additions & 5 deletions
@@ -27,13 +27,14 @@
 * Pandas -> Glue Catalog Table
 * Pandas -> Athena (Parallel)
 * Pandas -> Redshift (Append/Overwrite/Upsert) (Parallel)
-* Parquet (S3) -> Pandas (Parallel) (NEW :star:)
+* Pandas -> Aurora (MySQL/PostgreSQL) (Append/Overwrite) (Via S3) (NEW :star:)
+* Parquet (S3) -> Pandas (Parallel)
 * CSV (S3) -> Pandas (One shot or Batching)
-* Glue Catalog Table -> Pandas (Parallel) (NEW :star:)
-* Athena -> Pandas (One shot, Batching or Parallel (NEW :star:))
-* Redshift -> Pandas (Parallel) (NEW :star:)
-* Redshift -> Parquet (S3) (NEW :star:)
+* Glue Catalog Table -> Pandas (Parallel)
+* Athena -> Pandas (One shot, Batching or Parallel)
+* Redshift -> Pandas (Parallel)
 * CloudWatch Logs Insights -> Pandas
+* Aurora -> Pandas (MySQL) (Via S3) (NEW :star:)
 * Encrypt Pandas Dataframes on S3 with KMS keys

 ### PySpark

@@ -60,6 +61,8 @@
 * Get EMR step state
 * Athena query to receive the result as python primitives (*Iterable[Dict[str, Any]]*)
 * Load and Unzip SageMaker jobs outputs
+* Redshift -> Parquet (S3)
+* Aurora -> CSV (S3) (MySQL) (NEW :star:)

 ## Installation

@@ -147,6 +150,22 @@ df = sess.pandas.read_sql_athena(
 )
 ```

+#### Reading from Glue Catalog (Parquet) to Pandas
+
+```py3
+import awswrangler as wr
+
+df = wr.pandas.read_table(database="DATABASE_NAME", table="TABLE_NAME")
+```
+
+#### Reading from S3 (Parquet) to Pandas
+
+```py3
+import awswrangler as wr
+
+df = wr.pandas.read_parquet(path="s3://...", columns=["c1", "c3"], filters=[("c5", "=", 0)])
+```
+
 #### Reading from S3 (CSV) to Pandas

 ```py3

@@ -227,6 +246,30 @@ df = wr.pandas.read_sql_redshift(
     temp_s3_path="s3://temp_path")
 ```

+#### Loading Pandas Dataframe to Aurora (MySQL/PostgreSQL)
+
+```py3
+import awswrangler as wr
+
+wr.pandas.to_aurora(
+    dataframe=df,
+    connection=con,
+    schema="...",
+    table="..."
+)
+```
+
+#### Extract Aurora query to Pandas DataFrame (MySQL)
+
+```py3
+import awswrangler as wr
+
+df = wr.pandas.read_sql_aurora(
+    sql="SELECT ...",
+    connection=con
+)
+```
+
 ### PySpark

 #### Loading PySpark Dataframe to Redshift
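
Note: the new `read_table` example above passes only `database` and `table`; the signature updated in `awswrangler/pandas.py` (next file in this commit) also exposes `columns`, `filters`, and `procs_cpu_bound`. A minimal sketch combining them; the column names, filter, and process count are illustrative and not part of the commit:

```py3
import awswrangler as wr

# Hypothetical Glue table: fetch only two columns, keep rows where c5 == 0,
# and cap the number of reader processes at 4
df = wr.pandas.read_table(
    database="DATABASE_NAME",
    table="TABLE_NAME",
    columns=["c1", "c3"],
    filters=[("c5", "=", 0)],
    procs_cpu_bound=4
)
```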

awswrangler/pandas.py

Lines changed: 8 additions & 7 deletions
@@ -1211,7 +1211,7 @@ def drop_duplicated_columns(dataframe: pd.DataFrame, inplace: bool = True) -> pd
     def read_parquet(self,
                      path: Union[str, List[str]],
                      columns: Optional[List[str]] = None,
-                     filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
+                     filters: Optional[Union[List[Tuple[Any]], List[List[Tuple[Any]]]]] = None,
                      procs_cpu_bound: Optional[int] = None) -> pd.DataFrame:
         """
         Read parquet data from S3

@@ -1274,7 +1274,7 @@ def _read_parquet_paths_remote(send_pipe: mp.connection.Connection,
                                    session_primitives: Any,
                                    path: Union[str, List[str]],
                                    columns: Optional[List[str]] = None,
-                                   filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
+                                   filters: Optional[Union[List[Tuple[Any]], List[List[Tuple[Any]]]]] = None,
                                    procs_cpu_bound: Optional[int] = None):
         df: pd.DataFrame = Pandas._read_parquet_paths(session_primitives=session_primitives,
                                                       path=path,

@@ -1288,7 +1288,7 @@ def _read_parquet_paths_remote(send_pipe: mp.connection.Connection,
     def _read_parquet_paths(session_primitives: Any,
                             path: Union[str, List[str]],
                             columns: Optional[List[str]] = None,
-                            filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
+                            filters: Optional[Union[List[Tuple[Any]], List[List[Tuple[Any]]]]] = None,
                             procs_cpu_bound: Optional[int] = None) -> pd.DataFrame:
         """
         Read parquet data from S3

@@ -1327,7 +1327,7 @@ def _read_parquet_paths(session_primitives: Any,
     def _read_parquet_path(session_primitives: Any,
                            path: str,
                            columns: Optional[List[str]] = None,
-                           filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
+                           filters: Optional[Union[List[Tuple[Any]], List[List[Tuple[Any]]]]] = None,
                            procs_cpu_bound: Optional[int] = None) -> pd.DataFrame:
         """
         Read parquet data from S3

@@ -1369,7 +1369,7 @@ def read_table(self,
                    database: str,
                    table: str,
                    columns: Optional[List[str]] = None,
-                   filters: Optional[Union[List[Tuple[Any]], List[Tuple[Any]]]] = None,
+                   filters: Optional[Union[List[Tuple[Any]], List[List[Tuple[Any]]]]] = None,
                    procs_cpu_bound: Optional[int] = None) -> pd.DataFrame:
         """
         Read PARQUET table from S3 using the Glue Catalog location skipping Athena's necessity

@@ -1408,6 +1408,7 @@ def read_sql_redshift(self,
         temp_s3_path = temp_s3_path[:-1] if temp_s3_path[-1] == "/" else temp_s3_path
         temp_s3_path = f"{temp_s3_path}/{name}"
         logger.debug(f"temp_s3_path: {temp_s3_path}")
+        self._session.s3.delete_objects(path=temp_s3_path)
         paths: Optional[List[str]] = None
         try:
             paths = self._session.redshift.to_parquet(sql=sql,

@@ -1416,11 +1417,11 @@ def read_sql_redshift(self,
                                                       connection=connection)
             logger.debug(f"paths: {paths}")
             df: pd.DataFrame = self.read_parquet(path=paths, procs_cpu_bound=procs_cpu_bound)  # type: ignore
-            self._session.s3.delete_listed_objects(objects_paths=paths)
+            self._session.s3.delete_listed_objects(objects_paths=paths + [temp_s3_path + "/manifest"])  # type: ignore
             return df
         except Exception as e:
             if paths is not None:
-                self._session.s3.delete_listed_objects(objects_paths=paths)
+                self._session.s3.delete_listed_objects(objects_paths=paths + [temp_s3_path + "/manifest"])
             else:
                 self._session.s3.delete_objects(path=temp_s3_path)
             raise e
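
The signature change above widens `filters` from a flat list of tuples to also accept a list of lists of tuples. Assuming the value is passed through to PyArrow's Parquet reader (which treats a flat list as an AND of predicates and a list of lists as an OR of AND groups), a minimal sketch with hypothetical partition columns:

```py3
import awswrangler as wr

# Flat list: year == 2019 AND month == 10
df1 = wr.pandas.read_parquet(
    path="s3://...",
    filters=[("year", "=", 2019), ("month", "=", 10)]
)

# Nested list (now covered by the type hint): (year == 2019 AND month == 10) OR (year == 2019 AND month == 11)
df2 = wr.pandas.read_parquet(
    path="s3://...",
    filters=[
        [("year", "=", 2019), ("month", "=", 10)],
        [("year", "=", 2019), ("month", "=", 11)],
    ]
)
```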

building/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ RUN yum install -y \
     bison \
     flex \
     autoconf \
-    python37-devel
+    python36-devel
 
 RUN pip3 install --upgrade pip
 
building/build-lambda-layer.sh

Lines changed: 4 additions & 7 deletions
@@ -1,18 +1,15 @@
 #!/usr/bin/env bash
 set -e
 
-
 # Go back to AWSWRANGLER directory
 cd /aws-data-wrangler/
 
 rm -rf dist/*.zip
 
-# Build PyArrow files if necessary
-if [ ! -d "dist/pyarrow_files" ] ; then
-  cd building
-  ./build-pyarrow.sh
-  cd ..
-fi
+# Build PyArrow files
+cd building
+./build-pyarrow.sh
+cd ..
 
 # Preparing directories
 mkdir -p dist

building/build-pyarrow.sh

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@ rm -rf \
     dist \
     /aws-data-wrangler/dist/pyarrow_wheels \
     /aws-data-wrangler/dist/pyarrow_files \
-    /aws-data-wrangler/dist/pyarrow_wheels/
 
 # Clone desired Arrow version
 git clone \

docs/source/examples.rst

Lines changed: 43 additions & 0 deletions
@@ -83,6 +83,23 @@ Reading from AWS Athena to Pandas with the blazing fast CTAS approach
         database="database"
     )
 
+Reading from Glue Catalog (Parquet) to Pandas
+`````````````````````````````````````````````
+
+.. code-block:: python
+
+    import awswrangler as wr
+
+    df = wr.pandas.read_table(database="DATABASE_NAME", table="TABLE_NAME")
+
+Reading from S3 (Parquet) to Pandas
+```````````````````````````````````
+
+.. code-block:: python
+
+    import awswrangler as wr
+
+    df = wr.pandas.read_parquet(path="s3://...", columns=["c1", "c3"], filters=[("c5", "=", 0)])
 
 Reading from S3 (CSV) to Pandas
 ```````````````````````````````

@@ -174,6 +191,32 @@ Extract Redshift query to Pandas DataFrame
         connection=con,
         temp_s3_path="s3://temp_path")
 
+Loading Pandas Dataframe to Aurora (MySQL/PostgreSQL)
+`````````````````````````````````````````````````````
+
+.. code-block:: python
+
+    import awswrangler as wr
+
+    wr.pandas.to_aurora(
+        dataframe=df,
+        connection=con,
+        schema="...",
+        table="..."
+    )
+
+
+Extract Aurora query to Pandas DataFrame (MySQL)
+````````````````````````````````````````````````
+
+.. code-block:: python
+
+    import awswrangler as wr
+
+    df = wr.pandas.read_sql_aurora(
+        sql="SELECT ...",
+        connection=con
+    )
 
 PySpark
 -------

docs/source/index.rst

Lines changed: 11 additions & 7 deletions
@@ -20,13 +20,14 @@ Pandas
 * Pandas -> Glue Catalog Table
 * Pandas -> Athena (Parallel)
 * Pandas -> Redshift (Append/Overwrite/Upsert) (Parallel)
+* Pandas -> Aurora (MySQL/PostgreSQL) (Append/Overwrite) (Via S3) (NEW)
 * Parquet (S3) -> Pandas (Parallel)
 * CSV (S3) -> Pandas (One shot or Batching)
 * Glue Catalog Table -> Pandas (Parallel)
 * Athena -> Pandas (One shot, Batching or Parallel)
 * Redshift -> Pandas (Parallel)
-* Redshift -> Parquet (S3)
 * CloudWatch Logs Insights -> Pandas
+* Aurora -> Pandas (MySQL) (Via S3) (NEW)
 * Encrypt Pandas Dataframes on S3 with KMS keys
 
 PySpark

@@ -45,13 +46,16 @@ General
 * Get the size of S3 objects (Parallel)
 * Get CloudWatch Logs Insights query results
 * Load partitions on Athena/Glue table (repair table)
-* Create EMR cluster (For humans) (NEW)
-* Terminate EMR cluster (NEW)
-* Get EMR cluster state (NEW)
-* Submit EMR step(s) (For humans) (NEW)
-* Get EMR step state (NEW)
-* Athena query to receive the result as python primitives (Iterable[Dict[str, Any]) (NEW)
+* Create EMR cluster (For humans)
+* Terminate EMR cluster
+* Get EMR cluster state
+* Submit EMR step(s) (For humans)
+* Get EMR step state
+* Get EMR step state
+* Athena query to receive the result as python primitives (*Iterable[Dict[str, Any]]*)
 * Load and Unzip SageMaker jobs outputs
+* Redshift -> Parquet (S3)
+* Aurora -> CSV (S3) (MySQL) (NEW :star:)
 
 
 Table Of Contents
