
Commit f46f24f

Adding S3 Select tutorial and small fixes (#748)
* Adding S3 Select tutorial and small fixes
* Minor - Reviewed tutorial
* Minor - Fix mypy
* Minor - Testing install types for mypy

1 parent 790f8cc commit f46f24f

File tree

6 files changed: +238 -11 lines


.github/workflows/static-checking.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -29,7 +29,7 @@ jobs:
           python -m pip install --upgrade pip
           pip install -U -r requirements-dev.txt
       - name: mypy check
-        run: mypy awswrangler
+        run: yes y | mypy --install-types awswrangler
       - name: Flake8 Lint
         run: flake8 .
       - name: Pylint Lint
```
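
This change tracks the mypy bump to 0.902 in requirements-dev.txt: from the 0.900 series onward, mypy no longer bundles third-party library stubs, so a bare `mypy awswrangler` fails until the missing stub packages are installed. `mypy --install-types` downloads them but asks for confirmation, and piping `yes y` answers that prompt automatically so the CI job stays non-interactive (later mypy releases added a `--non-interactive` flag for the same purpose).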

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,7 +16,7 @@ Easy integration with Athena, Glue, Redshift, Timestream, QuickSight, Chime, Clo
 [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
 [![Coverage](https://img.shields.io/badge/coverage-91%25-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 ![Static Checking](https://github.com/awslabs/aws-data-wrangler/workflows/Static%20Checking/badge.svg?branch=main)
-![Build Status](https://codebuild.us-east-1.amazonaws.com/badges?uuid=eyJlbmNyeXB0ZWREYXRhIjoiaHFXUk91Wks2MGFRMSsxM1R3ZFVjVGg3d0dCT05YTVpoL0VRakRieG43Y3dhdytYZjZtdFVBdG5Sek44anlweDFkM2Z2TWJibVRCRVB5TjlWSnhTdzRBPSIsIml2UGFyYW1ldGVyU3BlYyI6IjUzUllpN295VTUxeFNPQWQiLCJtYXRlcmlhbFNldFNlcmlhbCI6MX0%3D&branch=main)
+![Build Status](https://codebuild.us-east-1.amazonaws.com/badges?uuid=eyJlbmNyeXB0ZWREYXRhIjoiL05FNGxiZCtNT05ibGUrbzY5TzJxaFlOcnFrUFlyNjhWRm5tTmg1bXJXRkdnYUFySzgycEUvMTBBbWxEUzZ2eUpOdjVpcmNQV2hsNkRzQTZtTTVwSjF3PSIsIml2UGFyYW1ldGVyU3BlYyI6IkQ1RVkxWjg5YloyaTJOcVgiLCJtYXRlcmlhbFNldFNlcmlhbCI6MX0%3D&branch=main)
 [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/?badge=latest)
 
 | Source | Downloads | Installation Command |
```

requirements-dev.txt

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,9 +1,9 @@
 wheel==0.36.2
 isort==5.8.0
-black==21.5b2
+black==21.6b0
 pylint==2.8.3
 flake8==3.9.2
-mypy==0.812
+mypy==0.902
 pydocstyle==6.1.1
 doc8==0.8.1
 tox==3.23.1
@@ -20,7 +20,7 @@ sphinx_bootstrap_theme==0.7.1
 nbsphinx==0.8.6
 nbsphinx-link==1.3.0
 IPython~=7.16
-moto==2.0.8
+moto==2.0.9
 jupyterlab==3.0.16
 s3fs==0.4.2 # keep it at 0.4.2
 python-Levenshtein==0.12.2
```

tests/test_athena_parquet.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -97,15 +97,15 @@ def test_parquet_catalog_duplicated(path, glue_table, glue_database):
     )
 
 
-def test_parquet_catalog_casting(path, glue_database):
+def test_parquet_catalog_casting(path, glue_database, glue_table):
     wr.s3.to_parquet(
         df=get_df_cast(),
         path=path,
         index=False,
         dataset=True,
         mode="overwrite",
         database=glue_database,
-        table="__test_parquet_catalog_casting",
+        table=glue_table,
         dtype={
             "iint8": "tinyint",
             "iint16": "smallint",
@@ -127,14 +127,14 @@ def test_parquet_catalog_casting(path, glue_database):
     df = wr.s3.read_parquet(path=path)
     assert df.shape == (3, 16)
     ensure_data_types(df=df, has_list=False)
-    df = wr.athena.read_sql_table(table="__test_parquet_catalog_casting", database=glue_database, ctas_approach=True)
+    df = wr.athena.read_sql_table(table=glue_table, database=glue_database, ctas_approach=True)
     assert df.shape == (3, 16)
     ensure_data_types(df=df, has_list=False)
-    df = wr.athena.read_sql_table(table="__test_parquet_catalog_casting", database=glue_database, ctas_approach=False)
+    df = wr.athena.read_sql_table(table=glue_table, database=glue_database, ctas_approach=False)
     assert df.shape == (3, 16)
     ensure_data_types(df=df, has_list=False)
     wr.s3.delete_objects(path=path)
-    assert wr.catalog.delete_table_if_exists(database=glue_database, table="__test_parquet_catalog_casting") is True
+    assert wr.catalog.delete_table_if_exists(database=glue_database, table=glue_table) is True
 
 
 def test_parquet_catalog_casting_to_string_with_null(path, glue_table, glue_database):
```
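
Replacing the hard-coded `__test_parquet_catalog_casting` name with the shared `glue_table` fixture gives every run a unique table and centralizes cleanup. As a rough sketch only (the real fixture lives in the test suite's conftest.py, and its exact shape is an assumption here), such a fixture could look like:

```python
# Hypothetical glue_table-style pytest fixture; the actual implementation
# lives in the repository's conftest.py and may differ.
import uuid

import pytest

import awswrangler as wr


@pytest.fixture(scope="function")
def glue_table(glue_database: str):
    name = f"tbl_{uuid.uuid4().hex}"  # unique per test, so runs never collide
    yield name
    # Drop the table even when the test fails before its own cleanup.
    wr.catalog.delete_table_if_exists(database=glue_database, table=name)
```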

tutorials/029 - S3 Select.ipynb

Lines changed: 227 additions & 0 deletions
@@ -0,0 +1,227 @@

[![AWS Data Wrangler](_static/logo.png "AWS Data Wrangler")](https://github.com/awslabs/aws-data-wrangler)

# 29 - S3 Select

AWS Data Wrangler supports [Amazon S3 Select](https://aws.amazon.com/blogs/aws/s3-glacier-select/), enabling applications to use SQL statements to query and filter the contents of a single S3 object. It works on objects stored in CSV, JSON, or Apache Parquet format, including compressed files and files as large as several TBs.

With S3 Select, the query workload is delegated to Amazon S3, leading to lower latency and cost and to higher performance (up to a 400% improvement) than other Wrangler operations such as `read_parquet`, where the whole S3 object is downloaded and filtered on the client side.

This feature has a number of limitations, however, and should be used only in specific scenarios:

* It operates on a single S3 object
* The maximum length of a record in the input or result is 1 MB
* The maximum uncompressed row-group size is 256 MB (Parquet only)
* It can only emit nested data in JSON format
* Certain SQL operations are not supported (e.g. ORDER BY)
## Read full CSV file

```python
import awswrangler as wr

df = wr.s3.select_query(
    sql="SELECT * FROM s3object",
    path="s3://nyc-tlc/trip data/fhv_tripdata_2019-09.csv",  # 58 MB
    input_serialization="CSV",
    input_serialization_params={
        "FileHeaderInfo": "Use",
        "RecordDelimiter": "\r\n",
    },
    use_threads=True,
)
df.head()
```

Output:

```
  dispatching_base_num      pickup_datetime     dropoff_datetime  PULocationID  DOLocationID SR_Flag
0               B00009  2019-09-01 00:35:00  2019-09-01 00:59:00           264           264
1               B00009  2019-09-01 00:48:00  2019-09-01 01:09:00           264           264
2               B00014  2019-09-01 00:16:18  2019-09-02 00:35:37           264           264
3               B00014  2019-09-01 00:55:03  2019-09-01 01:09:35           264           264
4               B00014  2019-09-01 00:13:08  2019-09-02 01:12:31           264           264
```
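
The `input_serialization_params` map directly to the S3 Select API's `InputSerialization` options: `"FileHeaderInfo": "Use"` treats the first CSV row as column names, and `"RecordDelimiter": "\r\n"` matches the file's CRLF line endings.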
## Filter JSON file

```python
wr.s3.select_query(
    sql="SELECT * FROM s3object[*] s where s.\"family_name\" = 'Biden'",
    path="s3://awsglue-datasets/examples/us-legislators/all/persons.json",
    input_serialization="JSON",
    input_serialization_params={
        "Type": "Document",
    },
)
```

Output:

```
  family_name                             contact_details               name  \
0       Biden  [{'type': 'twitter', 'value': 'joebiden'}]  Joseph Biden, Jr.

                                               links gender  \
0  [{'note': 'Wikipedia (ace)', 'url': 'https://a...   male

                                               image  \
0  https://theunitedstates.io/images/congress/ori...

                                         identifiers  \
0  [{'scheme': 'bioguide', 'identifier': 'B000444...

                                         other_names      sort_name  \
0  [{'note': 'alternate', 'name': 'Joe Biden'}, {...  Biden, Joseph

                                              images given_name birth_date  \
0  [{'url': 'https://theunitedstates.io/images/co...     Joseph 1942-11-20

                                     id
0  64239edf-8e06-4d2d-acc0-33d96bc79774
```
## Read Snappy compressed Parquet

```python
df = wr.s3.select_query(
    sql="SELECT * FROM s3object s where s.\"star_rating\" >= 5",
    path="s3://amazon-reviews-pds/parquet/product_category=Gift_Card/part-00000-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet",
    input_serialization="Parquet",
    input_serialization_params={},
    use_threads=True,
)
df.loc[:, df.columns != "product_title"].head()
```

Output:

```
  marketplace customer_id       review_id  product_id product_parent  \
0          US    52670295   RGPOFKORD8RTU  B0002CZPPG      867256265
1          US    29964102  R2U8X8V5KPB4J3  B00H5BMF00      373287760
2          US    25173351  R15XV3LXUMLTXL  B00PG40CO4      137115061
3          US    12516181  R3G6G7H8TX4H0T  B0002CZPPG      867256265
4          US    38355314  R2NJ7WNBU16YTQ  B00B2TFSO6       89375983

  star_rating helpful_votes total_votes vine verified_purchase  \
0           5           105         107    N                 N
1           5             0           0    N                 Y
2           5             0           0    N                 Y
3           5             6           6    N                 N
4           5             0           0    N                 Y

       review_headline                                        review_body  \
0  Excellent Gift Idea  I wonder if the other reviewer actually read t...
1           Five Stars               convenience is the name of the game.
2        Birthday Gift  This gift card was handled with accuracy in de...
3            Love 'em.  Gotta love these iTunes Prepaid Card thingys. ...
4           Five Stars                                            perfect

  review_date  year
0  2005-02-08  2005
1  2015-05-03  2015
2  2015-05-03  2015
3  2005-10-15  2005
4  2015-05-03  2015
```
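
Because each `select_query` call targets a single object, scanning a whole prefix means fanning the query out over the individual objects and concatenating the results. A minimal sketch, assuming the bucket and prefix below are placeholders and that all objects under the prefix share a schema:

```python
# Hedged sketch: apply a single-object S3 Select query to every object
# under a prefix. The bucket and prefix are placeholders.
import pandas as pd

import awswrangler as wr

paths = wr.s3.list_objects("s3://my-bucket/my-prefix/")  # one entry per object
df = pd.concat(
    (
        wr.s3.select_query(
            sql='SELECT * FROM s3object s WHERE s."star_rating" >= 5',
            path=path,
            input_serialization="Parquet",
            input_serialization_params={},
        )
        for path in paths
    ),
    ignore_index=True,
)
```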

validate.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,7 +3,7 @@ set -ex
 
 isort --check .
 black --check .
-mypy awswrangler
+yes y | mypy --install-types awswrangler
 flake8 .
 pylint -j 0 awswrangler
 pydocstyle awswrangler/ --convention=numpy
```
