1+ {
2+ "metadata" : {
3+ "language_info" : {
4+ "codemirror_mode" : {
5+ "name" : " ipython" ,
6+ "version" : 3
7+ },
8+ "file_extension" : " .py" ,
9+ "mimetype" : " text/x-python" ,
10+ "name" : " python" ,
11+ "nbconvert_exporter" : " python" ,
12+ "pygments_lexer" : " ipython3" ,
13+ "version" : " 3.7.7"
14+ },
15+ "orig_nbformat" : 2 ,
16+ "kernelspec" : {
17+ "name" : " pythonjvsc74a57bd0e4beff3b9c91951bd870e0e6d1ba9dfdd106cfe45c6f3d0f8d31550063fd3386" ,
18+ "display_name" : " Python 3.7.7 64-bit ('.env': venv)"
19+ },
20+ "metadata" : {
21+ "interpreter" : {
22+ "hash" : " e4beff3b9c91951bd870e0e6d1ba9dfdd106cfe45c6f3d0f8d31550063fd3386"
23+ }
24+ }
25+ },
26+ "nbformat" : 4 ,
27+ "nbformat_minor" : 2 ,
28+ "cells" : [
29+ {
30+ "source" : [
31+ " [](https://github.com/awslabs/aws-data-wrangler)"
32+ ],
33+ "cell_type" : " markdown" ,
34+ "metadata" : {}
35+ },
36+ {
37+ "source" : [
38+ " # 29 - S3 Select"
39+ ],
40+ "cell_type" : " markdown" ,
41+ "metadata" : {}
42+ },
43+ {
44+ "source" : [
45+ " AWS Data Wrangler supports [Amazon S3 Select](https://aws.amazon.com/blogs/aws/s3-glacier-select/), enabling applications to use SQL statements in order to query and filter the contents of a single S3 object. It works on objects stored in CSV, JSON or Apache Parquet, including compressed and large files of several TBs.\n " ,
46+ " \n " ,
47+ " With S3 Select, the query workload is delegated to Amazon S3, leading to lower latency and cost, and to higher performance (up to 400% improvement). This is in comparison with other Wrangler operations such as `read_parquet` where the S3 object is downloaded and filtered on the client-side.\n " ,
48+ " \n " ,
49+ " This feature has a number of limitations however, and should be used for specific scenarios only:\n " ,
50+ " * It operates on a single S3 object\n " ,
51+ " * The maximum length of a record in the input or result is 1 MB\n " ,
52+ " * The maximum uncompressed row group size is 256 MB (Parquet only)\n " ,
53+ " * It can only emit nested data in JSON format\n " ,
54+ " * Certain SQL operations are not supported (e.g. ORDER BY)\n "
55+ ],
56+ "cell_type" : " markdown" ,
57+ "metadata" : {}
58+ },
59+ {
60+ "source" : [
61+ " ## Read full CSV file"
62+ ],
63+ "cell_type" : " markdown" ,
64+ "metadata" : {}
65+ },
66+ {
67+ "cell_type" : " code" ,
68+ "execution_count" : 1 ,
69+ "metadata" : {},
70+ "outputs" : [
71+ {
72+ "output_type" : " execute_result" ,
73+ "data" : {
74+ "text/plain" : [
75+ " dispatching_base_num pickup_datetime dropoff_datetime PULocationID \\\n " ,
76+ " 0 B00009 2019-09-01 00:35:00 2019-09-01 00:59:00 264 \n " ,
77+ " 1 B00009 2019-09-01 00:48:00 2019-09-01 01:09:00 264 \n " ,
78+ " 2 B00014 2019-09-01 00:16:18 2019-09-02 00:35:37 264 \n " ,
79+ " 3 B00014 2019-09-01 00:55:03 2019-09-01 01:09:35 264 \n " ,
80+ " 4 B00014 2019-09-01 00:13:08 2019-09-02 01:12:31 264 \n " ,
81+ " \n " ,
82+ " DOLocationID SR_Flag \n " ,
83+ " 0 264 \n " ,
84+ " 1 264 \n " ,
85+ " 2 264 \n " ,
86+ " 3 264 \n " ,
87+ " 4 264 "
88+ ],
89+ "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>dispatching_base_num</th>\n <th>pickup_datetime</th>\n <th>dropoff_datetime</th>\n <th>PULocationID</th>\n <th>DOLocationID</th>\n <th>SR_Flag</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>B00009</td>\n <td>2019-09-01 00:35:00</td>\n <td>2019-09-01 00:59:00</td>\n <td>264</td>\n <td>264</td>\n <td></td>\n </tr>\n <tr>\n <th>1</th>\n <td>B00009</td>\n <td>2019-09-01 00:48:00</td>\n <td>2019-09-01 01:09:00</td>\n <td>264</td>\n <td>264</td>\n <td></td>\n </tr>\n <tr>\n <th>2</th>\n <td>B00014</td>\n <td>2019-09-01 00:16:18</td>\n <td>2019-09-02 00:35:37</td>\n <td>264</td>\n <td>264</td>\n <td></td>\n </tr>\n <tr>\n <th>3</th>\n <td>B00014</td>\n <td>2019-09-01 00:55:03</td>\n <td>2019-09-01 01:09:35</td>\n <td>264</td>\n <td>264</td>\n <td></td>\n </tr>\n <tr>\n <th>4</th>\n <td>B00014</td>\n <td>2019-09-01 00:13:08</td>\n <td>2019-09-02 01:12:31</td>\n <td>264</td>\n <td>264</td>\n <td></td>\n </tr>\n </tbody>\n</table>\n</div>"
90+ },
91+ "metadata" : {},
92+ "execution_count" : 1
93+ }
94+ ],
95+ "source" : [
96+ " import awswrangler as wr\n " ,
97+ " \n " ,
98+ " df = wr.s3.select_query(\n " ,
99+ " sql=\" SELECT * FROM s3object\" ,\n " ,
100+ " path=\" s3://nyc-tlc/trip data/fhv_tripdata_2019-09.csv\" , # 58 MB\n " ,
101+ " input_serialization=\" CSV\" ,\n " ,
102+ " input_serialization_params={\n " ,
103+ " \" FileHeaderInfo\" : \" Use\" ,\n " ,
104+ " \" RecordDelimiter\" : \"\\ r\\ n\" ,\n " ,
105+ " },\n " ,
106+ " use_threads=True,\n " ,
107+ " )\n " ,
108+ " df.head()"
109+ ]
110+ },
111+ {
112+ "source" : [
113+ " ## Filter JSON file"
114+ ],
115+ "cell_type" : " markdown" ,
116+ "metadata" : {}
117+ },
118+ {
119+ "cell_type" : " code" ,
120+ "execution_count" : 2 ,
121+ "metadata" : {},
122+ "outputs" : [
123+ {
124+ "output_type" : " execute_result" ,
125+ "data" : {
126+ "text/plain" : [
127+ " family_name contact_details name \\\n " ,
128+ " 0 Biden [{'type': 'twitter', 'value': 'joebiden'}] Joseph Biden, Jr. \n " ,
129+ " \n " ,
130+ " links gender \\\n " ,
131+ " 0 [{'note': 'Wikipedia (ace)', 'url': 'https://a... male \n " ,
132+ " \n " ,
133+ " image \\\n " ,
134+ " 0 https://theunitedstates.io/images/congress/ori... \n " ,
135+ " \n " ,
136+ " identifiers \\\n " ,
137+ " 0 [{'scheme': 'bioguide', 'identifier': 'B000444... \n " ,
138+ " \n " ,
139+ " other_names sort_name \\\n " ,
140+ " 0 [{'note': 'alternate', 'name': 'Joe Biden'}, {... Biden, Joseph \n " ,
141+ " \n " ,
142+ " images given_name birth_date \\\n " ,
143+ " 0 [{'url': 'https://theunitedstates.io/images/co... Joseph 1942-11-20 \n " ,
144+ " \n " ,
145+ " id \n " ,
146+ " 0 64239edf-8e06-4d2d-acc0-33d96bc79774 "
147+ ],
148+ "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>family_name</th>\n <th>contact_details</th>\n <th>name</th>\n <th>links</th>\n <th>gender</th>\n <th>image</th>\n <th>identifiers</th>\n <th>other_names</th>\n <th>sort_name</th>\n <th>images</th>\n <th>given_name</th>\n <th>birth_date</th>\n <th>id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Biden</td>\n <td>[{'type': 'twitter', 'value': 'joebiden'}]</td>\n <td>Joseph Biden, Jr.</td>\n <td>[{'note': 'Wikipedia (ace)', 'url': 'https://a...</td>\n <td>male</td>\n <td>https://theunitedstates.io/images/congress/ori...</td>\n <td>[{'scheme': 'bioguide', 'identifier': 'B000444...</td>\n <td>[{'note': 'alternate', 'name': 'Joe Biden'}, {...</td>\n <td>Biden, Joseph</td>\n <td>[{'url': 'https://theunitedstates.io/images/co...</td>\n <td>Joseph</td>\n <td>1942-11-20</td>\n <td>64239edf-8e06-4d2d-acc0-33d96bc79774</td>\n </tr>\n </tbody>\n</table>\n</div>"
149+ },
150+ "metadata" : {},
151+ "execution_count" : 2
152+ }
153+ ],
154+ "source" : [
155+ " wr.s3.select_query(\n " ,
156+ " sql=\" SELECT * FROM s3object[*] s where s.\\\" family_name\\\" = \\ 'Biden\\ '\" ,\n " ,
157+ " path=\" s3://awsglue-datasets/examples/us-legislators/all/persons.json\" ,\n " ,
158+ " input_serialization=\" JSON\" ,\n " ,
159+ " input_serialization_params={\n " ,
160+ " \" Type\" : \" Document\" ,\n " ,
161+ " },\n " ,
162+ " )"
163+ ]
164+ },
165+ {
166+ "source" : [
167+ " ## Read Snappy compressed Parquet"
168+ ],
169+ "cell_type" : " markdown" ,
170+ "metadata" : {}
171+ },
172+ {
173+ "cell_type" : " code" ,
174+ "execution_count" : 3 ,
175+ "metadata" : {},
176+ "outputs" : [
177+ {
178+ "output_type" : " execute_result" ,
179+ "data" : {
180+ "text/plain" : [
181+ " marketplace customer_id review_id product_id product_parent \\\n " ,
182+ " 0 US 52670295 RGPOFKORD8RTU B0002CZPPG 867256265 \n " ,
183+ " 1 US 29964102 R2U8X8V5KPB4J3 B00H5BMF00 373287760 \n " ,
184+ " 2 US 25173351 R15XV3LXUMLTXL B00PG40CO4 137115061 \n " ,
185+ " 3 US 12516181 R3G6G7H8TX4H0T B0002CZPPG 867256265 \n " ,
186+ " 4 US 38355314 R2NJ7WNBU16YTQ B00B2TFSO6 89375983 \n " ,
187+ " \n " ,
188+ " star_rating helpful_votes total_votes vine verified_purchase \\\n " ,
189+ " 0 5 105 107 N N \n " ,
190+ " 1 5 0 0 N Y \n " ,
191+ " 2 5 0 0 N Y \n " ,
192+ " 3 5 6 6 N N \n " ,
193+ " 4 5 0 0 N Y \n " ,
194+ " \n " ,
195+ " review_headline review_body \\\n " ,
196+ " 0 Excellent Gift Idea I wonder if the other reviewer actually read t... \n " ,
197+ " 1 Five Stars convenience is the name of the game. \n " ,
198+ " 2 Birthday Gift This gift card was handled with accuracy in de... \n " ,
199+ " 3 Love 'em. Gotta love these iTunes Prepaid Card thingys. ... \n " ,
200+ " 4 Five Stars perfect \n " ,
201+ " \n " ,
202+ " review_date year \n " ,
203+ " 0 2005-02-08 2005 \n " ,
204+ " 1 2015-05-03 2015 \n " ,
205+ " 2 2015-05-03 2015 \n " ,
206+ " 3 2005-10-15 2005 \n " ,
207+ " 4 2015-05-03 2015 "
208+ ],
209+ "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>marketplace</th>\n <th>customer_id</th>\n <th>review_id</th>\n <th>product_id</th>\n <th>product_parent</th>\n <th>star_rating</th>\n <th>helpful_votes</th>\n <th>total_votes</th>\n <th>vine</th>\n <th>verified_purchase</th>\n <th>review_headline</th>\n <th>review_body</th>\n <th>review_date</th>\n <th>year</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>US</td>\n <td>52670295</td>\n <td>RGPOFKORD8RTU</td>\n <td>B0002CZPPG</td>\n <td>867256265</td>\n <td>5</td>\n <td>105</td>\n <td>107</td>\n <td>N</td>\n <td>N</td>\n <td>Excellent Gift Idea</td>\n <td>I wonder if the other reviewer actually read t...</td>\n <td>2005-02-08</td>\n <td>2005</td>\n </tr>\n <tr>\n <th>1</th>\n <td>US</td>\n <td>29964102</td>\n <td>R2U8X8V5KPB4J3</td>\n <td>B00H5BMF00</td>\n <td>373287760</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>N</td>\n <td>Y</td>\n <td>Five Stars</td>\n <td>convenience is the name of the game.</td>\n <td>2015-05-03</td>\n <td>2015</td>\n </tr>\n <tr>\n <th>2</th>\n <td>US</td>\n <td>25173351</td>\n <td>R15XV3LXUMLTXL</td>\n <td>B00PG40CO4</td>\n <td>137115061</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>N</td>\n <td>Y</td>\n <td>Birthday Gift</td>\n <td>This gift card was handled with accuracy in de...</td>\n <td>2015-05-03</td>\n <td>2015</td>\n </tr>\n <tr>\n <th>3</th>\n <td>US</td>\n <td>12516181</td>\n <td>R3G6G7H8TX4H0T</td>\n <td>B0002CZPPG</td>\n <td>867256265</td>\n <td>5</td>\n <td>6</td>\n <td>6</td>\n <td>N</td>\n <td>N</td>\n <td>Love 'em.</td>\n <td>Gotta love these iTunes Prepaid Card thingys. ...</td>\n <td>2005-10-15</td>\n <td>2005</td>\n </tr>\n <tr>\n <th>4</th>\n <td>US</td>\n <td>38355314</td>\n <td>R2NJ7WNBU16YTQ</td>\n <td>B00B2TFSO6</td>\n <td>89375983</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>N</td>\n <td>Y</td>\n <td>Five Stars</td>\n <td>perfect</td>\n <td>2015-05-03</td>\n <td>2015</td>\n </tr>\n </tbody>\n</table>\n</div>"
210+ },
211+ "metadata" : {},
212+ "execution_count" : 3
213+ }
214+ ],
215+ "source" : [
216+ " df = wr.s3.select_query(\n " ,
217+ " sql=\" SELECT * FROM s3object s where s.\\\" star_rating\\\" >= 5\" ,\n " ,
218+ " path=\" s3://amazon-reviews-pds/parquet/product_category=Gift_Card/part-00000-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet\" ,\n " ,
219+ " input_serialization=\" Parquet\" ,\n " ,
220+ " input_serialization_params={},\n " ,
221+ " use_threads=True,\n " ,
222+ " )\n " ,
223+ " df.loc[:, df.columns != \" product_title\" ].head()"
224+ ]
225+ }
226+ ]
227+ }
0 commit comments