Skip to content

Commit bf54802

Browse files
committed
Add athena_nested.ipynb tutorial!
1 parent cd10c72 commit bf54802

File tree

1 file changed

+378
-0
lines changed

1 file changed

+378
-0
lines changed

tutorials/athena_nested.ipynb

Lines changed: 378 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,378 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Athena with nested data types"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"### Target Dataset:\n",
15+
"\n",
16+
"```sql\n",
17+
"WITH dataset AS (\n",
18+
" SELECT ARRAY[\n",
19+
" CAST(ROW('ARN1', 'ACCOUTID1', 'TYPE1') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR)),\n",
20+
" CAST(ROW('ARN2', 'ACCOUTID2', 'TYPE2') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR)),\n",
21+
" CAST(ROW('ARN3', 'ACCOUTID3', 'TYPE3') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR))\n",
22+
" ] AS your_field\n",
23+
")\n",
24+
"SELECT\n",
25+
" *\n",
26+
"FROM dataset\n",
27+
"```"
28+
]
29+
},
30+
{
31+
"cell_type": "code",
32+
"execution_count": 1,
33+
"metadata": {},
34+
"outputs": [],
35+
"source": [
36+
"import awswrangler as wr"
37+
]
38+
},
39+
{
40+
"cell_type": "markdown",
41+
"metadata": {},
42+
"source": [
43+
"### Unnesting the inner struct"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": 2,
49+
"metadata": {},
50+
"outputs": [
51+
{
52+
"data": {
53+
"text/html": [
54+
"<div>\n",
55+
"<style scoped>\n",
56+
" .dataframe tbody tr th:only-of-type {\n",
57+
" vertical-align: middle;\n",
58+
" }\n",
59+
"\n",
60+
" .dataframe tbody tr th {\n",
61+
" vertical-align: top;\n",
62+
" }\n",
63+
"\n",
64+
" .dataframe thead th {\n",
65+
" text-align: right;\n",
66+
" }\n",
67+
"</style>\n",
68+
"<table border=\"1\" class=\"dataframe\">\n",
69+
" <thead>\n",
70+
" <tr style=\"text-align: right;\">\n",
71+
" <th></th>\n",
72+
" <th>arn</th>\n",
73+
" <th>accountid</th>\n",
74+
" <th>type</th>\n",
75+
" </tr>\n",
76+
" </thead>\n",
77+
" <tbody>\n",
78+
" <tr>\n",
79+
" <th>0</th>\n",
80+
" <td>[ARN1, ARN2, ARN3]</td>\n",
81+
" <td>[ACCOUTID1, ACCOUTID2, ACCOUTID3]</td>\n",
82+
" <td>[TYPE1, TYPE2, TYPE3]</td>\n",
83+
" </tr>\n",
84+
" </tbody>\n",
85+
"</table>\n",
86+
"</div>"
87+
],
88+
"text/plain": [
89+
" arn accountid \\\n",
90+
"0 [ARN1, ARN2, ARN3] [ACCOUTID1, ACCOUTID2, ACCOUTID3] \n",
91+
"\n",
92+
" type \n",
93+
"0 [TYPE1, TYPE2, TYPE3] "
94+
]
95+
},
96+
"execution_count": 2,
97+
"metadata": {},
98+
"output_type": "execute_result"
99+
}
100+
],
101+
"source": [
102+
"sql = \"\"\"\n",
103+
"WITH dataset AS (\n",
104+
" SELECT ARRAY[\n",
105+
" CAST(ROW('ARN1', 'ACCOUTID1', 'TYPE1') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR)),\n",
106+
" CAST(ROW('ARN2', 'ACCOUTID2', 'TYPE2') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR)),\n",
107+
" CAST(ROW('ARN3', 'ACCOUTID3', 'TYPE3') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR))\n",
108+
" ] AS your_field\n",
109+
")\n",
110+
"SELECT\n",
111+
" transform(your_field, x -> x.arn) AS arn,\n",
112+
" transform(your_field, x -> x.accountid) AS accountid,\n",
113+
" transform(your_field, x -> x.type) AS type\n",
114+
"FROM dataset\n",
115+
"\"\"\"\n",
116+
"\n",
117+
"df = wr.pandas.read_sql_athena(sql)\n",
118+
"df.head()"
119+
]
120+
},
121+
{
122+
"cell_type": "code",
123+
"execution_count": 3,
124+
"metadata": {},
125+
"outputs": [
126+
{
127+
"data": {
128+
"text/plain": [
129+
"'ARN1'"
130+
]
131+
},
132+
"execution_count": 3,
133+
"metadata": {},
134+
"output_type": "execute_result"
135+
}
136+
],
137+
"source": [
138+
"df.iloc[0].arn[0]"
139+
]
140+
},
141+
{
142+
"cell_type": "markdown",
143+
"metadata": {},
144+
"source": [
145+
"### Unnesting the outer array (Only with CTAS approach)"
146+
]
147+
},
148+
{
149+
"cell_type": "code",
150+
"execution_count": 4,
151+
"metadata": {},
152+
"outputs": [
153+
{
154+
"data": {
155+
"text/html": [
156+
"<div>\n",
157+
"<style scoped>\n",
158+
" .dataframe tbody tr th:only-of-type {\n",
159+
" vertical-align: middle;\n",
160+
" }\n",
161+
"\n",
162+
" .dataframe tbody tr th {\n",
163+
" vertical-align: top;\n",
164+
" }\n",
165+
"\n",
166+
" .dataframe thead th {\n",
167+
" text-align: right;\n",
168+
" }\n",
169+
"</style>\n",
170+
"<table border=\"1\" class=\"dataframe\">\n",
171+
" <thead>\n",
172+
" <tr style=\"text-align: right;\">\n",
173+
" <th></th>\n",
174+
" <th>your_field</th>\n",
175+
" </tr>\n",
176+
" </thead>\n",
177+
" <tbody>\n",
178+
" <tr>\n",
179+
" <th>0</th>\n",
180+
" <td>{'arn': 'ARN1', 'accountid': 'ACCOUTID1', 'typ...</td>\n",
181+
" </tr>\n",
182+
" <tr>\n",
183+
" <th>1</th>\n",
184+
" <td>{'arn': 'ARN2', 'accountid': 'ACCOUTID2', 'typ...</td>\n",
185+
" </tr>\n",
186+
" <tr>\n",
187+
" <th>2</th>\n",
188+
" <td>{'arn': 'ARN3', 'accountid': 'ACCOUTID3', 'typ...</td>\n",
189+
" </tr>\n",
190+
" </tbody>\n",
191+
"</table>\n",
192+
"</div>"
193+
],
194+
"text/plain": [
195+
" your_field\n",
196+
"0 {'arn': 'ARN1', 'accountid': 'ACCOUTID1', 'typ...\n",
197+
"1 {'arn': 'ARN2', 'accountid': 'ACCOUTID2', 'typ...\n",
198+
"2 {'arn': 'ARN3', 'accountid': 'ACCOUTID3', 'typ..."
199+
]
200+
},
201+
"execution_count": 4,
202+
"metadata": {},
203+
"output_type": "execute_result"
204+
}
205+
],
206+
"source": [
207+
"sql = \"\"\"\n",
208+
"WITH dataset AS (\n",
209+
" SELECT ARRAY[\n",
210+
" CAST(ROW('ARN1', 'ACCOUTID1', 'TYPE1') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR)),\n",
211+
" CAST(ROW('ARN2', 'ACCOUTID2', 'TYPE2') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR)),\n",
212+
" CAST(ROW('ARN3', 'ACCOUTID3', 'TYPE3') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR))\n",
213+
" ] AS your_field\n",
214+
")\n",
215+
"SELECT t.your_field\n",
216+
"FROM dataset, UNNEST(your_field) as t(your_field)\n",
217+
"\"\"\"\n",
218+
"\n",
219+
"df = wr.pandas.read_sql_athena(sql, ctas_approach=True)\n",
220+
"df.head()"
221+
]
222+
},
223+
{
224+
"cell_type": "code",
225+
"execution_count": 5,
226+
"metadata": {},
227+
"outputs": [
228+
{
229+
"data": {
230+
"text/plain": [
231+
"'ARN1'"
232+
]
233+
},
234+
"execution_count": 5,
235+
"metadata": {},
236+
"output_type": "execute_result"
237+
}
238+
],
239+
"source": [
240+
"df.iloc[0].your_field[\"arn\"]"
241+
]
242+
},
243+
{
244+
"cell_type": "markdown",
245+
"metadata": {},
246+
"source": [
247+
"### Unnesting the outer array and the inner struct (Fully unnested)"
248+
]
249+
},
250+
{
251+
"cell_type": "code",
252+
"execution_count": 6,
253+
"metadata": {},
254+
"outputs": [
255+
{
256+
"data": {
257+
"text/html": [
258+
"<div>\n",
259+
"<style scoped>\n",
260+
" .dataframe tbody tr th:only-of-type {\n",
261+
" vertical-align: middle;\n",
262+
" }\n",
263+
"\n",
264+
" .dataframe tbody tr th {\n",
265+
" vertical-align: top;\n",
266+
" }\n",
267+
"\n",
268+
" .dataframe thead th {\n",
269+
" text-align: right;\n",
270+
" }\n",
271+
"</style>\n",
272+
"<table border=\"1\" class=\"dataframe\">\n",
273+
" <thead>\n",
274+
" <tr style=\"text-align: right;\">\n",
275+
" <th></th>\n",
276+
" <th>arn</th>\n",
277+
" <th>accountid</th>\n",
278+
" <th>type</th>\n",
279+
" </tr>\n",
280+
" </thead>\n",
281+
" <tbody>\n",
282+
" <tr>\n",
283+
" <th>0</th>\n",
284+
" <td>ARN1</td>\n",
285+
" <td>ACCOUTID1</td>\n",
286+
" <td>TYPE1</td>\n",
287+
" </tr>\n",
288+
" <tr>\n",
289+
" <th>1</th>\n",
290+
" <td>ARN2</td>\n",
291+
" <td>ACCOUTID2</td>\n",
292+
" <td>TYPE2</td>\n",
293+
" </tr>\n",
294+
" <tr>\n",
295+
" <th>2</th>\n",
296+
" <td>ARN3</td>\n",
297+
" <td>ACCOUTID3</td>\n",
298+
" <td>TYPE3</td>\n",
299+
" </tr>\n",
300+
" </tbody>\n",
301+
"</table>\n",
302+
"</div>"
303+
],
304+
"text/plain": [
305+
" arn accountid type\n",
306+
"0 ARN1 ACCOUTID1 TYPE1\n",
307+
"1 ARN2 ACCOUTID2 TYPE2\n",
308+
"2 ARN3 ACCOUTID3 TYPE3"
309+
]
310+
},
311+
"execution_count": 6,
312+
"metadata": {},
313+
"output_type": "execute_result"
314+
}
315+
],
316+
"source": [
317+
"sql = \"\"\"\n",
318+
"WITH dataset AS (\n",
319+
" SELECT ARRAY[\n",
320+
" CAST(ROW('ARN1', 'ACCOUTID1', 'TYPE1') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR)),\n",
321+
" CAST(ROW('ARN2', 'ACCOUTID2', 'TYPE2') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR)),\n",
322+
" CAST(ROW('ARN3', 'ACCOUTID3', 'TYPE3') AS ROW(arn VARCHAR, accountid VARCHAR, type VARCHAR))\n",
323+
" ] AS your_field\n",
324+
")\n",
325+
"SELECT\n",
326+
" t.your_field.arn,\n",
327+
" t.your_field.accountid,\n",
328+
" t.your_field.type\n",
329+
"FROM dataset, UNNEST(your_field) as t(your_field)\n",
330+
"\"\"\"\n",
331+
"\n",
332+
"df = wr.pandas.read_sql_athena(sql)\n",
333+
"df.head()"
334+
]
335+
},
336+
{
337+
"cell_type": "code",
338+
"execution_count": 7,
339+
"metadata": {},
340+
"outputs": [
341+
{
342+
"data": {
343+
"text/plain": [
344+
"'ARN1'"
345+
]
346+
},
347+
"execution_count": 7,
348+
"metadata": {},
349+
"output_type": "execute_result"
350+
}
351+
],
352+
"source": [
353+
"df.iloc[0].arn"
354+
]
355+
}
356+
],
357+
"metadata": {
358+
"kernelspec": {
359+
"display_name": "Python 3",
360+
"language": "python",
361+
"name": "python3"
362+
},
363+
"language_info": {
364+
"codemirror_mode": {
365+
"name": "ipython",
366+
"version": 3
367+
},
368+
"file_extension": ".py",
369+
"mimetype": "text/x-python",
370+
"name": "python",
371+
"nbconvert_exporter": "python",
372+
"pygments_lexer": "ipython3",
373+
"version": "3.7.4"
374+
}
375+
},
376+
"nbformat": 4,
377+
"nbformat_minor": 4
378+
}

0 commit comments

Comments
 (0)