Skip to content

Commit 7d6c2ad

Browse files
committed
work on example
1 parent 7dcd7bb commit 7d6c2ad

File tree

1 file changed

+88
-31
lines changed

1 file changed

+88
-31
lines changed

Examples/Methods/MethodWarnings.ipynb

Lines changed: 88 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
},
2929
"outputs": [],
3030
"source": [
31+
"import numpy as np\n",
3132
"import pandas as pd\n",
3233
"from data_algebra.data_ops import descr\n",
3334
"import data_algebra.test_util\n",
@@ -153,6 +154,34 @@
153154
}
154155
}
155156
},
157+
{
158+
"cell_type": "markdown",
159+
"source": [
160+
"We can check this matches expectations."
161+
],
162+
"metadata": {
163+
"collapsed": false,
164+
"pycharm": {
165+
"name": "#%% md\n"
166+
}
167+
}
168+
},
169+
{
170+
"cell_type": "code",
171+
"execution_count": 5,
172+
"outputs": [],
173+
"source": [
174+
"for group in set(d['g']):\n",
175+
" assert np.all(pandas_res.loc[pandas_res['g'] == group, 'xm']\n",
176+
" == np.median(d.loc[d['g'] == group, 'x']))"
177+
],
178+
"metadata": {
179+
"collapsed": false,
180+
"pycharm": {
181+
"name": "#%%\n"
182+
}
183+
}
184+
},
156185
{
157186
"cell_type": "markdown",
158187
"source": [
@@ -171,13 +200,13 @@
171200
},
172201
{
173202
"cell_type": "code",
174-
"execution_count": 5,
203+
"execution_count": 6,
175204
"outputs": [
176205
{
177206
"data": {
178207
"text/plain": "(TableDescription(table_name=\"d\", column_names=[\"id\", \"x\", \"g\"]))"
179208
},
180-
"execution_count": 5,
209+
"execution_count": 6,
181210
"metadata": {},
182211
"output_type": "execute_result"
183212
}
@@ -209,7 +238,7 @@
209238
},
210239
{
211240
"cell_type": "code",
212-
"execution_count": 6,
241+
"execution_count": 7,
213242
"outputs": [],
214243
"source": [
215244
"bigquery_sql = bigquery_handle.to_sql(ops)\n"
@@ -235,14 +264,14 @@
235264
},
236265
{
237266
"cell_type": "code",
238-
"execution_count": 7,
267+
"execution_count": 8,
239268
"outputs": [
240269
{
241270
"data": {
242-
"text/plain": " g id x xm\n0 a 0 4.0 3.0\n1 b 1 50.0 26.1\n2 a 2 1.0 3.0\n3 a 3 3.0 3.0\n4 b 4 2.2 26.1",
243-
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>g</th>\n <th>id</th>\n <th>x</th>\n <th>xm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>a</td>\n <td>0</td>\n <td>4.0</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>b</td>\n <td>1</td>\n <td>50.0</td>\n <td>26.1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>a</td>\n <td>2</td>\n <td>1.0</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>a</td>\n <td>3</td>\n <td>3.0</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>b</td>\n <td>4</td>\n <td>2.2</td>\n <td>26.1</td>\n </tr>\n </tbody>\n</table>\n</div>"
271+
"text/plain": " id x g xm\n0 0 4.0 a 3.0\n1 1 50.0 b 26.1\n2 2 1.0 a 3.0\n3 3 3.0 a 3.0\n4 4 2.2 b 26.1",
272+
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>x</th>\n <th>g</th>\n <th>xm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>4.0</td>\n <td>a</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>50.0</td>\n <td>b</td>\n <td>26.1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>1.0</td>\n <td>a</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>3.0</td>\n <td>a</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>2.2</td>\n <td>b</td>\n <td>26.1</td>\n </tr>\n </tbody>\n</table>\n</div>"
244273
},
245-
"execution_count": 7,
274+
"execution_count": 8,
246275
"metadata": {},
247276
"output_type": "execute_result"
248277
}
@@ -262,7 +291,7 @@
262291
},
263292
{
264293
"cell_type": "code",
265-
"execution_count": 8,
294+
"execution_count": 9,
266295
"outputs": [],
267296
"source": [
268297
"assert data_algebra.test_util.equivalent_frames(pandas_res, db_res)"
@@ -290,7 +319,7 @@
290319
},
291320
{
292321
"cell_type": "code",
293-
"execution_count": 9,
322+
"execution_count": 10,
294323
"outputs": [],
295324
"source": [
296325
"ops_p = (\n",
@@ -311,7 +340,7 @@
311340
{
312341
"cell_type": "markdown",
313342
"source": [
314-
"This pipeline works as follows."
343+
"This pipeline works in Pandas as follows."
315344
],
316345
"metadata": {
317346
"collapsed": false,
@@ -322,14 +351,14 @@
322351
},
323352
{
324353
"cell_type": "code",
325-
"execution_count": 10,
354+
"execution_count": 11,
326355
"outputs": [
327356
{
328357
"data": {
329358
"text/plain": " g xm\n0 a 3.0\n1 b 26.1",
330359
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>g</th>\n <th>xm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>a</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>b</td>\n <td>26.1</td>\n </tr>\n </tbody>\n</table>\n</div>"
331360
},
332-
"execution_count": 10,
361+
"execution_count": 11,
333362
"metadata": {},
334363
"output_type": "execute_result"
335364
}
@@ -349,7 +378,7 @@
349378
{
350379
"cell_type": "markdown",
351380
"source": [
352-
"But we get a warning if we attempt to convert this to BigQuery SQL."
381+
"And we again see expected results."
353382
],
354383
"metadata": {
355384
"collapsed": false,
@@ -360,13 +389,41 @@
360389
},
361390
{
362391
"cell_type": "code",
363-
"execution_count": 11,
392+
"execution_count": 12,
393+
"outputs": [],
394+
"source": [
395+
"for group in set(d['g']):\n",
396+
" assert (pandas_res_p.loc[pandas_res_p['g'] == group, 'xm'].values[0]\n",
397+
" == np.median(d.loc[d['g'] == group, 'x']))\n"
398+
],
399+
"metadata": {
400+
"collapsed": false,
401+
"pycharm": {
402+
"name": "#%%\n"
403+
}
404+
}
405+
},
406+
{
407+
"cell_type": "markdown",
408+
"source": [
409+
"But, we get a warning if we attempt to convert this to BigQuery SQL."
410+
],
411+
"metadata": {
412+
"collapsed": false,
413+
"pycharm": {
414+
"name": "#%% md\n"
415+
}
416+
}
417+
},
418+
{
419+
"cell_type": "code",
420+
"execution_count": 13,
364421
"outputs": [
365422
{
366423
"name": "stderr",
367424
"output_type": "stream",
368425
"text": [
369-
"/Users/johnmount/Documents/work/data_algebra/data_algebra/db_model.py:1692: UserWarning: BigQueryModel translation doesn't fully support method context: [MethodUse(op_name='median', is_project=True, is_windowed=False, is_ordered=False)]\n",
426+
"/Users/johnmount/Documents/work/data_algebra/data_algebra/db_model.py:1694: UserWarning: BigQueryModel translation doesn't fully support method context: [MethodUse(op_name='median', is_project=True, is_windowed=False, is_ordered=False)]\n",
370427
" warnings.warn(f\"{self} translation doesn't fully support method context: {non_rec}\", UserWarning)\n"
371428
]
372429
}
@@ -396,15 +453,15 @@
396453
},
397454
{
398455
"cell_type": "code",
399-
"execution_count": 12,
456+
"execution_count": 14,
400457
"outputs": [
401458
{
402459
"name": "stdout",
403460
"output_type": "stream",
404461
"text": [
405462
"caught: 400 percentile_cont aggregate function is not supported.\n",
406463
"\n",
407-
"(job ID: 0f2cd57d-9e20-4366-a95c-109997b8c75f)\n",
464+
"(job ID: f5d44be0-65d1-4860-b985-ca427b38a83b)\n",
408465
"\n",
409466
" -----Query Job SQL Follows----- \n",
410467
"\n",
@@ -416,8 +473,8 @@
416473
" 5:WITH\n",
417474
" 6: `table_reference_0` AS (\n",
418475
" 7: SELECT\n",
419-
" 8: `g` ,\n",
420-
" 9: `x`\n",
476+
" 8: `x` ,\n",
477+
" 9: `g`\n",
421478
" 10: FROM\n",
422479
" 11: `data-algebra-test.test_1.d`\n",
423480
" 12: )\n",
@@ -463,7 +520,7 @@
463520
},
464521
{
465522
"cell_type": "code",
466-
"execution_count": 13,
523+
"execution_count": 15,
467524
"outputs": [
468525
{
469526
"name": "stdout",
@@ -476,8 +533,8 @@
476533
"WITH\n",
477534
" `table_reference_0` AS (\n",
478535
" SELECT\n",
479-
" `g` ,\n",
480-
" `x`\n",
536+
" `x` ,\n",
537+
" `g`\n",
481538
" FROM\n",
482539
" `data-algebra-test.test_1.d`\n",
483540
" )\n",
@@ -516,7 +573,7 @@
516573
},
517574
{
518575
"cell_type": "code",
519-
"execution_count": 14,
576+
"execution_count": 16,
520577
"outputs": [
521578
{
522579
"name": "stdout",
@@ -529,9 +586,9 @@
529586
"WITH\n",
530587
" `extend_0` AS (\n",
531588
" SELECT -- .extend({ 'xm': 'x.median()'}, partition_by=['g'])\n",
532-
" `g` ,\n",
533589
" `id` ,\n",
534590
" `x` ,\n",
591+
" `g` ,\n",
535592
" PERCENTILE_CONT(`x`, 0.5) OVER ( PARTITION BY `g` ) AS `xm`\n",
536593
" FROM\n",
537594
" `data-algebra-test.test_1.d`\n",
@@ -559,7 +616,7 @@
559616
{
560617
"cell_type": "markdown",
561618
"source": [
562-
"The above failure can come as a surprise. But the new feature of the data algebra is: the \"translate to SQL\" step warned we had a potential problem. This doesn't even require a full database handle, it is data incorporated into the database model during package assembly.\n",
619+
"Given how similar the two SQL queries are, the above failure can come as a surprise. But a new feature of the data algebra is: the \"translate to SQL\" step warns we have a potential problem. This doesn't even require a full database handle, it is data incorporated into the database model during package assembly.\n",
563620
"\n",
564621
"## Patching The Solution\n",
565622
"\n",
@@ -574,14 +631,14 @@
574631
},
575632
{
576633
"cell_type": "code",
577-
"execution_count": 15,
634+
"execution_count": 17,
578635
"outputs": [
579636
{
580637
"data": {
581638
"text/plain": " xm g\n0 3.0 a\n1 26.1 b",
582639
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>xm</th>\n <th>g</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>3.0</td>\n <td>a</td>\n </tr>\n <tr>\n <th>1</th>\n <td>26.1</td>\n <td>b</td>\n </tr>\n </tbody>\n</table>\n</div>"
583640
},
584-
"execution_count": 15,
641+
"execution_count": 17,
585642
"metadata": {},
586643
"output_type": "execute_result"
587644
}
@@ -608,7 +665,7 @@
608665
},
609666
{
610667
"cell_type": "code",
611-
"execution_count": 16,
668+
"execution_count": 18,
612669
"outputs": [],
613670
"source": [
614671
"assert data_algebra.test_util.equivalent_frames(pandas_res_p, db_res_p)"
@@ -641,13 +698,13 @@
641698
},
642699
{
643700
"cell_type": "code",
644-
"execution_count": 17,
701+
"execution_count": 19,
645702
"outputs": [
646703
{
647704
"data": {
648705
"text/plain": "(\n TableDescription(table_name=\"d\", column_names=[\"id\", \"x\", \"g\"])\n .extend({\"xm\": \"x.median()\"}, partition_by=[\"g\"])\n .project({\"xm\": \"xm.mean()\"}, group_by=[\"g\"])\n)"
649706
},
650-
"execution_count": 17,
707+
"execution_count": 19,
651708
"metadata": {},
652709
"output_type": "execute_result"
653710
}
@@ -676,7 +733,7 @@
676733
},
677734
{
678735
"cell_type": "code",
679-
"execution_count": 18,
736+
"execution_count": 20,
680737
"outputs": [],
681738
"source": [
682739
"# clean up\n",

0 commit comments

Comments
 (0)