|
28 | 28 | }, |
29 | 29 | "outputs": [], |
30 | 30 | "source": [ |
| 31 | + "import numpy as np\n", |
31 | 32 | "import pandas as pd\n", |
32 | 33 | "from data_algebra.data_ops import descr\n", |
33 | 34 | "import data_algebra.test_util\n", |
|
153 | 154 | } |
154 | 155 | } |
155 | 156 | }, |
| 157 | + { |
| 158 | + "cell_type": "markdown", |
| 159 | + "source": [ |
| 160 | + "We can check this matches expectations." |
| 161 | + ], |
| 162 | + "metadata": { |
| 163 | + "collapsed": false, |
| 164 | + "pycharm": { |
| 165 | + "name": "#%% md\n" |
| 166 | + } |
| 167 | + } |
| 168 | + }, |
| 169 | + { |
| 170 | + "cell_type": "code", |
| 171 | + "execution_count": 5, |
| 172 | + "outputs": [], |
| 173 | + "source": [ |
| 174 | + "for group in set(d['g']):\n", |
| 175 | + " assert np.all(pandas_res.loc[pandas_res['g'] == group, 'xm']\n", |
| 176 | + " == np.median(d.loc[d['g'] == group, 'x']))" |
| 177 | + ], |
| 178 | + "metadata": { |
| 179 | + "collapsed": false, |
| 180 | + "pycharm": { |
| 181 | + "name": "#%%\n" |
| 182 | + } |
| 183 | + } |
| 184 | + }, |
156 | 185 | { |
157 | 186 | "cell_type": "markdown", |
158 | 187 | "source": [ |
|
171 | 200 | }, |
172 | 201 | { |
173 | 202 | "cell_type": "code", |
174 | | - "execution_count": 5, |
| 203 | + "execution_count": 6, |
175 | 204 | "outputs": [ |
176 | 205 | { |
177 | 206 | "data": { |
178 | 207 | "text/plain": "(TableDescription(table_name=\"d\", column_names=[\"id\", \"x\", \"g\"]))" |
179 | 208 | }, |
180 | | - "execution_count": 5, |
| 209 | + "execution_count": 6, |
181 | 210 | "metadata": {}, |
182 | 211 | "output_type": "execute_result" |
183 | 212 | } |
|
209 | 238 | }, |
210 | 239 | { |
211 | 240 | "cell_type": "code", |
212 | | - "execution_count": 6, |
| 241 | + "execution_count": 7, |
213 | 242 | "outputs": [], |
214 | 243 | "source": [ |
215 | 244 | "bigquery_sql = bigquery_handle.to_sql(ops)\n" |
|
235 | 264 | }, |
236 | 265 | { |
237 | 266 | "cell_type": "code", |
238 | | - "execution_count": 7, |
| 267 | + "execution_count": 8, |
239 | 268 | "outputs": [ |
240 | 269 | { |
241 | 270 | "data": { |
242 | | - "text/plain": " g id x xm\n0 a 0 4.0 3.0\n1 b 1 50.0 26.1\n2 a 2 1.0 3.0\n3 a 3 3.0 3.0\n4 b 4 2.2 26.1", |
243 | | - "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>g</th>\n <th>id</th>\n <th>x</th>\n <th>xm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>a</td>\n <td>0</td>\n <td>4.0</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>b</td>\n <td>1</td>\n <td>50.0</td>\n <td>26.1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>a</td>\n <td>2</td>\n <td>1.0</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>a</td>\n <td>3</td>\n <td>3.0</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>b</td>\n <td>4</td>\n <td>2.2</td>\n <td>26.1</td>\n </tr>\n </tbody>\n</table>\n</div>" |
| 271 | + "text/plain": " id x g xm\n0 0 4.0 a 3.0\n1 1 50.0 b 26.1\n2 2 1.0 a 3.0\n3 3 3.0 a 3.0\n4 4 2.2 b 26.1", |
| 272 | + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>x</th>\n <th>g</th>\n <th>xm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>4.0</td>\n <td>a</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>50.0</td>\n <td>b</td>\n <td>26.1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2</td>\n <td>1.0</td>\n <td>a</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3</td>\n <td>3.0</td>\n <td>a</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>2.2</td>\n <td>b</td>\n <td>26.1</td>\n </tr>\n </tbody>\n</table>\n</div>" |
244 | 273 | }, |
245 | | - "execution_count": 7, |
| 274 | + "execution_count": 8, |
246 | 275 | "metadata": {}, |
247 | 276 | "output_type": "execute_result" |
248 | 277 | } |
|
262 | 291 | }, |
263 | 292 | { |
264 | 293 | "cell_type": "code", |
265 | | - "execution_count": 8, |
| 294 | + "execution_count": 9, |
266 | 295 | "outputs": [], |
267 | 296 | "source": [ |
268 | 297 | "assert data_algebra.test_util.equivalent_frames(pandas_res, db_res)" |
|
290 | 319 | }, |
291 | 320 | { |
292 | 321 | "cell_type": "code", |
293 | | - "execution_count": 9, |
| 322 | + "execution_count": 10, |
294 | 323 | "outputs": [], |
295 | 324 | "source": [ |
296 | 325 | "ops_p = (\n", |
|
311 | 340 | { |
312 | 341 | "cell_type": "markdown", |
313 | 342 | "source": [ |
314 | | - "This pipeline works as follows." |
| 343 | + "This pipeline works in Pandas as follows." |
315 | 344 | ], |
316 | 345 | "metadata": { |
317 | 346 | "collapsed": false, |
|
322 | 351 | }, |
323 | 352 | { |
324 | 353 | "cell_type": "code", |
325 | | - "execution_count": 10, |
| 354 | + "execution_count": 11, |
326 | 355 | "outputs": [ |
327 | 356 | { |
328 | 357 | "data": { |
329 | 358 | "text/plain": " g xm\n0 a 3.0\n1 b 26.1", |
330 | 359 | "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>g</th>\n <th>xm</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>a</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>b</td>\n <td>26.1</td>\n </tr>\n </tbody>\n</table>\n</div>" |
331 | 360 | }, |
332 | | - "execution_count": 10, |
| 361 | + "execution_count": 11, |
333 | 362 | "metadata": {}, |
334 | 363 | "output_type": "execute_result" |
335 | 364 | } |
|
349 | 378 | { |
350 | 379 | "cell_type": "markdown", |
351 | 380 | "source": [ |
352 | | - "But we get a warning if we attempt to convert this to BigQuery SQL." |
| 381 | + "And we again see expected results." |
353 | 382 | ], |
354 | 383 | "metadata": { |
355 | 384 | "collapsed": false, |
|
360 | 389 | }, |
361 | 390 | { |
362 | 391 | "cell_type": "code", |
363 | | - "execution_count": 11, |
| 392 | + "execution_count": 12, |
| 393 | + "outputs": [], |
| 394 | + "source": [ |
| 395 | + "for group in set(d['g']):\n", |
| 396 | + " assert (pandas_res_p.loc[pandas_res_p['g'] == group, 'xm'].values[0]\n", |
| 397 | + " == np.median(d.loc[d['g'] == group, 'x']))\n" |
| 398 | + ], |
| 399 | + "metadata": { |
| 400 | + "collapsed": false, |
| 401 | + "pycharm": { |
| 402 | + "name": "#%%\n" |
| 403 | + } |
| 404 | + } |
| 405 | + }, |
| 406 | + { |
| 407 | + "cell_type": "markdown", |
| 408 | + "source": [ |
| 409 | + "However, we get a warning if we attempt to convert this pipeline to BigQuery SQL." |
| 410 | + ], |
| 411 | + "metadata": { |
| 412 | + "collapsed": false, |
| 413 | + "pycharm": { |
| 414 | + "name": "#%% md\n" |
| 415 | + } |
| 416 | + } |
| 417 | + }, |
| 418 | + { |
| 419 | + "cell_type": "code", |
| 420 | + "execution_count": 13, |
364 | 421 | "outputs": [ |
365 | 422 | { |
366 | 423 | "name": "stderr", |
367 | 424 | "output_type": "stream", |
368 | 425 | "text": [ |
369 | | - "/Users/johnmount/Documents/work/data_algebra/data_algebra/db_model.py:1692: UserWarning: BigQueryModel translation doesn't fully support method context: [MethodUse(op_name='median', is_project=True, is_windowed=False, is_ordered=False)]\n", |
| 426 | + "/Users/johnmount/Documents/work/data_algebra/data_algebra/db_model.py:1694: UserWarning: BigQueryModel translation doesn't fully support method context: [MethodUse(op_name='median', is_project=True, is_windowed=False, is_ordered=False)]\n", |
370 | 427 | " warnings.warn(f\"{self} translation doesn't fully support method context: {non_rec}\", UserWarning)\n" |
371 | 428 | ] |
372 | 429 | } |
|
396 | 453 | }, |
397 | 454 | { |
398 | 455 | "cell_type": "code", |
399 | | - "execution_count": 12, |
| 456 | + "execution_count": 14, |
400 | 457 | "outputs": [ |
401 | 458 | { |
402 | 459 | "name": "stdout", |
403 | 460 | "output_type": "stream", |
404 | 461 | "text": [ |
405 | 462 | "caught: 400 percentile_cont aggregate function is not supported.\n", |
406 | 463 | "\n", |
407 | | - "(job ID: 0f2cd57d-9e20-4366-a95c-109997b8c75f)\n", |
| 464 | + "(job ID: f5d44be0-65d1-4860-b985-ca427b38a83b)\n", |
408 | 465 | "\n", |
409 | 466 | " -----Query Job SQL Follows----- \n", |
410 | 467 | "\n", |
|
416 | 473 | " 5:WITH\n", |
417 | 474 | " 6: `table_reference_0` AS (\n", |
418 | 475 | " 7: SELECT\n", |
419 | | - " 8: `g` ,\n", |
420 | | - " 9: `x`\n", |
| 476 | + " 8: `x` ,\n", |
| 477 | + " 9: `g`\n", |
421 | 478 | " 10: FROM\n", |
422 | 479 | " 11: `data-algebra-test.test_1.d`\n", |
423 | 480 | " 12: )\n", |
|
463 | 520 | }, |
464 | 521 | { |
465 | 522 | "cell_type": "code", |
466 | | - "execution_count": 13, |
| 523 | + "execution_count": 15, |
467 | 524 | "outputs": [ |
468 | 525 | { |
469 | 526 | "name": "stdout", |
|
476 | 533 | "WITH\n", |
477 | 534 | " `table_reference_0` AS (\n", |
478 | 535 | " SELECT\n", |
479 | | - " `g` ,\n", |
480 | | - " `x`\n", |
| 536 | + " `x` ,\n", |
| 537 | + " `g`\n", |
481 | 538 | " FROM\n", |
482 | 539 | " `data-algebra-test.test_1.d`\n", |
483 | 540 | " )\n", |
|
516 | 573 | }, |
517 | 574 | { |
518 | 575 | "cell_type": "code", |
519 | | - "execution_count": 14, |
| 576 | + "execution_count": 16, |
520 | 577 | "outputs": [ |
521 | 578 | { |
522 | 579 | "name": "stdout", |
|
529 | 586 | "WITH\n", |
530 | 587 | " `extend_0` AS (\n", |
531 | 588 | " SELECT -- .extend({ 'xm': 'x.median()'}, partition_by=['g'])\n", |
532 | | - " `g` ,\n", |
533 | 589 | " `id` ,\n", |
534 | 590 | " `x` ,\n", |
| 591 | + " `g` ,\n", |
535 | 592 | " PERCENTILE_CONT(`x`, 0.5) OVER ( PARTITION BY `g` ) AS `xm`\n", |
536 | 593 | " FROM\n", |
537 | 594 | " `data-algebra-test.test_1.d`\n", |
|
559 | 616 | { |
560 | 617 | "cell_type": "markdown", |
561 | 618 | "source": [ |
562 | | - "The above failure can come as a surprise. But the new feature of the data algebra is: the \"translate to SQL\" step warned we had a potential problem. This doesn't even require a full database handle, it is data incorporated into the database model during package assembly.\n", |
| 619 | + "Given how similar the two SQL queries are, the above failure can come as a surprise. However, a new feature of the data algebra is that the \"translate to SQL\" step warns us of a potential problem. This check doesn't even require a full database handle; it uses data incorporated into the database model during package assembly.\n", |
563 | 620 | "\n", |
564 | 621 | "## Patching The Solution\n", |
565 | 622 | "\n", |
|
574 | 631 | }, |
575 | 632 | { |
576 | 633 | "cell_type": "code", |
577 | | - "execution_count": 15, |
| 634 | + "execution_count": 17, |
578 | 635 | "outputs": [ |
579 | 636 | { |
580 | 637 | "data": { |
581 | 638 | "text/plain": " xm g\n0 3.0 a\n1 26.1 b", |
582 | 639 | "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>xm</th>\n <th>g</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>3.0</td>\n <td>a</td>\n </tr>\n <tr>\n <th>1</th>\n <td>26.1</td>\n <td>b</td>\n </tr>\n </tbody>\n</table>\n</div>" |
583 | 640 | }, |
584 | | - "execution_count": 15, |
| 641 | + "execution_count": 17, |
585 | 642 | "metadata": {}, |
586 | 643 | "output_type": "execute_result" |
587 | 644 | } |
|
608 | 665 | }, |
609 | 666 | { |
610 | 667 | "cell_type": "code", |
611 | | - "execution_count": 16, |
| 668 | + "execution_count": 18, |
612 | 669 | "outputs": [], |
613 | 670 | "source": [ |
614 | 671 | "assert data_algebra.test_util.equivalent_frames(pandas_res_p, db_res_p)" |
|
641 | 698 | }, |
642 | 699 | { |
643 | 700 | "cell_type": "code", |
644 | | - "execution_count": 17, |
| 701 | + "execution_count": 19, |
645 | 702 | "outputs": [ |
646 | 703 | { |
647 | 704 | "data": { |
648 | 705 | "text/plain": "(\n TableDescription(table_name=\"d\", column_names=[\"id\", \"x\", \"g\"])\n .extend({\"xm\": \"x.median()\"}, partition_by=[\"g\"])\n .project({\"xm\": \"xm.mean()\"}, group_by=[\"g\"])\n)" |
649 | 706 | }, |
650 | | - "execution_count": 17, |
| 707 | + "execution_count": 19, |
651 | 708 | "metadata": {}, |
652 | 709 | "output_type": "execute_result" |
653 | 710 | } |
|
676 | 733 | }, |
677 | 734 | { |
678 | 735 | "cell_type": "code", |
679 | | - "execution_count": 18, |
| 736 | + "execution_count": 20, |
680 | 737 | "outputs": [], |
681 | 738 | "source": [ |
682 | 739 | "# clean up\n", |
|
0 commit comments