|
26 | 26 | }, |
27 | 27 | { |
28 | 28 | "cell_type": "code", |
29 | | - "execution_count": 30, |
| 29 | + "execution_count": 2, |
30 | 30 | "metadata": {}, |
31 | 31 | "outputs": [ |
32 | 32 | { |
|
363 | 363 | "- ugpa: The student's undergraduate GPA, continuous variable;\n", |
364 | 364 | "- bar: Ground truth label indicating whether or not the student passed the bar, i.e. passed 1st time, passed 2nd time, failed, non-graduated\n", |
365 | 365 | "\n", |
366 | | - "The CART method will be used evaluate the distribution and correlation differences between the real and synthetic data.\n", |
| 366 | + "The CART method will be used to evaluate the distribution and correlation differences between the real and synthetic data. CART generally produces higher-quality synthetic datasets, but might not run on datasets with categorical variables with 20+ categories. Use Gaussian Copula in those cases.\n", |
367 | 367 | "\n", |
368 | 368 | "*The original paper can be found [here](https://files.eric.ed.gov/fulltext/ED469370.pdf)." |
369 | 369 | ] |
|
571 | 571 | }, |
572 | 572 | { |
573 | 573 | "cell_type": "code", |
574 | | - "execution_count": 10, |
| 574 | + "execution_count": 12, |
575 | 575 | "metadata": {}, |
576 | 576 | "outputs": [ |
577 | | - { |
578 | | - "ename": "NameError", |
579 | | - "evalue": "name 'df_encoded' is not defined", |
580 | | - "output_type": "error", |
581 | | - "traceback": [ |
582 | | - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
583 | | - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", |
584 | | - "Cell \u001b[0;32mIn[10], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m plt\u001b[38;5;241m.\u001b[39mfigure(figsize\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m7\u001b[39m, \u001b[38;5;241m6\u001b[39m))\n\u001b[1;32m 2\u001b[0m cmap \u001b[38;5;241m=\u001b[39m LinearSegmentedColormap\u001b[38;5;241m.\u001b[39mfrom_list(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcustom_blues\u001b[39m\u001b[38;5;124m'\u001b[39m, [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m#ffffff\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m#005AA7\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m----> 3\u001b[0m sns\u001b[38;5;241m.\u001b[39mheatmap(\u001b[43mdf_encoded\u001b[49m\u001b[38;5;241m.\u001b[39mcorr(), annot\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, cmap\u001b[38;5;241m=\u001b[39mcmap)\n\u001b[1;32m 4\u001b[0m plt\u001b[38;5;241m.\u001b[39mtitle(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mCorrelation matrix of attached data\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", |
585 | | - "\u001b[0;31mNameError\u001b[0m: name 'df_encoded' is not defined" |
586 | | - ] |
587 | | - }, |
588 | 577 | { |
589 | 578 | "data": { |
| 579 | + "text/html": [ |
| 580 | + "<div>\n", |
| 581 | + "<style scoped>\n", |
| 582 | + " .dataframe tbody tr th:only-of-type {\n", |
| 583 | + " vertical-align: middle;\n", |
| 584 | + " }\n", |
| 585 | + "\n", |
| 586 | + " .dataframe tbody tr th {\n", |
| 587 | + " vertical-align: top;\n", |
| 588 | + " }\n", |
| 589 | + "\n", |
| 590 | + " .dataframe thead th {\n", |
| 591 | + " text-align: right;\n", |
| 592 | + " }\n", |
| 593 | + "</style>\n", |
| 594 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 595 | + " <thead>\n", |
| 596 | + " <tr style=\"text-align: right;\">\n", |
| 597 | + " <th></th>\n", |
| 598 | + " <th>sex</th>\n", |
| 599 | + " <th>race1</th>\n", |
| 600 | + " <th>ugpa</th>\n", |
| 601 | + " <th>bar</th>\n", |
| 602 | + " </tr>\n", |
| 603 | + " </thead>\n", |
| 604 | + " <tbody>\n", |
| 605 | + " <tr>\n", |
| 606 | + " <th>0</th>\n", |
| 607 | + " <td>male</td>\n", |
| 608 | + " <td>white</td>\n", |
| 609 | + " <td>3.5</td>\n", |
| 610 | + " <td>a Passed 1st time</td>\n", |
| 611 | + " </tr>\n", |
| 612 | + " <tr>\n", |
| 613 | + " <th>1</th>\n", |
| 614 | + " <td>male</td>\n", |
| 615 | + " <td>white</td>\n", |
| 616 | + " <td>3.5</td>\n", |
| 617 | + " <td>a Passed 1st time</td>\n", |
| 618 | + " </tr>\n", |
| 619 | + " <tr>\n", |
| 620 | + " <th>2</th>\n", |
| 621 | + " <td>female</td>\n", |
| 622 | + " <td>white</td>\n", |
| 623 | + " <td>3.5</td>\n", |
| 624 | + " <td>a Passed 1st time</td>\n", |
| 625 | + " </tr>\n", |
| 626 | + " <tr>\n", |
| 627 | + " <th>3</th>\n", |
| 628 | + " <td>female</td>\n", |
| 629 | + " <td>white</td>\n", |
| 630 | + " <td>3.5</td>\n", |
| 631 | + " <td>a Passed 1st time</td>\n", |
| 632 | + " </tr>\n", |
| 633 | + " <tr>\n", |
| 634 | + " <th>4</th>\n", |
| 635 | + " <td>female</td>\n", |
| 636 | + " <td>white</td>\n", |
| 637 | + " <td>3.5</td>\n", |
| 638 | + " <td>a Passed 1st time</td>\n", |
| 639 | + " </tr>\n", |
| 640 | + " </tbody>\n", |
| 641 | + "</table>\n", |
| 642 | + "</div>" |
| 643 | + ], |
590 | 644 | "text/plain": [ |
591 | | - "<Figure size 700x600 with 0 Axes>" |
| 645 | + " sex race1 ugpa bar\n", |
| 646 | + "0 male white 3.5 a Passed 1st time\n", |
| 647 | + "1 male white 3.5 a Passed 1st time\n", |
| 648 | + "2 female white 3.5 a Passed 1st time\n", |
| 649 | + "3 female white 3.5 a Passed 1st time\n", |
| 650 | + "4 female white 3.5 a Passed 1st time" |
592 | 651 | ] |
593 | 652 | }, |
| 653 | + "execution_count": 12, |
594 | 654 | "metadata": {}, |
595 | | - "output_type": "display_data" |
| 655 | + "output_type": "execute_result" |
596 | 656 | } |
597 | 657 | ], |
598 | 658 | "source": [ |
599 | | - "plt.figure(figsize=(7, 6))\n", |
600 | | - "cmap = LinearSegmentedColormap.from_list('custom_blues', ['#ffffff', '#005AA7'])\n", |
601 | | - "sns.heatmap(df_encoded.corr(), annot=True, cmap=cmap)\n", |
602 | | - "plt.title('Correlation matrix of attached data')" |
| 659 | + "df.head()" |
603 | 660 | ] |
604 | 661 | }, |
605 | 662 | { |
|
0 commit comments