Skip to content

Commit 1add713

Browse files
committed
Minor edits
1 parent 004ea92 commit 1add713

File tree

2 files changed

+79
-22
lines changed

2 files changed

+79
-22
lines changed

notebooks/CART_LawSchoolAdmissionBar.ipynb

Lines changed: 77 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
},
2727
{
2828
"cell_type": "code",
29-
"execution_count": 30,
29+
"execution_count": 2,
3030
"metadata": {},
3131
"outputs": [
3232
{
@@ -363,7 +363,7 @@
363363
"- ugpa: The student's undergraduate GPA, continuous variable;\n",
364364
"- bar: Ground truth label indicating whether or not the student passed the bar, i.e. passed 1st time, passed 2nd time, failed, non-graduated\n",
365365
"\n",
366-
"The CART method will be used evaluate the distribution and correlation differences between the real and synthetic data.\n",
366+
"The CART method will be used to evaluate the distribution and correlation differences between the real and synthetic data. CART generally produces higher quality synthetic datasets, but might not run on datasets with categorical variables with 20+ categories. Use Gaussian Copula in those cases.\n",
367367
"\n",
368368
"*The original paper can be found [here](https://files.eric.ed.gov/fulltext/ED469370.pdf)."
369369
]
@@ -571,35 +571,92 @@
571571
},
572572
{
573573
"cell_type": "code",
574-
"execution_count": 10,
574+
"execution_count": 12,
575575
"metadata": {},
576576
"outputs": [
577-
{
578-
"ename": "NameError",
579-
"evalue": "name 'df_encoded' is not defined",
580-
"output_type": "error",
581-
"traceback": [
582-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
583-
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
584-
"Cell \u001b[0;32mIn[10], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m plt\u001b[38;5;241m.\u001b[39mfigure(figsize\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m7\u001b[39m, \u001b[38;5;241m6\u001b[39m))\n\u001b[1;32m 2\u001b[0m cmap \u001b[38;5;241m=\u001b[39m LinearSegmentedColormap\u001b[38;5;241m.\u001b[39mfrom_list(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcustom_blues\u001b[39m\u001b[38;5;124m'\u001b[39m, [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m#ffffff\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m#005AA7\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m----> 3\u001b[0m sns\u001b[38;5;241m.\u001b[39mheatmap(\u001b[43mdf_encoded\u001b[49m\u001b[38;5;241m.\u001b[39mcorr(), annot\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, cmap\u001b[38;5;241m=\u001b[39mcmap)\n\u001b[1;32m 4\u001b[0m plt\u001b[38;5;241m.\u001b[39mtitle(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mCorrelation matrix of attached data\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
585-
"\u001b[0;31mNameError\u001b[0m: name 'df_encoded' is not defined"
586-
]
587-
},
588577
{
589578
"data": {
579+
"text/html": [
580+
"<div>\n",
581+
"<style scoped>\n",
582+
" .dataframe tbody tr th:only-of-type {\n",
583+
" vertical-align: middle;\n",
584+
" }\n",
585+
"\n",
586+
" .dataframe tbody tr th {\n",
587+
" vertical-align: top;\n",
588+
" }\n",
589+
"\n",
590+
" .dataframe thead th {\n",
591+
" text-align: right;\n",
592+
" }\n",
593+
"</style>\n",
594+
"<table border=\"1\" class=\"dataframe\">\n",
595+
" <thead>\n",
596+
" <tr style=\"text-align: right;\">\n",
597+
" <th></th>\n",
598+
" <th>sex</th>\n",
599+
" <th>race1</th>\n",
600+
" <th>ugpa</th>\n",
601+
" <th>bar</th>\n",
602+
" </tr>\n",
603+
" </thead>\n",
604+
" <tbody>\n",
605+
" <tr>\n",
606+
" <th>0</th>\n",
607+
" <td>male</td>\n",
608+
" <td>white</td>\n",
609+
" <td>3.5</td>\n",
610+
" <td>a Passed 1st time</td>\n",
611+
" </tr>\n",
612+
" <tr>\n",
613+
" <th>1</th>\n",
614+
" <td>male</td>\n",
615+
" <td>white</td>\n",
616+
" <td>3.5</td>\n",
617+
" <td>a Passed 1st time</td>\n",
618+
" </tr>\n",
619+
" <tr>\n",
620+
" <th>2</th>\n",
621+
" <td>female</td>\n",
622+
" <td>white</td>\n",
623+
" <td>3.5</td>\n",
624+
" <td>a Passed 1st time</td>\n",
625+
" </tr>\n",
626+
" <tr>\n",
627+
" <th>3</th>\n",
628+
" <td>female</td>\n",
629+
" <td>white</td>\n",
630+
" <td>3.5</td>\n",
631+
" <td>a Passed 1st time</td>\n",
632+
" </tr>\n",
633+
" <tr>\n",
634+
" <th>4</th>\n",
635+
" <td>female</td>\n",
636+
" <td>white</td>\n",
637+
" <td>3.5</td>\n",
638+
" <td>a Passed 1st time</td>\n",
639+
" </tr>\n",
640+
" </tbody>\n",
641+
"</table>\n",
642+
"</div>"
643+
],
590644
"text/plain": [
591-
"<Figure size 700x600 with 0 Axes>"
645+
" sex race1 ugpa bar\n",
646+
"0 male white 3.5 a Passed 1st time\n",
647+
"1 male white 3.5 a Passed 1st time\n",
648+
"2 female white 3.5 a Passed 1st time\n",
649+
"3 female white 3.5 a Passed 1st time\n",
650+
"4 female white 3.5 a Passed 1st time"
592651
]
593652
},
653+
"execution_count": 12,
594654
"metadata": {},
595-
"output_type": "display_data"
655+
"output_type": "execute_result"
596656
}
597657
],
598658
"source": [
599-
"plt.figure(figsize=(7, 6))\n",
600-
"cmap = LinearSegmentedColormap.from_list('custom_blues', ['#ffffff', '#005AA7'])\n",
601-
"sns.heatmap(df_encoded.corr(), annot=True, cmap=cmap)\n",
602-
"plt.title('Correlation matrix of attached data')"
659+
"df.head()"
603660
]
604661
},
605662
{

src/locales/en.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,13 @@
5353
},
5454
"demoCard": {
5555
"title": "Try it out!",
56-
"description": "Do you not have a dataset at hand? No worries use our demo dataset."
56+
"description": "Alternatively, use our demo dataset."
5757
}
5858
},
5959
"syntheticData": {
6060
"demo": {
6161
"heading": "Information about demo dataset",
62-
"description": "A subset of the [Law School Admission Bar](https://www.kaggle.com/datasets/danofer/law-school-admissions-bar-passage)* dataset is used as a demo. Synthetic data will be generated for the following columns:\n \n&nbsp;&nbsp;\n- sex: student gender, i.e. 1 (male), 2 (female);\n- race1: race of student, i.e., asian, black, hispanic, white, other;\n- ugpa: undergraduate GPA of student (average course grades), continous variable;\n- bar: Ground truth label indicating whether or not the student passed the bar, i.e., passed 1st time, passed 2nd time, failed, non-graduated.\n \n&nbsp;&nbsp;\n\nThe CART method will be used evaluate the distribution and correlation differences between the real and synthetic data.\n \n&nbsp;&nbsp;\n\n*The original paper can be found [here](https://files.eric.ed.gov/fulltext/ED469370.pdf)\n \n&nbsp;&nbsp;\n"
62+
"description": "A subset of the [Law School Admission Bar](https://www.kaggle.com/datasets/danofer/law-school-admissions-bar-passage)* dataset is used as a demo. Synthetic data will be generated for the following columns:\n \n&nbsp;&nbsp;\n- sex: student gender, i.e. 1 (male), 2 (female);\n- race1: race of student, i.e., asian, black, hispanic, white, other;\n- ugpa: undergraduate GPA of student (average course grades), continuous variable;\n- bar: Ground truth label indicating whether or not the student passed the bar, i.e., passed 1st time, passed 2nd time, failed, non-graduated.\n \n&nbsp;&nbsp;\n\nThe CART method will be used to evaluate the distribution and correlation differences between the real and synthetic data. CART generally produces higher quality synthetic datasets, but might not run on datasets with categorical variables with 20+ categories. Use Gaussian Copula in those cases.\n \n&nbsp;&nbsp;\n\n*The original paper can be found [here](https://files.eric.ed.gov/fulltext/ED469370.pdf)\n \n&nbsp;&nbsp;\n"
6363
},
6464
"exportToPDF": "Download evaluation report as pdf",
6565
"exportToJSON": "Download synthetic data as json",

0 commit comments

Comments
 (0)