Minor edits

jfparie · jfparie · commit 1add71317dfc · 2025-01-30T15:47:47.000+01:00
diff --git a/notebooks/CART_LawSchoolAdmissionBar.ipynb b/notebooks/CART_LawSchoolAdmissionBar.ipynb
@@ -26,7 +26,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -363,7 +363,7 @@
     "- ugpa: The student's undergraduate GPA, continous variable;\n",
     "- bar: Ground truth label indicating whether or not the student passed the bar, i.e. passed 1st time, passed 2nd time, failed, non-graduated\n",
     "\n",
-    "The CART method will be used  evaluate the distribution and correlation differences between the real and synthetic data.\n",
+    "The CART method will be used to evaluate the distribution and correlation differences between the real and synthetic data. CART generally produces higher-quality synthetic datasets, but might not run on datasets with categorical variables with 20+ categories. Use Gaussian Copula in those cases.\n",
     "\n",
     "*The original paper can be found [here](https://files.eric.ed.gov/fulltext/ED469370.pdf)."
    ]
@@ -571,35 +571,92 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'df_encoded' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[10], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m plt\u001b[38;5;241m.\u001b[39mfigure(figsize\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m7\u001b[39m, \u001b[38;5;241m6\u001b[39m))\n\u001b[1;32m      2\u001b[0m cmap \u001b[38;5;241m=\u001b[39m LinearSegmentedColormap\u001b[38;5;241m.\u001b[39mfrom_list(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcustom_blues\u001b[39m\u001b[38;5;124m'\u001b[39m, [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m#ffffff\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m#005AA7\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m----> 3\u001b[0m sns\u001b[38;5;241m.\u001b[39mheatmap(\u001b[43mdf_encoded\u001b[49m\u001b[38;5;241m.\u001b[39mcorr(), annot\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, cmap\u001b[38;5;241m=\u001b[39mcmap)\n\u001b[1;32m      4\u001b[0m plt\u001b[38;5;241m.\u001b[39mtitle(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mCorrelation matrix of attached data\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'df_encoded' is not defined"
-     ]
-    },
     {
      "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sex</th>\n",
+       "      <th>race1</th>\n",
+       "      <th>ugpa</th>\n",
+       "      <th>bar</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>male</td>\n",
+       "      <td>white</td>\n",
+       "      <td>3.5</td>\n",
+       "      <td>a Passed 1st time</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>male</td>\n",
+       "      <td>white</td>\n",
+       "      <td>3.5</td>\n",
+       "      <td>a Passed 1st time</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>female</td>\n",
+       "      <td>white</td>\n",
+       "      <td>3.5</td>\n",
+       "      <td>a Passed 1st time</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>female</td>\n",
+       "      <td>white</td>\n",
+       "      <td>3.5</td>\n",
+       "      <td>a Passed 1st time</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>female</td>\n",
+       "      <td>white</td>\n",
+       "      <td>3.5</td>\n",
+       "      <td>a Passed 1st time</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "<Figure size 700x600 with 0 Axes>"
+       "      sex  race1  ugpa                bar\n",
+       "0    male  white   3.5  a Passed 1st time\n",
+       "1    male  white   3.5  a Passed 1st time\n",
+       "2  female  white   3.5  a Passed 1st time\n",
+       "3  female  white   3.5  a Passed 1st time\n",
+       "4  female  white   3.5  a Passed 1st time"
       ]
      },
+     "execution_count": 12,
      "metadata": {},
-     "output_type": "display_data"
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "plt.figure(figsize=(7, 6))\n",
-    "cmap = LinearSegmentedColormap.from_list('custom_blues', ['#ffffff', '#005AA7'])\n",
-    "sns.heatmap(df_encoded.corr(), annot=True, cmap=cmap)\n",
-    "plt.title('Correlation matrix of attached data')"
+    "df.head()"
    ]
   },
   {
diff --git a/src/locales/en.json b/src/locales/en.json
@@ -53,13 +53,13 @@
         },
         "demoCard": {
             "title": "Try it out!",
-            "description": "Do you not have a dataset at hand? No worries use our demo dataset."
+            "description": "Alternatively, use our demo dataset."
         }
     },
     "syntheticData": {
         "demo": {
             "heading": "Information about demo dataset",
-            "description": "A subset of the [Law School Admission Bar](https://www.kaggle.com/datasets/danofer/law-school-admissions-bar-passage)* dataset is used as a demo. Synthetic data will be generated for the following columns:\n  \n&nbsp;&nbsp;\n- sex: student gender, i.e. 1 (male), 2 (female);\n- race1: race of student, i.e., asian, black, hispanic, white, other;\n- ugpa: undergraduate GPA of student (average course grades), continous variable;\n- bar: Ground truth label indicating whether or not the student passed the bar, i.e., passed 1st time, passed 2nd time, failed, non-graduated.\n  \n&nbsp;&nbsp;\n\nThe CART method will be used  evaluate the distribution and correlation differences between the real and synthetic data.\n  \n&nbsp;&nbsp;\n\n*The original paper can be found [here](https://files.eric.ed.gov/fulltext/ED469370.pdf)\n \n&nbsp;&nbsp;\n"
+            "description": "A subset of the [Law School Admission Bar](https://www.kaggle.com/datasets/danofer/law-school-admissions-bar-passage)* dataset is used as a demo. Synthetic data will be generated for the following columns:\n  \n&nbsp;&nbsp;\n- sex: student gender, i.e. 1 (male), 2 (female);\n- race1: race of student, i.e., asian, black, hispanic, white, other;\n- ugpa: undergraduate GPA of student (average course grades), continuous variable;\n- bar: Ground truth label indicating whether or not the student passed the bar, i.e., passed 1st time, passed 2nd time, failed, non-graduated.\n  \n&nbsp;&nbsp;\n\nThe CART method will be used to evaluate the distribution and correlation differences between the real and synthetic data. CART generally produces higher-quality synthetic datasets, but might not run on datasets with categorical variables with 20+ categories. Use Gaussian Copula in those cases.\n  \n&nbsp;&nbsp;\n\n*The original paper can be found [here](https://files.eric.ed.gov/fulltext/ED469370.pdf)\n \n&nbsp;&nbsp;\n"
         },
         "exportToPDF": "Download evaluation report as pdf",
         "exportToJSON": "Download synthetic data as json",